From feadff736f017f65400b96240c1048c0ad695a92 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 10:23:08 -0700 Subject: [PATCH 01/25] feat: add push_to_hub integration for HuggingFace datasets Implement HuggingFace Hub integration to upload DataDesigner datasets: - Add HuggingFaceHubClient with upload_dataset method - Upload main parquet files to data/ subset - Upload processor outputs to data/{processor_name}/ subsets - Generate dataset card from metadata.json with column statistics - Include sdg.json and metadata.json configuration files - Comprehensive validation and error handling - Add push_to_hub() method to DatasetCreationResults --- packages/data-designer/pyproject.toml | 1 + .../integrations/huggingface/__init__.py | 7 + .../integrations/huggingface/client.py | 262 +++++++++ .../integrations/huggingface/dataset_card.py | 114 ++++ .../huggingface/dataset_card_template.md | 82 +++ .../src/data_designer/interface/results.py | 42 ++ .../integrations/huggingface/__init__.py | 2 + .../integrations/huggingface/test_client.py | 523 ++++++++++++++++++ .../huggingface/test_dataset_card.py | 125 +++++ uv.lock | 2 + 10 files changed, 1160 insertions(+) create mode 100644 packages/data-designer/src/data_designer/integrations/huggingface/__init__.py create mode 100644 packages/data-designer/src/data_designer/integrations/huggingface/client.py create mode 100644 packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py create mode 100644 packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md create mode 100644 packages/data-designer/tests/integrations/huggingface/__init__.py create mode 100644 packages/data-designer/tests/integrations/huggingface/test_client.py create mode 100644 packages/data-designer/tests/integrations/huggingface/test_dataset_card.py diff --git a/packages/data-designer/pyproject.toml b/packages/data-designer/pyproject.toml index 31076704..883e18ea 100644 --- a/packages/data-designer/pyproject.toml +++ b/packages/data-designer/pyproject.toml @@ -22,6 +22,7 @@ classifiers = [ dependencies = [ "data-designer-config", "data-designer-engine", + "huggingface-hub>=1.0.1,<2", "prompt-toolkit>=3.0.0,<4", "typer>=0.12.0,<1", ] diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py new file mode 100644 index 00000000..99b9d93e --- /dev/null +++ b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceUploadError +from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard + +__all__ = ["HuggingFaceHubClient", "HuggingFaceUploadError", "DataDesignerDatasetCard"] diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py new file mode 100644 index 00000000..7121bff4 --- /dev/null +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -0,0 +1,262 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
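As a quick orientation before the client implementation, here is a minimal sketch of how the newly exported `HuggingFaceHubClient` is meant to be called; the repo id and local artifact path are illustrative placeholders, not values taken from this patch.

```python
from pathlib import Path

from data_designer.integrations.huggingface import HuggingFaceHubClient

# Illustrative only: point the client at a previously generated artifact directory.
client = HuggingFaceHubClient()  # token=None -> resolved from HF_TOKEN or cached login
url = client.upload_dataset(
    repo_id="my-org/my-synthetic-dataset",          # hypothetical repo id
    base_dataset_path=Path("./artifacts/dataset"),  # hypothetical local artifact path
    private=True,
)
print(url)  # e.g. https://huggingface.co/datasets/my-org/my-synthetic-dataset
```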
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import re +from pathlib import Path + +from huggingface_hub import HfApi +from huggingface_hub.utils import HfHubHTTPError + +from data_designer.errors import DataDesignerError +from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard + + +class HuggingFaceUploadError(DataDesignerError): + """Error during HuggingFace dataset upload.""" + + +class HuggingFaceHubClient: + """Client for interacting with HuggingFace Hub to upload datasets.""" + + def __init__(self, token: str | None = None): + """Initialize HuggingFace Hub client. + + Args: + token: HuggingFace API token. If None, the token is automatically + resolved from HF_TOKEN environment variable or cached credentials + from `huggingface-cli login`. + """ + self.token = token + self._api = HfApi(token=token) + + def upload_dataset( + self, + repo_id: str, + base_dataset_path: Path, + *, + private: bool = False, + create_pr: bool = False, + ) -> str: + """Upload dataset to HuggingFace Hub. + + Uploads the complete dataset including: + - Main parquet batch files from parquet-files/ → data/ + - Processor output batch files from processors-files/{name}/ → data/{name}/ + - Existing sdg.json and metadata.json files + - Auto-generated README.md (dataset card) + + Args: + repo_id: HuggingFace repo ID (e.g., "username/dataset-name") + base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) + private: Whether to create private repo + create_pr: Whether to create a PR instead of direct push + + Returns: + URL to the uploaded dataset + + Raises: + HuggingFaceUploadError: If validation fails or upload encounters errors + """ + self._validate_repo_id(repo_id) + self._validate_dataset_path(base_dataset_path) + + try: + self._api.create_repo( + repo_id=repo_id, + repo_type="dataset", + exist_ok=True, + private=private, + ) + except HfHubHTTPError as e: + if e.response.status_code == 401: + raise HuggingFaceUploadError( + "Authentication failed. Please provide a valid HuggingFace token. " + "You can set it via the token parameter or HF_TOKEN environment variable, " + "or run 'huggingface-cli login'." + ) from e + elif e.response.status_code == 403: + raise HuggingFaceUploadError( + f"Permission denied. You don't have access to create repository '{repo_id}'. " + "Check your token permissions or repository ownership." 
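A short sketch of the authentication and error-handling behavior described in the docstring above; the repo id, local path, and token handling shown here are assumptions for illustration only.

```python
import os
from pathlib import Path

from data_designer.integrations.huggingface import HuggingFaceHubClient, HuggingFaceUploadError

dataset_dir = Path("./artifacts/dataset")  # hypothetical local artifact directory

# Either pass a token explicitly or let huggingface_hub resolve it from the
# HF_TOKEN environment variable / credentials cached by `huggingface-cli login`.
client = HuggingFaceHubClient(token=os.environ.get("HF_TOKEN"))

try:
    client.upload_dataset(
        repo_id="my-org/private-dataset",  # hypothetical repo id
        base_dataset_path=dataset_dir,
        private=True,
    )
except HuggingFaceUploadError as err:
    # 401/403 responses from repo creation surface here with actionable messages,
    # as do failures from the individual upload steps, each wrapped with context.
    print(err)
```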
+ ) from e + else: + raise HuggingFaceUploadError(f"Failed to create repository '{repo_id}': {e}") from e + except Exception as e: + raise HuggingFaceUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e + + try: + self._upload_dataset_card(repo_id, base_dataset_path, create_pr=create_pr) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e + + parquet_folder = base_dataset_path / "parquet-files" + try: + self._api.upload_folder( + repo_id=repo_id, + folder_path=str(parquet_folder), + path_in_repo="data", + repo_type="dataset", + commit_message="Upload main dataset parquet files", + create_pr=create_pr, + ) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to upload parquet files: {e}") from e + + processors_folder = base_dataset_path / "processors-files" + if processors_folder.exists(): + for processor_dir in processors_folder.iterdir(): + if processor_dir.is_dir(): + try: + self._api.upload_folder( + repo_id=repo_id, + folder_path=str(processor_dir), + path_in_repo=f"data/{processor_dir.name}", + repo_type="dataset", + commit_message=f"Upload processor outputs: {processor_dir.name}", + create_pr=create_pr, + ) + except Exception as e: + raise HuggingFaceUploadError( + f"Failed to upload processor outputs for '{processor_dir.name}': {e}" + ) from e + + for config_file in ["sdg.json", "metadata.json"]: + config_path = base_dataset_path / config_file + if config_path.exists(): + try: + self._api.upload_file( + repo_id=repo_id, + path_or_fileobj=str(config_path), + path_in_repo=config_file, + repo_type="dataset", + commit_message=f"Upload {config_file}", + create_pr=create_pr, + ) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to upload {config_file}: {e}") from e + + return f"https://huggingface.co/datasets/{repo_id}" + + def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + """Generate and upload dataset card from metadata.json. + + Args: + repo_id: HuggingFace repo ID + base_dataset_path: Path to dataset artifacts + create_pr: Whether to create a PR instead of direct push + + Raises: + HuggingFaceUploadError: If dataset card generation or upload fails + """ + metadata_path = base_dataset_path / "metadata.json" + try: + with open(metadata_path) as f: + metadata = json.load(f) + except json.JSONDecodeError as e: + raise HuggingFaceUploadError(f"Failed to parse metadata.json: {e}") from e + except Exception as e: + raise HuggingFaceUploadError(f"Failed to read metadata.json: {e}") from e + + sdg_path = base_dataset_path / "sdg.json" + sdg_config = None + if sdg_path.exists(): + try: + with open(sdg_path) as f: + sdg_config = json.load(f) + except json.JSONDecodeError as e: + raise HuggingFaceUploadError(f"Failed to parse sdg.json: {e}") from e + except Exception as e: + raise HuggingFaceUploadError(f"Failed to read sdg.json: {e}") from e + + try: + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=sdg_config, + repo_id=repo_id, + ) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to generate dataset card: {e}") from e + + try: + card.push_to_hub(repo_id, repo_type="dataset", create_pr=create_pr) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to push dataset card to hub: {e}") from e + + @staticmethod + def _validate_repo_id(repo_id: str) -> None: + """Validate HuggingFace repository ID format. 
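The card helper above can also be exercised on its own; a minimal sketch of building and pushing a card straight from the artifact files, with an illustrative directory and repo id.

```python
import json
from pathlib import Path

from data_designer.integrations.huggingface import DataDesignerDatasetCard

base = Path("./artifacts/dataset")  # hypothetical artifact directory
metadata = json.loads((base / "metadata.json").read_text())
sdg_path = base / "sdg.json"
sdg_config = json.loads(sdg_path.read_text()) if sdg_path.exists() else None

card = DataDesignerDatasetCard.from_metadata(
    metadata=metadata,
    sdg_config=sdg_config,
    repo_id="my-org/my-synthetic-dataset",  # hypothetical repo id
)
card.push_to_hub("my-org/my-synthetic-dataset", repo_type="dataset")
```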
+ + Args: + repo_id: Repository ID to validate + + Raises: + HuggingFaceUploadError: If repo_id format is invalid + """ + if not repo_id or not isinstance(repo_id, str): + raise HuggingFaceUploadError("repo_id must be a non-empty string") + + pattern = r"^[a-zA-Z0-9][-a-zA-Z0-9._]*/[a-zA-Z0-9][-a-zA-Z0-9._]*$" + + if not re.match(pattern, repo_id): + raise HuggingFaceUploadError( + f"Invalid repo_id format: '{repo_id}'. " + "Expected format: 'username/dataset-name' or 'organization/dataset-name'. " + "Names can contain alphanumeric characters, dashes, underscores, and dots." + ) + + @staticmethod + def _validate_dataset_path(base_dataset_path: Path) -> None: + """Validate dataset directory structure. + + Args: + base_dataset_path: Path to dataset directory + + Raises: + HuggingFaceUploadError: If directory structure is invalid + """ + if not base_dataset_path.exists(): + raise HuggingFaceUploadError(f"Dataset path does not exist: {base_dataset_path}") + + if not base_dataset_path.is_dir(): + raise HuggingFaceUploadError(f"Dataset path is not a directory: {base_dataset_path}") + + metadata_path = base_dataset_path / "metadata.json" + if not metadata_path.exists(): + raise HuggingFaceUploadError(f"Required file not found: {metadata_path}") + + if not metadata_path.is_file(): + raise HuggingFaceUploadError(f"metadata.json is not a file: {metadata_path}") + + parquet_dir = base_dataset_path / "parquet-files" + if not parquet_dir.exists(): + raise HuggingFaceUploadError( + f"Required directory not found: {parquet_dir}. " + "Dataset must contain parquet-files directory with batch files." + ) + + if not parquet_dir.is_dir(): + raise HuggingFaceUploadError(f"parquet-files is not a directory: {parquet_dir}") + + if not any(parquet_dir.glob("*.parquet")): + raise HuggingFaceUploadError( + f"parquet-files directory is empty: {parquet_dir}. At least one .parquet file is required." + ) + + try: + with open(metadata_path) as f: + json.load(f) + except json.JSONDecodeError as e: + raise HuggingFaceUploadError(f"Invalid JSON in metadata.json: {e}") + + sdg_path = base_dataset_path / "sdg.json" + if sdg_path.exists(): + if not sdg_path.is_file(): + raise HuggingFaceUploadError(f"sdg.json is not a file: {sdg_path}") + try: + with open(sdg_path) as f: + json.load(f) + except json.JSONDecodeError as e: + raise HuggingFaceUploadError(f"Invalid JSON in sdg.json: {e}") diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py new file mode 100644 index 00000000..792e0a47 --- /dev/null +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from datetime import datetime +from pathlib import Path + +from huggingface_hub import CardData, DatasetCard + +TEMPLATE_DATA_DESIGNER_DATASET_CARD_PATH = Path(__file__).parent / "dataset_card_template.md" + + +class DataDesignerDatasetCard(DatasetCard): + """Dataset card for NeMo Data Designer datasets. + + This class extends Hugging Face's DatasetCard with a custom template + specifically designed for Data Designer generated datasets. + The template is located at `data_designer/integrations/huggingface/dataset_card_template.md`. 
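For reference, a small sketch exercising the `_validate_repo_id` helper defined above on a few example ids; the ids themselves are taken from the unit tests further down in this patch.

```python
from data_designer.integrations.huggingface import HuggingFaceHubClient, HuggingFaceUploadError

# The first three ids pass validation, the last two do not (missing slash, whitespace).
for repo_id in ["username/dataset", "org/my-dataset", "user/dataset.v2", "my-dataset", "user/my dataset"]:
    try:
        HuggingFaceHubClient._validate_repo_id(repo_id)
        print(f"valid:   {repo_id}")
    except HuggingFaceUploadError as err:
        print(f"invalid: {repo_id} ({err})")
```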
+ """ + + default_template_path = TEMPLATE_DATA_DESIGNER_DATASET_CARD_PATH + + @classmethod + def from_metadata( + cls, + metadata: dict, + sdg_config: dict | None, + repo_id: str, + ) -> DataDesignerDatasetCard: + """Create dataset card from metadata.json and sdg.json. + + Args: + metadata: Contents of metadata.json + sdg_config: Contents of sdg.json (optional) + repo_id: HuggingFace repo ID + + Returns: + DataDesignerDatasetCard instance ready to upload + """ + # Extract info from metadata + target_num_records = metadata.get("target_num_records", 0) + schema = metadata.get("schema", {}) + column_stats = metadata.get("column_statistics", []) + + # Get actual num_records from column_statistics if available + if column_stats: + actual_num_records = column_stats[0].get("num_records", target_num_records) + else: + actual_num_records = target_num_records + + # Compute size category + size_categories = cls._compute_size_category(actual_num_records) + + # Extract column types from sdg.json if available + config_types: dict[str, int] = {} + num_columns_configured = 0 + if sdg_config: + columns = sdg_config.get("data_designer", {}).get("columns", []) + num_columns_configured = len(columns) + for col in columns: + col_type = col.get("column_type", "unknown") + if isinstance(col_type, dict): + col_type = col_type.get("value", "unknown") + config_types[col_type] = config_types.get(col_type, 0) + 1 + + # Prepare CardData (metadata for YAML frontmatter) + card_data = CardData( + library="datadesigner", + size_categories=size_categories, + tags=["synthetic", "nemo-data-designer"], + ) + + # Prepare template variables + template_vars = { + "repo_id": repo_id, + "num_records": actual_num_records, + "target_num_records": target_num_records, + "num_columns": len(schema), + "size_categories": size_categories, + "all_columns": schema, + "column_statistics": column_stats, + "num_columns_configured": num_columns_configured, + "config_types": config_types, + "percent_complete": 100 * actual_num_records / (target_num_records + 1e-10), + "current_year": datetime.now().year, + } + + # Create card from template + card = cls.from_template(card_data, template_path=str(cls.default_template_path), **template_vars) + return card + + @staticmethod + def _compute_size_category(num_records: int) -> str: + """Compute HuggingFace size category from record count. + + Args: + num_records: Number of records in the dataset + + Returns: + Size category string for HuggingFace Hub tags + """ + if num_records < 1000: + return "n<1K" + elif num_records < 10000: + return "1K10M" diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md new file mode 100644 index 00000000..651741eb --- /dev/null +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md @@ -0,0 +1,82 @@ +--- +library: datadesigner +size_categories: {{ size_categories }} +tags: + - synthetic + - nemo-data-designer +--- + +# {{ repo_id.split('/')[-1] | title }} + +This dataset was generated using **[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner)**, a framework for creating high-quality synthetic datasets. 
+ +## Dataset Summary + +- **Records**: {{ "{:,}".format(num_records) }} +- **Columns**: {{ num_columns }} +{% if target_num_records != num_records %} +- **Completion**: {{ "%.1f" | format(percent_complete) }}% ({{ "{:,}".format(target_num_records) }} requested) +{% endif %} + +## Quick Start + +```python +from datasets import load_dataset + +# Load the dataset +dataset = load_dataset("{{ repo_id }}") +df = dataset["train"].to_pandas() +``` + +## Schema & Statistics + +{% if column_statistics %} +{% for stat in column_statistics %} +### {{ stat.column_name }} + +- **Type**: `{{ stat.simple_dtype }}` +- **Column Type**: {{ stat.column_type }} +- **Unique Values**: {{ stat.num_unique }} ({{ "%.1f" | format((stat.num_unique / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) +{% if stat.num_null > 0 %} +- **Null Values**: {{ stat.num_null }} ({{ "%.1f" | format((stat.num_null / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) +{% endif %} +{% if stat.column_type in ["llm-text", "llm-code", "llm-structured", "llm-judge"] %} +- **Avg Output Tokens**: {{ "%.1f" | format(stat.output_tokens_mean) if stat.output_tokens_mean is defined else "N/A" }} +- **Avg Input Tokens**: {{ "%.1f" | format(stat.input_tokens_mean) if stat.input_tokens_mean is defined else "N/A" }} +{% endif %} +{% if stat.column_type == "sampler" and stat.sampler_type is defined %} +- **Sampler Type**: {% if stat.sampler_type is mapping %}{{ stat.sampler_type.value }}{% else %}{{ stat.sampler_type }}{% endif %} +{% endif %} + +{% endfor %} +{% else %} +| Column | Type | +|--------|------| +{% for col_name, dtype in all_columns.items() | sort -%} +| `{{ col_name }}` | {{ dtype }} | +{% endfor %} +{% endif %} + +## Generation Details + +{% if config_types %} +Generated with {{ num_columns_configured }} column configuration(s): + +{% for col_type, count in config_types.items() | sort %} +- **{{ col_type }}**: {{ count }} column(s) +{% endfor %} +{% endif %} + +Full configuration available in `sdg.json` and detailed metadata in `metadata.json`. 
+ +## Citation + +```bibtex +@misc{nemo-data-designer, + author = {The NeMo Data Designer Team, NVIDIA}, + title = {NeMo Data Designer: A framework for generating synthetic data from scratch or based on your own seed data}, + howpublished = {\url{https://github.com/NVIDIA-NeMo/DataDesigner}}, + year = {{ current_year }}, + note = {GitHub Repository}, +} +``` diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index b9467c58..b39c265e 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -12,6 +12,7 @@ from data_designer.config.utils.visualization import WithRecordSamplerMixin from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from data_designer.engine.dataset_builders.errors import ArtifactStorageError +from data_designer.integrations.huggingface.client import HuggingFaceHubClient from data_designer.lazy_heavy_imports import pd if TYPE_CHECKING: @@ -96,3 +97,44 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path: if not self.artifact_storage.processors_outputs_path.exists(): raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.") return self.artifact_storage.processors_outputs_path / processor_name + + def push_to_hub( + self, + repo_id: str, + *, + token: str | None = None, + private: bool = False, + create_pr: bool = False, + ) -> str: + """Push dataset to HuggingFace Hub. + + Uploads all artifacts including: + - Main parquet batch files (data subset) + - Processor output batch files (data/{processor_name} subsets) + - Configuration (sdg.json) + - Metadata (metadata.json) + - Auto-generated dataset card (README.md) + + Args: + repo_id: HuggingFace repo ID (e.g., "username/my-dataset") + token: HuggingFace API token. If None, the token is automatically + resolved from HF_TOKEN environment variable or cached credentials + from `huggingface-cli login`. + private: Create private repo + create_pr: Create PR instead of direct push + + Returns: + URL to the uploaded dataset + + Example: + >>> results = data_designer.create(config, num_records=1000) + >>> results.push_to_hub("username/my-synthetic-dataset") + 'https://huggingface.co/datasets/username/my-synthetic-dataset' + """ + client = HuggingFaceHubClient(token=token) + return client.upload_dataset( + repo_id=repo_id, + base_dataset_path=self.artifact_storage.base_dataset_path, + private=private, + create_pr=create_pr, + ) diff --git a/packages/data-designer/tests/integrations/huggingface/__init__.py b/packages/data-designer/tests/integrations/huggingface/__init__.py new file mode 100644 index 00000000..1a8431c3 --- /dev/null +++ b/packages/data-designer/tests/integrations/huggingface/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py new file mode 100644 index 00000000..3e5bcdb1 --- /dev/null +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -0,0 +1,523 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
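Downstream of `push_to_hub`, the uploaded artifacts can be pulled back with standard consumer-side tooling; a sketch assuming the illustrative repo id from the docstring example above (the `datasets` library is a consumer-side dependency, not one added by this patch).

```python
from datasets import load_dataset
from huggingface_hub import hf_hub_download

repo_id = "username/my-synthetic-dataset"  # illustrative, matching the docstring example

# Main table: parquet batch files were uploaded under data/.
ds = load_dataset(repo_id, split="train")
df = ds.to_pandas()

# The generation config and metadata were uploaded alongside the data.
sdg_path = hf_hub_download(repo_id, "sdg.json", repo_type="dataset")
meta_path = hf_hub_download(repo_id, "metadata.json", repo_type="dataset")
```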
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceUploadError + + +@pytest.fixture +def mock_hf_api() -> MagicMock: + """Mock HfApi for testing.""" + with patch("data_designer.integrations.huggingface.client.HfApi") as mock: + api_instance = MagicMock() + mock.return_value = api_instance + yield api_instance + + +@pytest.fixture +def sample_dataset_path(tmp_path: Path) -> Path: + """Create a sample dataset directory structure. + + Structure mirrors actual DataDesigner output: + - parquet-files/: Main dataset batch files + - processors-files/{processor_name}/: Processor output batch files (same structure) + - metadata.json: Dataset metadata + - sdg.json: Configuration + """ + base_path = tmp_path / "dataset" + base_path.mkdir() + + # Create parquet-files directory with batch files + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("dummy parquet data") + (parquet_dir / "batch_00001.parquet").write_text("dummy parquet data") + + # Create processors-files directory with same structure as main parquet-files + processors_dir = base_path / "processors-files" + processors_dir.mkdir() + processor1_dir = processors_dir / "processor1" + processor1_dir.mkdir() + (processor1_dir / "batch_00000.parquet").write_text("dummy processor output") + (processor1_dir / "batch_00001.parquet").write_text("dummy processor output") + + processor2_dir = processors_dir / "processor2" + processor2_dir.mkdir() + (processor2_dir / "batch_00000.parquet").write_text("dummy processor output") + + # Create metadata.json with matching column statistics + metadata = { + "target_num_records": 100, + "total_num_batches": 2, + "buffer_size": 50, + "schema": {"col1": "string"}, + "file_paths": { + "parquet-files": ["parquet-files/batch_00000.parquet", "parquet-files/batch_00001.parquet"], + "processor-files": { + "processor1": ["processors-files/processor1/batch_00000.parquet"], + "processor2": ["processors-files/processor2/batch_00000.parquet"], + }, + }, + "num_completed_batches": 2, + "dataset_name": "dataset", + "column_statistics": [ + { + "column_name": "col1", + "num_records": 100, + "num_unique": 100, + "num_null": 0, + "simple_dtype": "string", + "pyarrow_dtype": "string", + "column_type": "sampler", + "sampler_type": "uuid", + } + ], + } + (base_path / "metadata.json").write_text(json.dumps(metadata)) + + # Create sdg.json with realistic BuilderConfig structure + sdg_config = { + "data_designer": { + "columns": [ + { + "name": "col1", + "column_type": "sampler", + "sampler_type": "uuid", + "params": {}, + } + ], + "model_configs": [], + "constraints": None, + "seed_config": None, + "profilers": None, + } + } + (base_path / "sdg.json").write_text(json.dumps(sdg_config)) + + return base_path + + +def test_client_initialization() -> None: + """Test HuggingFaceHubClient initialization.""" + with patch("data_designer.integrations.huggingface.client.HfApi"): + client = HuggingFaceHubClient(token="test-token") + assert client.token == "test-token" + + +def test_client_initialization_no_token() -> None: + """Test HuggingFaceHubClient initialization without token.""" + with patch("data_designer.integrations.huggingface.client.HfApi"): + client = HuggingFaceHubClient() + assert client.token is None + + +def 
test_upload_dataset_creates_repo(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test that upload_dataset creates a repository.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + mock_hf_api.create_repo.assert_called_once_with( + repo_id="test/dataset", + repo_type="dataset", + exist_ok=True, + private=False, + ) + + +def test_upload_dataset_uploads_parquet_files(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test that upload_dataset uploads parquet files.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + # Check that upload_folder was called for parquet files + calls = [call for call in mock_hf_api.upload_folder.call_args_list if "parquet-files" in str(call)] + assert len(calls) == 1 + assert calls[0].kwargs["path_in_repo"] == "data" + + +def test_upload_dataset_uploads_processor_outputs(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test that upload_dataset uploads processor outputs.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + # Check that upload_folder was called for processor outputs + calls = [call for call in mock_hf_api.upload_folder.call_args_list if "processor1" in str(call)] + assert len(calls) == 1 + assert calls[0].kwargs["path_in_repo"] == "data/processor1" + + +def test_upload_dataset_uploads_config_files(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test that upload_dataset uploads sdg.json and metadata.json.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + # Check that upload_file was called for config files + upload_file_calls = mock_hf_api.upload_file.call_args_list + assert len(upload_file_calls) == 2 + + uploaded_files = [call.kwargs["path_in_repo"] for call in upload_file_calls] + assert "sdg.json" in uploaded_files + assert "metadata.json" in uploaded_files + + +def test_upload_dataset_returns_url(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test that upload_dataset returns the correct URL.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + url = client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + assert url == "https://huggingface.co/datasets/test/dataset" + + +def test_upload_dataset_with_private_repo(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset with private repository.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + private=True, + ) + + mock_hf_api.create_repo.assert_called_once_with( + repo_id="test/dataset", + repo_type="dataset", + exist_ok=True, + private=True, + ) + + +def test_upload_dataset_with_create_pr(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset with create_pr option.""" + client = 
HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + create_pr=True, + ) + + # Verify create_pr is passed to upload operations + for call in mock_hf_api.upload_folder.call_args_list: + assert call.kwargs["create_pr"] is True + + +def test_upload_dataset_card_missing_metadata(tmp_path: Path) -> None: + """Test _upload_dataset_card raises error when metadata.json is missing.""" + client = HuggingFaceHubClient(token="test-token") + + # Create directory without metadata.json + base_path = tmp_path / "dataset" + base_path.mkdir() + + with pytest.raises(HuggingFaceUploadError, match="Failed to read metadata.json"): + client._upload_dataset_card("test/dataset", base_path) + + +def test_upload_dataset_card_calls_push_to_hub(sample_dataset_path: Path) -> None: + """Test _upload_dataset_card generates card and pushes to hub.""" + client = HuggingFaceHubClient(token="test-token") + + with patch("data_designer.integrations.huggingface.client.DataDesignerDatasetCard") as mock_card_class: + mock_card = MagicMock() + mock_card_class.from_metadata.return_value = mock_card + + client._upload_dataset_card("test/dataset", sample_dataset_path) + + # Verify card was created from metadata + mock_card_class.from_metadata.assert_called_once() + call_kwargs = mock_card_class.from_metadata.call_args.kwargs + assert call_kwargs["repo_id"] == "test/dataset" + assert "metadata" in call_kwargs + assert "sdg_config" in call_kwargs + + # Verify card was pushed to hub + mock_card.push_to_hub.assert_called_once_with( + "test/dataset", + repo_type="dataset", + create_pr=False, + ) + + +def test_upload_dataset_without_processors(mock_hf_api: MagicMock, tmp_path: Path) -> None: + """Test upload_dataset when no processor outputs exist.""" + # Create dataset path without processors directory + base_path = tmp_path / "dataset" + base_path.mkdir() + + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("dummy data") + + metadata = {"target_num_records": 10, "schema": {"col1": "string"}, "column_statistics": []} + (base_path / "metadata.json").write_text(json.dumps(metadata)) + + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + ) + + # Should only upload parquet files, not processors + folder_calls = mock_hf_api.upload_folder.call_args_list + assert len(folder_calls) == 1 # Only main parquet files + assert folder_calls[0].kwargs["path_in_repo"] == "data" + + +def test_upload_dataset_without_sdg_config(mock_hf_api: MagicMock, tmp_path: Path) -> None: + """Test upload_dataset when sdg.json doesn't exist.""" + base_path = tmp_path / "dataset" + base_path.mkdir() + + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("dummy data") + + metadata = {"target_num_records": 10, "schema": {"col1": "string"}, "column_statistics": []} + (base_path / "metadata.json").write_text(json.dumps(metadata)) + + # No sdg.json file + + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + ) + + # Should only upload metadata.json, not sdg.json + file_calls = mock_hf_api.upload_file.call_args_list + assert len(file_calls) == 1 
+ assert file_calls[0].kwargs["path_in_repo"] == "metadata.json" + + +def test_upload_dataset_multiple_processors(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test that multiple processor outputs are uploaded correctly.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + # Check that both processors were uploaded + folder_calls = mock_hf_api.upload_folder.call_args_list + processor_calls = [call for call in folder_calls if "processor" in call.kwargs["path_in_repo"]] + + assert len(processor_calls) == 2 + processor_paths = [call.kwargs["path_in_repo"] for call in processor_calls] + assert "data/processor1" in processor_paths + assert "data/processor2" in processor_paths + + +# Error handling and validation tests + + +def test_validate_repo_id_invalid_format() -> None: + """Test repo_id validation with invalid formats.""" + client = HuggingFaceHubClient(token="test-token") + + # Missing slash + with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + client._validate_repo_id("my-dataset") + + # Too many slashes (caught by regex) + with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + client._validate_repo_id("user/org/dataset") + + # Invalid characters (space) + with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + client._validate_repo_id("user/my dataset") + + # Empty string + with pytest.raises(HuggingFaceUploadError, match="must be a non-empty string"): + client._validate_repo_id("") + + +def test_validate_repo_id_valid_formats() -> None: + """Test repo_id validation with valid formats.""" + client = HuggingFaceHubClient(token="test-token") + + # Valid formats should not raise + client._validate_repo_id("username/dataset") + client._validate_repo_id("org/my-dataset") + client._validate_repo_id("user/dataset_name") + client._validate_repo_id("user123/dataset-123") + client._validate_repo_id("user/dataset.v2") + + +def test_validate_dataset_path_not_exists(tmp_path: Path) -> None: + """Test validation fails when dataset path doesn't exist.""" + client = HuggingFaceHubClient(token="test-token") + non_existent = tmp_path / "does-not-exist" + + with pytest.raises(HuggingFaceUploadError, match="does not exist"): + client._validate_dataset_path(non_existent) + + +def test_validate_dataset_path_is_file(tmp_path: Path) -> None: + """Test validation fails when dataset path is a file.""" + client = HuggingFaceHubClient(token="test-token") + file_path = tmp_path / "file.txt" + file_path.write_text("not a directory") + + with pytest.raises(HuggingFaceUploadError, match="not a directory"): + client._validate_dataset_path(file_path) + + +def test_validate_dataset_path_missing_metadata(tmp_path: Path) -> None: + """Test validation fails when metadata.json is missing.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + + with pytest.raises(HuggingFaceUploadError, match="Required file not found.*metadata.json"): + client._validate_dataset_path(base_path) + + +def test_validate_dataset_path_missing_parquet_folder(tmp_path: Path) -> None: + """Test validation fails when parquet-files directory is missing.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text('{"target_num_records": 10}') + + with 
pytest.raises(HuggingFaceUploadError, match="Required directory not found.*parquet-files"): + client._validate_dataset_path(base_path) + + +def test_validate_dataset_path_empty_parquet_folder(tmp_path: Path) -> None: + """Test validation fails when parquet-files directory is empty.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text('{"target_num_records": 10}') + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + + with pytest.raises(HuggingFaceUploadError, match="parquet-files directory is empty"): + client._validate_dataset_path(base_path) + + +def test_validate_dataset_path_invalid_metadata_json(tmp_path: Path) -> None: + """Test validation fails when metadata.json contains invalid JSON.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text("invalid json {{{") + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("data") + + with pytest.raises(HuggingFaceUploadError, match="Invalid JSON in metadata.json"): + client._validate_dataset_path(base_path) + + +def test_validate_dataset_path_invalid_sdg_json(tmp_path: Path) -> None: + """Test validation fails when sdg.json contains invalid JSON.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text('{"target_num_records": 10}') + (base_path / "sdg.json").write_text("invalid json {{{") + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("data") + + with pytest.raises(HuggingFaceUploadError, match="Invalid JSON in sdg.json"): + client._validate_dataset_path(base_path) + + +def test_upload_dataset_invalid_repo_id(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset fails with invalid repo_id.""" + client = HuggingFaceHubClient(token="test-token") + + with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + client.upload_dataset( + repo_id="invalid-repo-id", # Missing slash + base_dataset_path=sample_dataset_path, + ) + + +def test_upload_dataset_authentication_error(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset handles authentication errors.""" + from huggingface_hub.utils import HfHubHTTPError + + client = HuggingFaceHubClient(token="invalid-token") + + # Mock 401 authentication error + error_response = MagicMock() + error_response.status_code = 401 + mock_hf_api.create_repo.side_effect = HfHubHTTPError("Unauthorized", response=error_response) + + with pytest.raises(HuggingFaceUploadError, match="Authentication failed"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + +def test_upload_dataset_permission_error(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset handles permission errors.""" + from huggingface_hub.utils import HfHubHTTPError + + client = HuggingFaceHubClient(token="test-token") + + # Mock 403 permission error + error_response = MagicMock() + error_response.status_code = 403 + mock_hf_api.create_repo.side_effect = HfHubHTTPError("Forbidden", response=error_response) + + with pytest.raises(HuggingFaceUploadError, match="Permission denied"): + client.upload_dataset( + repo_id="test/dataset", + 
base_dataset_path=sample_dataset_path, + ) + + +def test_upload_dataset_card_invalid_json(tmp_path: Path) -> None: + """Test _upload_dataset_card handles corrupted metadata.json.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text("invalid json") + + with pytest.raises(HuggingFaceUploadError, match="Failed to parse metadata.json"): + client._upload_dataset_card("test/dataset", base_path) diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py new file mode 100644 index 00000000..e4b3b8eb --- /dev/null +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -0,0 +1,125 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard + + +def test_compute_size_category() -> None: + """Test size category computation for various dataset sizes.""" + assert DataDesignerDatasetCard._compute_size_category(500) == "n<1K" + assert DataDesignerDatasetCard._compute_size_category(5000) == "1K10M" + + +def test_from_metadata_minimal() -> None: + """Test creating dataset card from minimal metadata.""" + metadata = { + "target_num_records": 100, + "schema": {"col1": "string", "col2": "int64"}, + "column_statistics": [ + { + "column_name": "col1", + "num_records": 100, + "num_unique": 100, + "num_null": 0, + "simple_dtype": "string", + "column_type": "sampler", + } + ], + } + + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=None, + repo_id="test/dataset", + ) + + # Verify card was created + assert card is not None + assert "test/dataset" in str(card) + assert "100" in str(card) + assert "col1" in str(card) + assert "2" in str(card) # Number of columns + + +def test_from_metadata_with_sdg_config() -> None: + """Test creating dataset card with sdg config.""" + metadata = { + "target_num_records": 50, + "schema": {"name": "string", "age": "int64"}, + "column_statistics": [ + { + "column_name": "name", + "num_records": 50, + "num_unique": 50, + "num_null": 0, + "simple_dtype": "string", + "column_type": "sampler", + "sampler_type": "person", + }, + { + "column_name": "age", + "num_records": 50, + "num_unique": 30, + "num_null": 0, + "simple_dtype": "int64", + "column_type": "sampler", + "sampler_type": "uniform", + }, + ], + } + + sdg_config = { + "data_designer": { + "columns": [ + {"name": "name", "column_type": "sampler"}, + {"name": "age", "column_type": "sampler"}, + ] + } + } + + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=sdg_config, + repo_id="test/dataset-with-config", + ) + + # Verify card includes config info + assert card is not None + assert "sampler" in str(card) + assert "2 column" in str(card) + + +def test_from_metadata_with_llm_columns() -> None: + """Test creating dataset card with LLM column statistics.""" + metadata = { + "target_num_records": 10, + "schema": {"prompt": "string", "response": "string"}, + "column_statistics": [ + { + "column_name": "response", + "num_records": 10, + "num_unique": 10, + "num_null": 0, + "simple_dtype": "string", + "column_type": "llm-text", + "output_tokens_mean": 50.5, + "input_tokens_mean": 20.3, + } + ], + } + + card = 
DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=None, + repo_id="test/llm-dataset", + ) + + # Verify LLM statistics are included + assert card is not None + assert "50.5" in str(card) or "Avg Output Tokens" in str(card) diff --git a/uv.lock b/uv.lock index 279f21de..46716729 100644 --- a/uv.lock +++ b/uv.lock @@ -690,6 +690,7 @@ source = { editable = "packages/data-designer" } dependencies = [ { name = "data-designer-config" }, { name = "data-designer-engine" }, + { name = "huggingface-hub" }, { name = "prompt-toolkit" }, { name = "typer" }, ] @@ -698,6 +699,7 @@ dependencies = [ requires-dist = [ { name = "data-designer-config", editable = "packages/data-designer-config" }, { name = "data-designer-engine", editable = "packages/data-designer-engine" }, + { name = "huggingface-hub", specifier = ">=1.0.1,<2" }, { name = "prompt-toolkit", specifier = ">=3.0.0,<4" }, { name = "typer", specifier = ">=0.12.0,<1" }, ] From 3ff3aba25b53d2c42a3317452a4e9a1a9aa48044 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 11:44:56 -0700 Subject: [PATCH 02/25] feat: improve push_to_hub with logging, path mapping, and config definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add progress logging with emojis following codebase style - Add repository exists check before creation - Update metadata.json paths for HuggingFace structure (parquet-files/ → data/, processors-files/{name}/ → {name}/) - Enhance dataset card with detailed intro, tabular schema/statistics, and clickable config links - Add explicit configs in YAML frontmatter to fix schema mismatch between main dataset and processor outputs - Set data config as default configuration --- .../integrations/huggingface/client.py | 135 ++++++++++++++---- .../integrations/huggingface/dataset_card.py | 26 +++- .../huggingface/dataset_card_template.md | 63 ++++---- .../src/data_designer/interface/results.py | 2 +- .../integrations/huggingface/test_client.py | 63 ++++++-- .../huggingface/test_dataset_card.py | 41 +++++- 6 files changed, 258 insertions(+), 72 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 7121bff4..8b182b25 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -4,7 +4,9 @@ from __future__ import annotations import json +import logging import re +import tempfile from pathlib import Path from huggingface_hub import HfApi @@ -12,6 +14,9 @@ from data_designer.errors import DataDesignerError from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard +from data_designer.logging import RandomEmoji + +logger = logging.getLogger(__name__) class HuggingFaceUploadError(DataDesignerError): @@ -44,7 +49,7 @@ def upload_dataset( Uploads the complete dataset including: - Main parquet batch files from parquet-files/ → data/ - - Processor output batch files from processors-files/{name}/ → data/{name}/ + - Processor output batch files from processors-files/{name}/ → {name}/ - Existing sdg.json and metadata.json files - Auto-generated README.md (dataset card) @@ -60,10 +65,19 @@ def upload_dataset( Raises: HuggingFaceUploadError: If validation fails or upload encounters errors """ + logger.info(f"🤗 Uploading dataset to HuggingFace Hub: {repo_id}") + self._validate_repo_id(repo_id) 
self._validate_dataset_path(base_dataset_path) + logger.info(f"|-- {RandomEmoji.working()} Checking if repository exists...") try: + repo_exists = self._api.repo_exists(repo_id=repo_id, repo_type="dataset") + if repo_exists: + logger.info(f"|-- {RandomEmoji.success()} Repository already exists, updating content...") + else: + logger.info(f"|-- {RandomEmoji.working()} Creating new repository...") + self._api.create_repo( repo_id=repo_id, repo_type="dataset", @@ -87,11 +101,13 @@ def upload_dataset( except Exception as e: raise HuggingFaceUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e + logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") try: self._upload_dataset_card(repo_id, base_dataset_path, create_pr=create_pr) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e + logger.info(f"|-- {RandomEmoji.loading()} Uploading main dataset files...") parquet_folder = base_dataset_path / "parquet-files" try: self._api.upload_folder( @@ -99,7 +115,7 @@ def upload_dataset( folder_path=str(parquet_folder), path_in_repo="data", repo_type="dataset", - commit_message="Upload main dataset parquet files", + commit_message="Upload main dataset files", create_pr=create_pr, ) except Exception as e: @@ -107,38 +123,67 @@ def upload_dataset( processors_folder = base_dataset_path / "processors-files" if processors_folder.exists(): - for processor_dir in processors_folder.iterdir(): - if processor_dir.is_dir(): - try: - self._api.upload_folder( - repo_id=repo_id, - folder_path=str(processor_dir), - path_in_repo=f"data/{processor_dir.name}", - repo_type="dataset", - commit_message=f"Upload processor outputs: {processor_dir.name}", - create_pr=create_pr, - ) - except Exception as e: - raise HuggingFaceUploadError( - f"Failed to upload processor outputs for '{processor_dir.name}': {e}" - ) from e - - for config_file in ["sdg.json", "metadata.json"]: - config_path = base_dataset_path / config_file - if config_path.exists(): + processor_dirs = [d for d in processors_folder.iterdir() if d.is_dir()] + if processor_dirs: + logger.info( + f"|-- {RandomEmoji.loading()} Uploading processor outputs ({len(processor_dirs)} processors)..." 
+ ) + for processor_dir in processor_dirs: try: - self._api.upload_file( + self._api.upload_folder( repo_id=repo_id, - path_or_fileobj=str(config_path), - path_in_repo=config_file, + folder_path=str(processor_dir), + path_in_repo=processor_dir.name, repo_type="dataset", - commit_message=f"Upload {config_file}", + commit_message=f"Upload {processor_dir.name} processor outputs", create_pr=create_pr, ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload {config_file}: {e}") from e + raise HuggingFaceUploadError( + f"Failed to upload processor outputs for '{processor_dir.name}': {e}" + ) from e + + logger.info(f"|-- {RandomEmoji.loading()} Uploading configuration files...") + + sdg_path = base_dataset_path / "sdg.json" + if sdg_path.exists(): + try: + self._api.upload_file( + repo_id=repo_id, + path_or_fileobj=str(sdg_path), + path_in_repo="sdg.json", + repo_type="dataset", + commit_message="Upload sdg.json", + create_pr=create_pr, + ) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to upload sdg.json: {e}") from e - return f"https://huggingface.co/datasets/{repo_id}" + metadata_path = base_dataset_path / "metadata.json" + if metadata_path.exists(): + try: + updated_metadata = self._update_metadata_paths(metadata_path) + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp_file: + json.dump(updated_metadata, tmp_file, indent=2) + tmp_path = tmp_file.name + + try: + self._api.upload_file( + repo_id=repo_id, + path_or_fileobj=tmp_path, + path_in_repo="metadata.json", + repo_type="dataset", + commit_message="Upload metadata.json", + create_pr=create_pr, + ) + finally: + Path(tmp_path).unlink() + except Exception as e: + raise HuggingFaceUploadError(f"Failed to upload metadata.json: {e}") from e + + url = f"https://huggingface.co/datasets/{repo_id}" + logger.info(f"|-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") + return url def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: """Generate and upload dataset card from metadata.json. @@ -207,6 +252,42 @@ def _validate_repo_id(repo_id: str) -> None: "Names can contain alphanumeric characters, dashes, underscores, and dots." ) + @staticmethod + def _update_metadata_paths(metadata_path: Path) -> dict: + """Update file paths in metadata.json to match HuggingFace Hub structure. + + Local paths: + - parquet-files/batch_00000.parquet → data/batch_00000.parquet + - processors-files/processor1/batch_00000.parquet → processor1/batch_00000.parquet + + Args: + metadata_path: Path to metadata.json file + + Returns: + Updated metadata dictionary with corrected paths + """ + with open(metadata_path) as f: + metadata = json.load(f) + + if "file_paths" in metadata: + updated_file_paths = {} + + if "parquet-files" in metadata["file_paths"]: + updated_file_paths["data"] = [ + path.replace("parquet-files/", "data/") for path in metadata["file_paths"]["parquet-files"] + ] + + if "processor-files" in metadata["file_paths"]: + updated_file_paths["processor-files"] = {} + for processor_name, paths in metadata["file_paths"]["processor-files"].items(): + updated_file_paths["processor-files"][processor_name] = [ + path.replace(f"processors-files/{processor_name}/", f"{processor_name}/") for path in paths + ] + + metadata["file_paths"] = updated_file_paths + + return metadata + @staticmethod def _validate_dataset_path(base_dataset_path: Path) -> None: """Validate dataset directory structure. 
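A small sketch that exercises `_update_metadata_paths` on a fixture-shaped metadata file, written to a throwaway location purely for illustration; the expected output mirrors the unit test added below.

```python
import json
from pathlib import Path

from data_designer.integrations.huggingface.client import HuggingFaceHubClient

metadata_path = Path("metadata.json")  # throwaway file for illustration
metadata_path.write_text(json.dumps({
    "file_paths": {
        "parquet-files": ["parquet-files/batch_00000.parquet"],
        "processor-files": {
            "processor1": ["processors-files/processor1/batch_00000.parquet"],
        },
    }
}))

updated = HuggingFaceHubClient._update_metadata_paths(metadata_path)
print(updated["file_paths"]["data"])
# ['data/batch_00000.parquet']
print(updated["file_paths"]["processor-files"]["processor1"])
# ['processor1/batch_00000.parquet']
```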
diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 792e0a47..7ec3b2de 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -64,11 +64,30 @@ def from_metadata( col_type = col_type.get("value", "unknown") config_types[col_type] = config_types.get(col_type, 0) + 1 + # Extract processor names from file_paths + processor_names = [] + if "file_paths" in metadata and "processor-files" in metadata["file_paths"]: + processor_names = list(metadata["file_paths"]["processor-files"].keys()) + + # Determine modalities based on column types + modalities = set() + has_text = False + for stat in column_stats: + col_type = stat.get("column_type", "") + if col_type in ["llm-text", "llm-code", "llm-structured", "llm-judge"]: + has_text = True + + if has_text: + modalities.add("text") + modalities.add("tabular") + + # Prepare tags + tags = ["synthetic", "datadesigner"] + list(modalities) + # Prepare CardData (metadata for YAML frontmatter) card_data = CardData( - library="datadesigner", size_categories=size_categories, - tags=["synthetic", "nemo-data-designer"], + tags=tags, ) # Prepare template variables @@ -84,6 +103,9 @@ def from_metadata( "config_types": config_types, "percent_complete": 100 * actual_num_records / (target_num_records + 1e-10), "current_year": datetime.now().year, + "has_processors": len(processor_names) > 0, + "processor_names": processor_names, + "tags": tags, } # Create card from template diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md index 651741eb..46de1474 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md @@ -1,14 +1,33 @@ --- -library: datadesigner size_categories: {{ size_categories }} tags: - - synthetic - - nemo-data-designer +{% for tag in tags %} + - {{ tag }} +{% endfor %} +configs: +- config_name: data + data_files: "data/*.parquet" + default: true +{% if has_processors %}{% for processor_name in processor_names %}- config_name: {{ processor_name }} + data_files: "{{ processor_name }}/*.parquet" +{% endfor %}{% endif %} --- # {{ repo_id.split('/')[-1] | title }} -This dataset was generated using **[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner)**, a framework for creating high-quality synthetic datasets. +This dataset was generated using **[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner)**, a comprehensive framework for creating high-quality synthetic datasets from scratch or using seed data. + +## About NeMo Data Designer + +NeMo Data Designer is a general framework for generating high-quality synthetic data that goes beyond simple LLM prompting. 
It provides: + +- **Diverse data generation** using statistical samplers, LLMs, or existing seed datasets +- **Relationship control** between fields with dependency-aware generation +- **Quality validation** with built-in Python, SQL, and custom local and remote validators +- **LLM-as-a-judge** scoring for quality assessment +- **Fast iteration** with preview mode before full-scale generation + +For more information, visit: [https://github.com/NVIDIA-NeMo/DataDesigner](https://github.com/NVIDIA-NeMo/DataDesigner) (`pip install data-designer`) ## Dataset Summary @@ -23,32 +42,24 @@ This dataset was generated using **[NeMo Data Designer](https://github.com/NVIDI ```python from datasets import load_dataset -# Load the dataset -dataset = load_dataset("{{ repo_id }}") -df = dataset["train"].to_pandas() +# Load the main dataset +dataset = load_dataset("{{ repo_id }}", "data", split="train") +df = dataset.to_pandas() +{% if has_processors %} +# Load processor outputs (if available){% for processor_name in processor_names %} +processor_{{ processor_name }} = load_dataset("{{ repo_id }}", "{{ processor_name }}", split="train") +df_{{ processor_name }} = processor_{{ processor_name }}.to_pandas() +{% endfor %}{% endif %} ``` ## Schema & Statistics {% if column_statistics %} -{% for stat in column_statistics %} -### {{ stat.column_name }} - -- **Type**: `{{ stat.simple_dtype }}` -- **Column Type**: {{ stat.column_type }} -- **Unique Values**: {{ stat.num_unique }} ({{ "%.1f" | format((stat.num_unique / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) -{% if stat.num_null > 0 %} -- **Null Values**: {{ stat.num_null }} ({{ "%.1f" | format((stat.num_null / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) -{% endif %} -{% if stat.column_type in ["llm-text", "llm-code", "llm-structured", "llm-judge"] %} -- **Avg Output Tokens**: {{ "%.1f" | format(stat.output_tokens_mean) if stat.output_tokens_mean is defined else "N/A" }} -- **Avg Input Tokens**: {{ "%.1f" | format(stat.input_tokens_mean) if stat.input_tokens_mean is defined else "N/A" }} -{% endif %} -{% if stat.column_type == "sampler" and stat.sampler_type is defined %} -- **Sampler Type**: {% if stat.sampler_type is mapping %}{{ stat.sampler_type.value }}{% else %}{{ stat.sampler_type }}{% endif %} -{% endif %} - -{% endfor %} +| Column | Type | Column Type | Unique (%) | Null (%) | Details | +|--------|------|-------------|------------|----------|---------| +{% for stat in column_statistics -%} +| `{{ stat.column_name }}` | `{{ stat.simple_dtype }}` | {{ stat.column_type }} | {{ stat.num_unique }} ({{ "%.1f" | format((stat.num_unique / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) | {{ stat.num_null if stat.num_null > 0 else 0 }} ({{ "%.1f" | format((stat.num_null / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) | {% if stat.column_type in ["llm-text", "llm-code", "llm-structured", "llm-judge"] %}Tokens: {{ "%.0f" | format(stat.output_tokens_mean) if stat.output_tokens_mean is defined else "N/A" }} out / {{ "%.0f" | format(stat.input_tokens_mean) if stat.input_tokens_mean is defined else "N/A" }} in{% elif stat.column_type == "sampler" and stat.sampler_type is defined %}{% if stat.sampler_type is mapping %}{{ stat.sampler_type.value }}{% else %}{{ stat.sampler_type }}{% endif %}{% else %}-{% endif %} | +{% endfor -%} {% else %} | Column | Type | |--------|------| @@ -67,7 +78,7 @@ Generated with {{ num_columns_configured }} column configuration(s): {% endfor %} {% endif %} -Full configuration 
available in `sdg.json` and detailed metadata in `metadata.json`. +Full configuration available in [`sdg.json`](sdg.json) and detailed metadata in [`metadata.json`](metadata.json). ## Citation diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index b39c265e..4281e6f0 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -110,7 +110,7 @@ def push_to_hub( Uploads all artifacts including: - Main parquet batch files (data subset) - - Processor output batch files (data/{processor_name} subsets) + - Processor output batch files ({processor_name} subsets) - Configuration (sdg.json) - Metadata (metadata.json) - Auto-generated dataset card (README.md) diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 3e5bcdb1..efc64ffb 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -147,9 +147,8 @@ def test_upload_dataset_uploads_parquet_files(mock_hf_api: MagicMock, sample_dat ) # Check that upload_folder was called for parquet files - calls = [call for call in mock_hf_api.upload_folder.call_args_list if "parquet-files" in str(call)] - assert len(calls) == 1 - assert calls[0].kwargs["path_in_repo"] == "data" + calls = [call for call in mock_hf_api.upload_folder.call_args_list if call.kwargs["path_in_repo"] == "data"] + assert len(calls) >= 1 def test_upload_dataset_uploads_processor_outputs(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: @@ -163,9 +162,8 @@ def test_upload_dataset_uploads_processor_outputs(mock_hf_api: MagicMock, sample ) # Check that upload_folder was called for processor outputs - calls = [call for call in mock_hf_api.upload_folder.call_args_list if "processor1" in str(call)] - assert len(calls) == 1 - assert calls[0].kwargs["path_in_repo"] == "data/processor1" + calls = [call for call in mock_hf_api.upload_folder.call_args_list if "processor1" in call.kwargs["path_in_repo"]] + assert len(calls) >= 1 def test_upload_dataset_uploads_config_files(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: @@ -180,8 +178,6 @@ def test_upload_dataset_uploads_config_files(mock_hf_api: MagicMock, sample_data # Check that upload_file was called for config files upload_file_calls = mock_hf_api.upload_file.call_args_list - assert len(upload_file_calls) == 2 - uploaded_files = [call.kwargs["path_in_repo"] for call in upload_file_calls] assert "sdg.json" in uploaded_files assert "metadata.json" in uploaded_files @@ -295,8 +291,11 @@ def test_upload_dataset_without_processors(mock_hf_api: MagicMock, tmp_path: Pat # Should only upload parquet files, not processors folder_calls = mock_hf_api.upload_folder.call_args_list - assert len(folder_calls) == 1 # Only main parquet files - assert folder_calls[0].kwargs["path_in_repo"] == "data" + data_calls = [call for call in folder_calls if call.kwargs["path_in_repo"] == "data"] + processor_calls = [call for call in folder_calls if "processor" in call.kwargs["path_in_repo"]] + + assert len(data_calls) == 1 # Main parquet files uploaded + assert len(processor_calls) == 0 # No processor files def test_upload_dataset_without_sdg_config(mock_hf_api: MagicMock, tmp_path: Path) -> None: @@ -323,8 +322,11 @@ def test_upload_dataset_without_sdg_config(mock_hf_api: 
MagicMock, tmp_path: Pat # Should only upload metadata.json, not sdg.json file_calls = mock_hf_api.upload_file.call_args_list - assert len(file_calls) == 1 - assert file_calls[0].kwargs["path_in_repo"] == "metadata.json" + uploaded_files = [call.kwargs["path_in_repo"] for call in file_calls] + + assert len(uploaded_files) == 1 # Only metadata.json + assert "metadata.json" in uploaded_files + assert "sdg.json" not in uploaded_files def test_upload_dataset_multiple_processors(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: @@ -341,10 +343,10 @@ def test_upload_dataset_multiple_processors(mock_hf_api: MagicMock, sample_datas folder_calls = mock_hf_api.upload_folder.call_args_list processor_calls = [call for call in folder_calls if "processor" in call.kwargs["path_in_repo"]] - assert len(processor_calls) == 2 + assert len(processor_calls) >= 2 processor_paths = [call.kwargs["path_in_repo"] for call in processor_calls] - assert "data/processor1" in processor_paths - assert "data/processor2" in processor_paths + assert any("processor1" in path for path in processor_paths) + assert any("processor2" in path for path in processor_paths) # Error handling and validation tests @@ -521,3 +523,34 @@ def test_upload_dataset_card_invalid_json(tmp_path: Path) -> None: with pytest.raises(HuggingFaceUploadError, match="Failed to parse metadata.json"): client._upload_dataset_card("test/dataset", base_path) + + +def test_update_metadata_paths(tmp_path: Path) -> None: + """Test that _update_metadata_paths correctly updates file paths for HuggingFace Hub.""" + metadata = { + "target_num_records": 100, + "file_paths": { + "parquet-files": [ + "parquet-files/batch_00000.parquet", + "parquet-files/batch_00001.parquet", + ], + "processor-files": { + "processor1": ["processors-files/processor1/batch_00000.parquet"], + "processor2": ["processors-files/processor2/batch_00000.parquet"], + }, + }, + } + + metadata_path = tmp_path / "metadata.json" + with open(metadata_path, "w") as f: + json.dump(metadata, f) + + updated = HuggingFaceHubClient._update_metadata_paths(metadata_path) + + assert updated["file_paths"]["data"] == [ + "data/batch_00000.parquet", + "data/batch_00001.parquet", + ] + assert updated["file_paths"]["processor-files"]["processor1"] == ["processor1/batch_00000.parquet"] + assert updated["file_paths"]["processor-files"]["processor2"] == ["processor2/batch_00000.parquet"] + assert "parquet-files" not in updated["file_paths"] diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py index e4b3b8eb..b2821c6c 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -122,4 +122,43 @@ def test_from_metadata_with_llm_columns() -> None: # Verify LLM statistics are included assert card is not None - assert "50.5" in str(card) or "Avg Output Tokens" in str(card) + assert "Tokens:" in str(card) and "out" in str(card) and "in" in str(card) + + +def test_from_metadata_with_processors() -> None: + """Test creating dataset card with processor outputs includes loading examples.""" + metadata = { + "target_num_records": 100, + "schema": {"col1": "string"}, + "file_paths": { + "parquet-files": ["parquet-files/batch_00000.parquet"], + "processor-files": { + "processor1": ["processors-files/processor1/batch_00000.parquet"], + "processor2": ["processors-files/processor2/batch_00000.parquet"], + 
}, + }, + "column_statistics": [ + { + "column_name": "col1", + "num_records": 100, + "num_unique": 100, + "num_null": 0, + "simple_dtype": "string", + "column_type": "sampler", + } + ], + } + + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=None, + repo_id="test/dataset-with-processors", + ) + + card_str = str(card) + assert card is not None + assert "processor1" in card_str + assert "processor2" in card_str + assert '"processor1"' in card_str + assert '"processor2"' in card_str + assert "Load processor outputs" in card_str From 0cd2dd13a365ac499dc5620c3cb9d6c136178cc5 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 11:55:34 -0700 Subject: [PATCH 03/25] feat: add optional description parameter to push_to_hub - Add description parameter to push_to_hub() for custom dataset card content - Description appears after NeMo Data Designer intro section - Update dataset card template to conditionally render custom description - Add tests for with/without custom description scenarios --- .../integrations/huggingface/client.py | 10 +++- .../integrations/huggingface/dataset_card.py | 3 + .../huggingface/dataset_card_template.md | 4 ++ .../src/data_designer/interface/results.py | 8 +++ .../huggingface/test_dataset_card.py | 59 +++++++++++++++++++ 5 files changed, 82 insertions(+), 2 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 8b182b25..a5ada120 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -44,6 +44,7 @@ def upload_dataset( *, private: bool = False, create_pr: bool = False, + description: str | None = None, ) -> str: """Upload dataset to HuggingFace Hub. @@ -58,6 +59,7 @@ def upload_dataset( base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) private: Whether to create private repo create_pr: Whether to create a PR instead of direct push + description: Optional custom description text for dataset card Returns: URL to the uploaded dataset @@ -103,7 +105,7 @@ def upload_dataset( logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") try: - self._upload_dataset_card(repo_id, base_dataset_path, create_pr=create_pr) + self._upload_dataset_card(repo_id, base_dataset_path, create_pr=create_pr, description=description) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e @@ -185,13 +187,16 @@ def upload_dataset( logger.info(f"|-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") return url - def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + def _upload_dataset_card( + self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False, description: str | None = None + ) -> None: """Generate and upload dataset card from metadata.json. 
Args: repo_id: HuggingFace repo ID base_dataset_path: Path to dataset artifacts create_pr: Whether to create a PR instead of direct push + description: Optional custom description text for dataset card Raises: HuggingFaceUploadError: If dataset card generation or upload fails @@ -221,6 +226,7 @@ def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, *, create_ metadata=metadata, sdg_config=sdg_config, repo_id=repo_id, + description=description, ) except Exception as e: raise HuggingFaceUploadError(f"Failed to generate dataset card: {e}") from e diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 7ec3b2de..02960a17 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -27,6 +27,7 @@ def from_metadata( metadata: dict, sdg_config: dict | None, repo_id: str, + description: str | None = None, ) -> DataDesignerDatasetCard: """Create dataset card from metadata.json and sdg.json. @@ -34,6 +35,7 @@ def from_metadata( metadata: Contents of metadata.json sdg_config: Contents of sdg.json (optional) repo_id: HuggingFace repo ID + description: Optional custom description text Returns: DataDesignerDatasetCard instance ready to upload @@ -106,6 +108,7 @@ def from_metadata( "has_processors": len(processor_names) > 0, "processor_names": processor_names, "tags": tags, + "custom_description": description, } # Create card from template diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md index 46de1474..1e63be49 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md @@ -16,6 +16,10 @@ configs: # {{ repo_id.split('/')[-1] | title }} This dataset was generated using **[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner)**, a comprehensive framework for creating high-quality synthetic datasets from scratch or using seed data. +{% if custom_description %} + +{{ custom_description }} +{% endif %} ## About NeMo Data Designer diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index 4281e6f0..3f1072d7 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -105,6 +105,7 @@ def push_to_hub( token: str | None = None, private: bool = False, create_pr: bool = False, + description: str | None = None, ) -> str: """Push dataset to HuggingFace Hub. @@ -122,6 +123,8 @@ def push_to_hub( from `huggingface-cli login`. private: Create private repo create_pr: Create PR instead of direct push + description: Optional custom description text for the dataset card. + Appears after the NeMo Data Designer intro. Returns: URL to the uploaded dataset @@ -130,6 +133,10 @@ def push_to_hub( >>> results = data_designer.create(config, num_records=1000) >>> results.push_to_hub("username/my-synthetic-dataset") 'https://huggingface.co/datasets/username/my-synthetic-dataset' + + >>> # With custom description + >>> description = "This dataset contains synthetic conversations for training chatbots." 
+ >>> results.push_to_hub("username/my-dataset", description=description) """ client = HuggingFaceHubClient(token=token) return client.upload_dataset( @@ -137,4 +144,5 @@ def push_to_hub( base_dataset_path=self.artifact_storage.base_dataset_path, private=private, create_pr=create_pr, + description=description, ) diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py index b2821c6c..40b8ecaa 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -162,3 +162,62 @@ def test_from_metadata_with_processors() -> None: assert '"processor1"' in card_str assert '"processor2"' in card_str assert "Load processor outputs" in card_str + + +def test_from_metadata_with_custom_description() -> None: + """Test creating dataset card with custom description.""" + metadata = { + "target_num_records": 100, + "schema": {"col1": "string", "col2": "int64"}, + "column_statistics": [ + { + "column_name": "col1", + "num_records": 100, + "num_unique": 100, + "num_null": 0, + "simple_dtype": "string", + "column_type": "sampler", + } + ], + } + + description = "This dataset contains synthetic data for testing chatbot responses." + + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=None, + repo_id="test/dataset-with-description", + description=description, + ) + + card_str = str(card) + assert card is not None + assert "This dataset contains synthetic data for testing chatbot responses." in card_str + + +def test_from_metadata_without_custom_description() -> None: + """Test creating dataset card without custom description.""" + metadata = { + "target_num_records": 50, + "schema": {"col1": "string"}, + "column_statistics": [ + { + "column_name": "col1", + "num_records": 50, + "num_unique": 50, + "num_null": 0, + "simple_dtype": "string", + "column_type": "sampler", + } + ], + } + + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=None, + repo_id="test/dataset-no-description", + ) + + card_str = str(card) + assert card is not None + assert "About NeMo Data Designer" in card_str From da2acc881264cbaf677fba65f2165d85e5b1ae36 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 14:38:29 -0700 Subject: [PATCH 04/25] feat: make description required and enhance dataset card design MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Make description parameter required in push_to_hub() - Improve dataset card layout with flexbox header (title + right-aligned tagline) - Add horizontal dividers between sections for visual separation - Add emoji icons to section headers for better readability - Move About NeMo Data Designer section after Citation - Update section order: Description → Quick Start → Dataset Summary → Schema & Statistics → Generation Details → Citation → About - Update all tests to provide required description parameter --- .../integrations/huggingface/client.py | 10 +-- .../integrations/huggingface/dataset_card.py | 4 +- .../huggingface/dataset_card_template.md | 70 +++++++++++-------- .../src/data_designer/interface/results.py | 13 ++-- .../integrations/huggingface/test_client.py | 19 ++++- .../huggingface/test_dataset_card.py | 16 ++++- 6 files changed, 83 insertions(+), 49 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py 
b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index a5ada120..b655c0dd 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -41,10 +41,10 @@ def upload_dataset( self, repo_id: str, base_dataset_path: Path, + description: str, *, private: bool = False, create_pr: bool = False, - description: str | None = None, ) -> str: """Upload dataset to HuggingFace Hub. @@ -57,9 +57,9 @@ def upload_dataset( Args: repo_id: HuggingFace repo ID (e.g., "username/dataset-name") base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) + description: Custom description text for dataset card private: Whether to create private repo create_pr: Whether to create a PR instead of direct push - description: Optional custom description text for dataset card Returns: URL to the uploaded dataset @@ -105,7 +105,7 @@ def upload_dataset( logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") try: - self._upload_dataset_card(repo_id, base_dataset_path, create_pr=create_pr, description=description) + self._upload_dataset_card(repo_id, base_dataset_path, description, create_pr=create_pr) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e @@ -188,15 +188,15 @@ def upload_dataset( return url def _upload_dataset_card( - self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False, description: str | None = None + self, repo_id: str, base_dataset_path: Path, description: str, *, create_pr: bool = False ) -> None: """Generate and upload dataset card from metadata.json. Args: repo_id: HuggingFace repo ID base_dataset_path: Path to dataset artifacts + description: Custom description text for dataset card create_pr: Whether to create a PR instead of direct push - description: Optional custom description text for dataset card Raises: HuggingFaceUploadError: If dataset card generation or upload fails diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 02960a17..43ccb48d 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -27,7 +27,7 @@ def from_metadata( metadata: dict, sdg_config: dict | None, repo_id: str, - description: str | None = None, + description: str, ) -> DataDesignerDatasetCard: """Create dataset card from metadata.json and sdg.json. @@ -35,7 +35,7 @@ def from_metadata( metadata: Contents of metadata.json sdg_config: Contents of sdg.json (optional) repo_id: HuggingFace repo ID - description: Optional custom description text + description: Custom description text Returns: DataDesignerDatasetCard instance ready to upload diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md index 1e63be49..89c87e67 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md @@ -13,35 +13,18 @@ configs: {% endfor %}{% endif %} --- -# {{ repo_id.split('/')[-1] | title }} +
<div style="display: flex; justify-content: space-between; align-items: center;">
+<h1 style="margin: 0;">{{ repo_id.split('/')[-1] | title }}</h1>
+<p style="margin: 0;">Made with ❤️ using 🎨 NeMo Data Designer</p>
+</div>
-This dataset was generated using **[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner)**, a comprehensive framework for creating high-quality synthetic datasets from scratch or using seed data. -{% if custom_description %} +--- {{ custom_description }} -{% endif %} - -## About NeMo Data Designer - -NeMo Data Designer is a general framework for generating high-quality synthetic data that goes beyond simple LLM prompting. It provides: - -- **Diverse data generation** using statistical samplers, LLMs, or existing seed datasets -- **Relationship control** between fields with dependency-aware generation -- **Quality validation** with built-in Python, SQL, and custom local and remote validators -- **LLM-as-a-judge** scoring for quality assessment -- **Fast iteration** with preview mode before full-scale generation -For more information, visit: [https://github.com/NVIDIA-NeMo/DataDesigner](https://github.com/NVIDIA-NeMo/DataDesigner) (`pip install data-designer`) - -## Dataset Summary - -- **Records**: {{ "{:,}".format(num_records) }} -- **Columns**: {{ num_columns }} -{% if target_num_records != num_records %} -- **Completion**: {{ "%.1f" | format(percent_complete) }}% ({{ "{:,}".format(target_num_records) }} requested) -{% endif %} +--- -## Quick Start +## 🚀 Quick Start ```python from datasets import load_dataset @@ -50,13 +33,26 @@ from datasets import load_dataset dataset = load_dataset("{{ repo_id }}", "data", split="train") df = dataset.to_pandas() {% if has_processors %} + # Load processor outputs (if available){% for processor_name in processor_names %} processor_{{ processor_name }} = load_dataset("{{ repo_id }}", "{{ processor_name }}", split="train") df_{{ processor_name }} = processor_{{ processor_name }}.to_pandas() {% endfor %}{% endif %} ``` -## Schema & Statistics +--- + +## 📊 Dataset Summary + +- **📈 Records**: {{ "{:,}".format(num_records) }} +- **📋 Columns**: {{ num_columns }} +{% if target_num_records != num_records %} +- **✅ Completion**: {{ "%.1f" | format(percent_complete) }}% ({{ "{:,}".format(target_num_records) }} requested) +{% endif %} + +--- + +## 📋 Schema & Statistics {% if column_statistics %} | Column | Type | Column Type | Unique (%) | Null (%) | Details | @@ -72,7 +68,9 @@ df_{{ processor_name }} = processor_{{ processor_name }}.to_pandas() {% endfor %} {% endif %} -## Generation Details +--- + +## ⚙️ Generation Details {% if config_types %} Generated with {{ num_columns_configured }} column configuration(s): @@ -80,11 +78,13 @@ Generated with {{ num_columns_configured }} column configuration(s): {% for col_type, count in config_types.items() | sort %} - **{{ col_type }}**: {{ count }} column(s) {% endfor %} + {% endif %} +📄 Full configuration available in [`sdg.json`](sdg.json) and detailed metadata in [`metadata.json`](metadata.json). -Full configuration available in [`sdg.json`](sdg.json) and detailed metadata in [`metadata.json`](metadata.json). +--- -## Citation +## 📚 Citation ```bibtex @misc{nemo-data-designer, @@ -95,3 +95,17 @@ Full configuration available in [`sdg.json`](sdg.json) and detailed metadata in note = {GitHub Repository}, } ``` + +--- + +## 💡 About NeMo Data Designer + +NeMo Data Designer is a general framework for generating high-quality synthetic data that goes beyond simple LLM prompting. 
It provides: + +- **Diverse data generation** using statistical samplers, LLMs, or existing seed datasets +- **Relationship control** between fields with dependency-aware generation +- **Quality validation** with built-in Python, SQL, and custom local and remote validators +- **LLM-as-a-judge** scoring for quality assessment +- **Fast iteration** with preview mode before full-scale generation + +For more information, visit: [https://github.com/NVIDIA-NeMo/DataDesigner](https://github.com/NVIDIA-NeMo/DataDesigner) (`pip install data-designer`) diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index 3f1072d7..cde3b95d 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -101,11 +101,11 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path: def push_to_hub( self, repo_id: str, + description: str, *, token: str | None = None, private: bool = False, create_pr: bool = False, - description: str | None = None, ) -> str: """Push dataset to HuggingFace Hub. @@ -118,25 +118,22 @@ def push_to_hub( Args: repo_id: HuggingFace repo ID (e.g., "username/my-dataset") + description: Custom description text for the dataset card. + Appears after the title. token: HuggingFace API token. If None, the token is automatically resolved from HF_TOKEN environment variable or cached credentials from `huggingface-cli login`. private: Create private repo create_pr: Create PR instead of direct push - description: Optional custom description text for the dataset card. - Appears after the NeMo Data Designer intro. Returns: URL to the uploaded dataset Example: >>> results = data_designer.create(config, num_records=1000) - >>> results.push_to_hub("username/my-synthetic-dataset") - 'https://huggingface.co/datasets/username/my-synthetic-dataset' - - >>> # With custom description >>> description = "This dataset contains synthetic conversations for training chatbots." 
- >>> results.push_to_hub("username/my-dataset", description=description) + >>> results.push_to_hub("username/my-synthetic-dataset", description) + 'https://huggingface.co/datasets/username/my-synthetic-dataset' """ client = HuggingFaceHubClient(token=token) return client.upload_dataset( diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index efc64ffb..25f75f23 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -126,6 +126,7 @@ def test_upload_dataset_creates_repo(mock_hf_api: MagicMock, sample_dataset_path client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) mock_hf_api.create_repo.assert_called_once_with( @@ -144,6 +145,7 @@ def test_upload_dataset_uploads_parquet_files(mock_hf_api: MagicMock, sample_dat client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) # Check that upload_folder was called for parquet files @@ -159,6 +161,7 @@ def test_upload_dataset_uploads_processor_outputs(mock_hf_api: MagicMock, sample client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) # Check that upload_folder was called for processor outputs @@ -174,6 +177,7 @@ def test_upload_dataset_uploads_config_files(mock_hf_api: MagicMock, sample_data client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) # Check that upload_file was called for config files @@ -191,6 +195,7 @@ def test_upload_dataset_returns_url(mock_hf_api: MagicMock, sample_dataset_path: url = client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) assert url == "https://huggingface.co/datasets/test/dataset" @@ -204,6 +209,7 @@ def test_upload_dataset_with_private_repo(mock_hf_api: MagicMock, sample_dataset client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", private=True, ) @@ -223,6 +229,7 @@ def test_upload_dataset_with_create_pr(mock_hf_api: MagicMock, sample_dataset_pa client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", create_pr=True, ) @@ -240,7 +247,7 @@ def test_upload_dataset_card_missing_metadata(tmp_path: Path) -> None: base_path.mkdir() with pytest.raises(HuggingFaceUploadError, match="Failed to read metadata.json"): - client._upload_dataset_card("test/dataset", base_path) + client._upload_dataset_card("test/dataset", base_path, "Test description") def test_upload_dataset_card_calls_push_to_hub(sample_dataset_path: Path) -> None: @@ -251,7 +258,7 @@ def test_upload_dataset_card_calls_push_to_hub(sample_dataset_path: Path) -> Non mock_card = MagicMock() mock_card_class.from_metadata.return_value = mock_card - client._upload_dataset_card("test/dataset", sample_dataset_path) + client._upload_dataset_card("test/dataset", sample_dataset_path, "Test description") # Verify card was created from metadata mock_card_class.from_metadata.assert_called_once() @@ -287,6 +294,7 @@ def test_upload_dataset_without_processors(mock_hf_api: MagicMock, tmp_path: Pat client.upload_dataset( repo_id="test/dataset", base_dataset_path=base_path, + description="Test dataset", ) # Should only upload parquet files, not 
processors @@ -318,6 +326,7 @@ def test_upload_dataset_without_sdg_config(mock_hf_api: MagicMock, tmp_path: Pat client.upload_dataset( repo_id="test/dataset", base_dataset_path=base_path, + description="Test dataset", ) # Should only upload metadata.json, not sdg.json @@ -337,6 +346,7 @@ def test_upload_dataset_multiple_processors(mock_hf_api: MagicMock, sample_datas client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) # Check that both processors were uploaded @@ -475,6 +485,7 @@ def test_upload_dataset_invalid_repo_id(mock_hf_api: MagicMock, sample_dataset_p client.upload_dataset( repo_id="invalid-repo-id", # Missing slash base_dataset_path=sample_dataset_path, + description="Test dataset", ) @@ -493,6 +504,7 @@ def test_upload_dataset_authentication_error(mock_hf_api: MagicMock, sample_data client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) @@ -511,6 +523,7 @@ def test_upload_dataset_permission_error(mock_hf_api: MagicMock, sample_dataset_ client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) @@ -522,7 +535,7 @@ def test_upload_dataset_card_invalid_json(tmp_path: Path) -> None: (base_path / "metadata.json").write_text("invalid json") with pytest.raises(HuggingFaceUploadError, match="Failed to parse metadata.json"): - client._upload_dataset_card("test/dataset", base_path) + client._upload_dataset_card("test/dataset", base_path, "Test description") def test_update_metadata_paths(tmp_path: Path) -> None: diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py index 40b8ecaa..956720e9 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -37,6 +37,7 @@ def test_from_metadata_minimal() -> None: metadata=metadata, sdg_config=None, repo_id="test/dataset", + description="Test dataset for unit testing.", ) # Verify card was created @@ -87,6 +88,7 @@ def test_from_metadata_with_sdg_config() -> None: metadata=metadata, sdg_config=sdg_config, repo_id="test/dataset-with-config", + description="Test dataset with SDG config.", ) # Verify card includes config info @@ -118,6 +120,7 @@ def test_from_metadata_with_llm_columns() -> None: metadata=metadata, sdg_config=None, repo_id="test/llm-dataset", + description="Test dataset with LLM columns.", ) # Verify LLM statistics are included @@ -153,6 +156,7 @@ def test_from_metadata_with_processors() -> None: metadata=metadata, sdg_config=None, repo_id="test/dataset-with-processors", + description="Test dataset with processor outputs.", ) card_str = str(card) @@ -195,8 +199,8 @@ def test_from_metadata_with_custom_description() -> None: assert "This dataset contains synthetic data for testing chatbot responses." 
in card_str -def test_from_metadata_without_custom_description() -> None: - """Test creating dataset card without custom description.""" +def test_from_metadata_description_placement() -> None: + """Test that description appears in the correct location.""" metadata = { "target_num_records": 50, "schema": {"col1": "string"}, @@ -215,9 +219,15 @@ def test_from_metadata_without_custom_description() -> None: card = DataDesignerDatasetCard.from_metadata( metadata=metadata, sdg_config=None, - repo_id="test/dataset-no-description", + repo_id="test/dataset-description-placement", + description="Test description placement.", ) card_str = str(card) assert card is not None + assert "Test description placement." in card_str assert "About NeMo Data Designer" in card_str + # Description should appear before Dataset Summary + desc_pos = card_str.find("Test description placement.") + summary_pos = card_str.find("Dataset Summary") + assert desc_pos < summary_pos From 5b83a1a24c5b41fc2b94173a9e73bc0ae31f979e Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 14:50:29 -0700 Subject: [PATCH 05/25] fix license headers --- .../src/data_designer/integrations/huggingface/__init__.py | 2 +- .../src/data_designer/integrations/huggingface/client.py | 2 +- .../src/data_designer/integrations/huggingface/dataset_card.py | 2 +- .../data-designer/tests/integrations/huggingface/__init__.py | 2 +- .../data-designer/tests/integrations/huggingface/test_client.py | 2 +- .../tests/integrations/huggingface/test_dataset_card.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py index 99b9d93e..bbdaddff 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceUploadError diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index b655c0dd..c73c6f89 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 43ccb48d..606a54f9 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/packages/data-designer/tests/integrations/huggingface/__init__.py b/packages/data-designer/tests/integrations/huggingface/__init__.py index 1a8431c3..52a7a9da 100644 --- a/packages/data-designer/tests/integrations/huggingface/__init__.py +++ b/packages/data-designer/tests/integrations/huggingface/__init__.py @@ -1,2 +1,2 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 25f75f23..b946c473 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py index 956720e9..aa573cd6 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations From 0ecba21c1f0b6d8d842bcc18f9f9b1719fbcd27b Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:02:42 -0700 Subject: [PATCH 06/25] remove modality deteciton --- .../integrations/huggingface/dataset_card.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 606a54f9..766bde8c 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -71,20 +71,8 @@ def from_metadata( if "file_paths" in metadata and "processor-files" in metadata["file_paths"]: processor_names = list(metadata["file_paths"]["processor-files"].keys()) - # Determine modalities based on column types - modalities = set() - has_text = False - for stat in column_stats: - col_type = stat.get("column_type", "") - if col_type in ["llm-text", "llm-code", "llm-structured", "llm-judge"]: - has_text = True - - if has_text: - modalities.add("text") - modalities.add("tabular") - # Prepare tags - tags = ["synthetic", "datadesigner"] + list(modalities) + tags = ["synthetic", "datadesigner"] # Prepare CardData (metadata for YAML frontmatter) card_data = CardData( From 08b8aa6021674b25dd60c2493f794ebe8de8dded Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:06:29 -0700 Subject: [PATCH 07/25] break up upload_dataset --- .../integrations/huggingface/client.py | 107 +++++++++++++----- 1 file changed, 79 insertions(+), 28 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index c73c6f89..3120934f 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -72,6 +72,32 @@ def upload_dataset( self._validate_repo_id(repo_id) self._validate_dataset_path(base_dataset_path) + self._create_or_get_repo(repo_id, private=private) + + logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") + try: + self._upload_dataset_card(repo_id, base_dataset_path, description, create_pr=create_pr) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e + + self._upload_main_dataset_files(repo_id, base_dataset_path, create_pr=create_pr) + self._upload_processor_files(repo_id, base_dataset_path, create_pr=create_pr) + self._upload_config_files(repo_id, base_dataset_path, create_pr=create_pr) + + url = f"https://huggingface.co/datasets/{repo_id}" + logger.info(f"|-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") + return url + + def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: + """Create or get existing repository on HuggingFace Hub. 
+ + Args: + repo_id: HuggingFace repo ID + private: Whether to create private repo + + Raises: + HuggingFaceUploadError: If repository creation fails + """ logger.info(f"|-- {RandomEmoji.working()} Checking if repository exists...") try: repo_exists = self._api.repo_exists(repo_id=repo_id, repo_type="dataset") @@ -103,12 +129,17 @@ def upload_dataset( except Exception as e: raise HuggingFaceUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e - logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") - try: - self._upload_dataset_card(repo_id, base_dataset_path, description, create_pr=create_pr) - except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e + def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + """Upload main parquet dataset files. + + Args: + repo_id: HuggingFace repo ID + base_dataset_path: Path to dataset directory + create_pr: Whether to create a PR instead of direct push + Raises: + HuggingFaceUploadError: If upload fails + """ logger.info(f"|-- {RandomEmoji.loading()} Uploading main dataset files...") parquet_folder = base_dataset_path / "parquet-files" try: @@ -123,28 +154,52 @@ def upload_dataset( except Exception as e: raise HuggingFaceUploadError(f"Failed to upload parquet files: {e}") from e + def _upload_processor_files(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + """Upload processor output files. + + Args: + repo_id: HuggingFace repo ID + base_dataset_path: Path to dataset directory + create_pr: Whether to create a PR instead of direct push + + Raises: + HuggingFaceUploadError: If upload fails + """ processors_folder = base_dataset_path / "processors-files" - if processors_folder.exists(): - processor_dirs = [d for d in processors_folder.iterdir() if d.is_dir()] - if processor_dirs: - logger.info( - f"|-- {RandomEmoji.loading()} Uploading processor outputs ({len(processor_dirs)} processors)..." + if not processors_folder.exists(): + return + + processor_dirs = [d for d in processors_folder.iterdir() if d.is_dir()] + if not processor_dirs: + return + + logger.info(f"|-- {RandomEmoji.loading()} Uploading processor outputs ({len(processor_dirs)} processors)...") + for processor_dir in processor_dirs: + try: + self._api.upload_folder( + repo_id=repo_id, + folder_path=str(processor_dir), + path_in_repo=processor_dir.name, + repo_type="dataset", + commit_message=f"Upload {processor_dir.name} processor outputs", + create_pr=create_pr, ) - for processor_dir in processor_dirs: - try: - self._api.upload_folder( - repo_id=repo_id, - folder_path=str(processor_dir), - path_in_repo=processor_dir.name, - repo_type="dataset", - commit_message=f"Upload {processor_dir.name} processor outputs", - create_pr=create_pr, - ) - except Exception as e: - raise HuggingFaceUploadError( - f"Failed to upload processor outputs for '{processor_dir.name}': {e}" - ) from e + except Exception as e: + raise HuggingFaceUploadError( + f"Failed to upload processor outputs for '{processor_dir.name}': {e}" + ) from e + + def _upload_config_files(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + """Upload configuration files (sdg.json and metadata.json). 
+ + Args: + repo_id: HuggingFace repo ID + base_dataset_path: Path to dataset directory + create_pr: Whether to create a PR instead of direct push + Raises: + HuggingFaceUploadError: If upload fails + """ logger.info(f"|-- {RandomEmoji.loading()} Uploading configuration files...") sdg_path = base_dataset_path / "sdg.json" @@ -183,10 +238,6 @@ def upload_dataset( except Exception as e: raise HuggingFaceUploadError(f"Failed to upload metadata.json: {e}") from e - url = f"https://huggingface.co/datasets/{repo_id}" - logger.info(f"|-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") - return url - def _upload_dataset_card( self, repo_id: str, base_dataset_path: Path, description: str, *, create_pr: bool = False ) -> None: From ddd562900facd45387b9e38b397a2b1a61739964 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:09:56 -0700 Subject: [PATCH 08/25] make token private --- .../data_designer/integrations/huggingface/client.py | 11 ++++++++++- .../tests/integrations/huggingface/test_client.py | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 3120934f..8cfbb9dd 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -34,9 +34,18 @@ def __init__(self, token: str | None = None): resolved from HF_TOKEN environment variable or cached credentials from `huggingface-cli login`. """ - self.token = token + self._token = token self._api = HfApi(token=token) + @property + def has_token(self) -> bool: + """Check if a token was explicitly provided. + + Returns: + True if a token was provided during initialization, False otherwise. 
+ """ + return self._token is not None + def upload_dataset( self, repo_id: str, diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index b946c473..7c54722e 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -108,14 +108,14 @@ def test_client_initialization() -> None: """Test HuggingFaceHubClient initialization.""" with patch("data_designer.integrations.huggingface.client.HfApi"): client = HuggingFaceHubClient(token="test-token") - assert client.token == "test-token" + assert client.has_token is True def test_client_initialization_no_token() -> None: """Test HuggingFaceHubClient initialization without token.""" with patch("data_designer.integrations.huggingface.client.HfApi"): client = HuggingFaceHubClient() - assert client.token is None + assert client.has_token is False def test_upload_dataset_creates_repo(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: From 4590c7d4127b157bcfd44976207355408e978de9 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:15:46 -0700 Subject: [PATCH 09/25] HuggingFace -> Hugging Face --- .../integrations/huggingface/client.py | 32 +++++++++---------- .../integrations/huggingface/dataset_card.py | 6 ++-- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 8cfbb9dd..ffd732bc 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -20,17 +20,17 @@ class HuggingFaceUploadError(DataDesignerError): - """Error during HuggingFace dataset upload.""" + """Error during Hugging Face dataset upload.""" class HuggingFaceHubClient: - """Client for interacting with HuggingFace Hub to upload datasets.""" + """Client for interacting with Hugging Face Hub to upload datasets.""" def __init__(self, token: str | None = None): - """Initialize HuggingFace Hub client. + """Initialize Hugging Face Hub client. Args: - token: HuggingFace API token. If None, the token is automatically + token: Hugging Face API token. If None, the token is automatically resolved from HF_TOKEN environment variable or cached credentials from `huggingface-cli login`. """ @@ -55,7 +55,7 @@ def upload_dataset( private: bool = False, create_pr: bool = False, ) -> str: - """Upload dataset to HuggingFace Hub. + """Upload dataset to Hugging Face Hub. Uploads the complete dataset including: - Main parquet batch files from parquet-files/ → data/ @@ -64,7 +64,7 @@ def upload_dataset( - Auto-generated README.md (dataset card) Args: - repo_id: HuggingFace repo ID (e.g., "username/dataset-name") + repo_id: Hugging Face dataset repo ID (e.g., "username/dataset-name") base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) 
description: Custom description text for dataset card private: Whether to create private repo @@ -76,7 +76,7 @@ def upload_dataset( Raises: HuggingFaceUploadError: If validation fails or upload encounters errors """ - logger.info(f"🤗 Uploading dataset to HuggingFace Hub: {repo_id}") + logger.info(f"🤗 Uploading dataset to Hugging Face Hub: {repo_id}") self._validate_repo_id(repo_id) self._validate_dataset_path(base_dataset_path) @@ -98,10 +98,10 @@ def upload_dataset( return url def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: - """Create or get existing repository on HuggingFace Hub. + """Create or get existing repository on Hugging Face Hub. Args: - repo_id: HuggingFace repo ID + repo_id: Hugging Face dataset repo ID private: Whether to create private repo Raises: @@ -124,7 +124,7 @@ def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: except HfHubHTTPError as e: if e.response.status_code == 401: raise HuggingFaceUploadError( - "Authentication failed. Please provide a valid HuggingFace token. " + "Authentication failed. Please provide a valid Hugging Face token. " "You can set it via the token parameter or HF_TOKEN environment variable, " "or run 'huggingface-cli login'." ) from e @@ -142,7 +142,7 @@ def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path, *, c """Upload main parquet dataset files. Args: - repo_id: HuggingFace repo ID + repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset directory create_pr: Whether to create a PR instead of direct push @@ -167,7 +167,7 @@ def _upload_processor_files(self, repo_id: str, base_dataset_path: Path, *, crea """Upload processor output files. Args: - repo_id: HuggingFace repo ID + repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset directory create_pr: Whether to create a PR instead of direct push @@ -202,7 +202,7 @@ def _upload_config_files(self, repo_id: str, base_dataset_path: Path, *, create_ """Upload configuration files (sdg.json and metadata.json). Args: - repo_id: HuggingFace repo ID + repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset directory create_pr: Whether to create a PR instead of direct push @@ -253,7 +253,7 @@ def _upload_dataset_card( """Generate and upload dataset card from metadata.json. Args: - repo_id: HuggingFace repo ID + repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset artifacts description: Custom description text for dataset card create_pr: Whether to create a PR instead of direct push @@ -298,7 +298,7 @@ def _upload_dataset_card( @staticmethod def _validate_repo_id(repo_id: str) -> None: - """Validate HuggingFace repository ID format. + """Validate Hugging Face dataset repository ID format. Args: repo_id: Repository ID to validate @@ -320,7 +320,7 @@ def _validate_repo_id(repo_id: str) -> None: @staticmethod def _update_metadata_paths(metadata_path: Path) -> dict: - """Update file paths in metadata.json to match HuggingFace Hub structure. + """Update file paths in metadata.json to match Hugging Face dataset repository structure. 
Local paths: - parquet-files/batch_00000.parquet → data/batch_00000.parquet diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 766bde8c..166750fc 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -34,7 +34,7 @@ def from_metadata( Args: metadata: Contents of metadata.json sdg_config: Contents of sdg.json (optional) - repo_id: HuggingFace repo ID + repo_id: Hugging Face dataset repo ID description: Custom description text Returns: @@ -105,13 +105,13 @@ def from_metadata( @staticmethod def _compute_size_category(num_records: int) -> str: - """Compute HuggingFace size category from record count. + """Compute Hugging Face dataset size category from record count. Args: num_records: Number of records in the dataset Returns: - Size category string for HuggingFace Hub tags + Size category string for Hugging Face dataset repository tags """ if num_records < 1000: return "n<1K" From 5113069d767d998fdd2071bc6ca7322ecc272d88 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:19:39 -0700 Subject: [PATCH 10/25] remove inline imports --- .../tests/integrations/huggingface/test_client.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 7c54722e..5fedf608 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -8,6 +8,7 @@ from unittest.mock import MagicMock, patch import pytest +from huggingface_hub.utils import HfHubHTTPError from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceUploadError @@ -491,8 +492,6 @@ def test_upload_dataset_invalid_repo_id(mock_hf_api: MagicMock, sample_dataset_p def test_upload_dataset_authentication_error(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: """Test upload_dataset handles authentication errors.""" - from huggingface_hub.utils import HfHubHTTPError - client = HuggingFaceHubClient(token="invalid-token") # Mock 401 authentication error @@ -510,8 +509,6 @@ def test_upload_dataset_authentication_error(mock_hf_api: MagicMock, sample_data def test_upload_dataset_permission_error(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: """Test upload_dataset handles permission errors.""" - from huggingface_hub.utils import HfHubHTTPError - client = HuggingFaceHubClient(token="test-token") # Mock 403 permission error From 02182f991a9c748ae6ef8069c321fb917609cfb9 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:32:33 -0700 Subject: [PATCH 11/25] simplify tests + remvoe create pr option for simplicity --- .../integrations/huggingface/client.py | 30 +-- .../src/data_designer/interface/results.py | 3 - .../integrations/huggingface/test_client.py | 184 ++++++++---------- 3 files changed, 95 insertions(+), 122 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index ffd732bc..e0a2a266 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ 
b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -53,7 +53,6 @@ def upload_dataset( description: str, *, private: bool = False, - create_pr: bool = False, ) -> str: """Upload dataset to Hugging Face Hub. @@ -68,7 +67,6 @@ def upload_dataset( base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) description: Custom description text for dataset card private: Whether to create private repo - create_pr: Whether to create a PR instead of direct push Returns: URL to the uploaded dataset @@ -85,13 +83,13 @@ def upload_dataset( logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") try: - self._upload_dataset_card(repo_id, base_dataset_path, description, create_pr=create_pr) + self._upload_dataset_card(repo_id, base_dataset_path, description) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e - self._upload_main_dataset_files(repo_id, base_dataset_path, create_pr=create_pr) - self._upload_processor_files(repo_id, base_dataset_path, create_pr=create_pr) - self._upload_config_files(repo_id, base_dataset_path, create_pr=create_pr) + self._upload_main_dataset_files(repo_id, base_dataset_path) + self._upload_processor_files(repo_id, base_dataset_path) + self._upload_config_files(repo_id, base_dataset_path) url = f"https://huggingface.co/datasets/{repo_id}" logger.info(f"|-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") @@ -138,13 +136,12 @@ def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: except Exception as e: raise HuggingFaceUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e - def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path) -> None: """Upload main parquet dataset files. Args: repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset directory - create_pr: Whether to create a PR instead of direct push Raises: HuggingFaceUploadError: If upload fails @@ -158,18 +155,16 @@ def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path, *, c path_in_repo="data", repo_type="dataset", commit_message="Upload main dataset files", - create_pr=create_pr, ) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload parquet files: {e}") from e - def _upload_processor_files(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + def _upload_processor_files(self, repo_id: str, base_dataset_path: Path) -> None: """Upload processor output files. Args: repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset directory - create_pr: Whether to create a PR instead of direct push Raises: HuggingFaceUploadError: If upload fails @@ -191,20 +186,18 @@ def _upload_processor_files(self, repo_id: str, base_dataset_path: Path, *, crea path_in_repo=processor_dir.name, repo_type="dataset", commit_message=f"Upload {processor_dir.name} processor outputs", - create_pr=create_pr, ) except Exception as e: raise HuggingFaceUploadError( f"Failed to upload processor outputs for '{processor_dir.name}': {e}" ) from e - def _upload_config_files(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + def _upload_config_files(self, repo_id: str, base_dataset_path: Path) -> None: """Upload configuration files (sdg.json and metadata.json). 
Args: repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset directory - create_pr: Whether to create a PR instead of direct push Raises: HuggingFaceUploadError: If upload fails @@ -220,7 +213,6 @@ def _upload_config_files(self, repo_id: str, base_dataset_path: Path, *, create_ path_in_repo="sdg.json", repo_type="dataset", commit_message="Upload sdg.json", - create_pr=create_pr, ) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload sdg.json: {e}") from e @@ -240,23 +232,19 @@ def _upload_config_files(self, repo_id: str, base_dataset_path: Path, *, create_ path_in_repo="metadata.json", repo_type="dataset", commit_message="Upload metadata.json", - create_pr=create_pr, ) finally: Path(tmp_path).unlink() except Exception as e: raise HuggingFaceUploadError(f"Failed to upload metadata.json: {e}") from e - def _upload_dataset_card( - self, repo_id: str, base_dataset_path: Path, description: str, *, create_pr: bool = False - ) -> None: + def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, description: str) -> None: """Generate and upload dataset card from metadata.json. Args: repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset artifacts description: Custom description text for dataset card - create_pr: Whether to create a PR instead of direct push Raises: HuggingFaceUploadError: If dataset card generation or upload fails @@ -292,7 +280,7 @@ def _upload_dataset_card( raise HuggingFaceUploadError(f"Failed to generate dataset card: {e}") from e try: - card.push_to_hub(repo_id, repo_type="dataset", create_pr=create_pr) + card.push_to_hub(repo_id, repo_type="dataset") except Exception as e: raise HuggingFaceUploadError(f"Failed to push dataset card to hub: {e}") from e diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index cde3b95d..a37a5483 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -105,7 +105,6 @@ def push_to_hub( *, token: str | None = None, private: bool = False, - create_pr: bool = False, ) -> str: """Push dataset to HuggingFace Hub. @@ -124,7 +123,6 @@ def push_to_hub( resolved from HF_TOKEN environment variable or cached credentials from `huggingface-cli login`. private: Create private repo - create_pr: Create PR instead of direct push Returns: URL to the uploaded dataset @@ -140,6 +138,5 @@ def push_to_hub( repo_id=repo_id, base_dataset_path=self.artifact_storage.base_dataset_path, private=private, - create_pr=create_pr, description=description, ) diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 5fedf608..6ba24647 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -22,6 +22,15 @@ def mock_hf_api() -> MagicMock: yield api_instance +@pytest.fixture +def mock_dataset_card() -> MagicMock: + """Mock DataDesignerDatasetCard for testing.""" + with patch("data_designer.integrations.huggingface.client.DataDesignerDatasetCard") as mock: + card_instance = MagicMock() + mock.from_metadata.return_value = card_instance + yield mock + + @pytest.fixture def sample_dataset_path(tmp_path: Path) -> Path: """Create a sample dataset directory structure. 
@@ -119,67 +128,68 @@ def test_client_initialization_no_token() -> None: assert client.has_token is False -def test_upload_dataset_creates_repo(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_creates_repo( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test that upload_dataset creates a repository.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - ) - - mock_hf_api.create_repo.assert_called_once_with( + client.upload_dataset( repo_id="test/dataset", - repo_type="dataset", - exist_ok=True, - private=False, + base_dataset_path=sample_dataset_path, + description="Test dataset", ) + # Verify repo creation was called + mock_hf_api.create_repo.assert_called_once() + assert mock_hf_api.create_repo.call_args.kwargs["repo_id"] == "test/dataset" + -def test_upload_dataset_uploads_parquet_files(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_uploads_parquet_files( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test that upload_dataset uploads parquet files.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) # Check that upload_folder was called for parquet files calls = [call for call in mock_hf_api.upload_folder.call_args_list if call.kwargs["path_in_repo"] == "data"] assert len(calls) >= 1 -def test_upload_dataset_uploads_processor_outputs(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_uploads_processor_outputs( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test that upload_dataset uploads processor outputs.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) # Check that upload_folder was called for processor outputs calls = [call for call in mock_hf_api.upload_folder.call_args_list if "processor1" in call.kwargs["path_in_repo"]] assert len(calls) >= 1 -def test_upload_dataset_uploads_config_files(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_uploads_config_files( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test that upload_dataset uploads sdg.json and metadata.json.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) # Check that upload_file was called for config files upload_file_calls = mock_hf_api.upload_file.call_args_list @@ -188,31 +198,33 @@ def 
test_upload_dataset_uploads_config_files(mock_hf_api: MagicMock, sample_data assert "metadata.json" in uploaded_files -def test_upload_dataset_returns_url(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_returns_url( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test that upload_dataset returns the correct URL.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - url = client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - ) + url = client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) assert url == "https://huggingface.co/datasets/test/dataset" -def test_upload_dataset_with_private_repo(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_with_private_repo( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test upload_dataset with private repository.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - private=True, - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + private=True, + ) mock_hf_api.create_repo.assert_called_once_with( repo_id="test/dataset", @@ -222,23 +234,6 @@ def test_upload_dataset_with_private_repo(mock_hf_api: MagicMock, sample_dataset ) -def test_upload_dataset_with_create_pr(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: - """Test upload_dataset with create_pr option.""" - client = HuggingFaceHubClient(token="test-token") - - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - create_pr=True, - ) - - # Verify create_pr is passed to upload operations - for call in mock_hf_api.upload_folder.call_args_list: - assert call.kwargs["create_pr"] is True - - def test_upload_dataset_card_missing_metadata(tmp_path: Path) -> None: """Test _upload_dataset_card raises error when metadata.json is missing.""" client = HuggingFaceHubClient(token="test-token") @@ -261,22 +256,14 @@ def test_upload_dataset_card_calls_push_to_hub(sample_dataset_path: Path) -> Non client._upload_dataset_card("test/dataset", sample_dataset_path, "Test description") - # Verify card was created from metadata + # Verify card was created and pushed mock_card_class.from_metadata.assert_called_once() - call_kwargs = mock_card_class.from_metadata.call_args.kwargs - assert call_kwargs["repo_id"] == "test/dataset" - assert "metadata" in call_kwargs - assert "sdg_config" in call_kwargs - - # Verify card was pushed to hub - mock_card.push_to_hub.assert_called_once_with( - "test/dataset", - repo_type="dataset", - create_pr=False, - ) + mock_card.push_to_hub.assert_called_once() -def test_upload_dataset_without_processors(mock_hf_api: MagicMock, tmp_path: Path) -> None: +def test_upload_dataset_without_processors( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, tmp_path: Path +) -> None: """Test upload_dataset when no processor outputs exist.""" # Create dataset path without processors directory base_path = tmp_path / "dataset" @@ -291,12 +278,11 @@ def 
test_upload_dataset_without_processors(mock_hf_api: MagicMock, tmp_path: Pat client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=base_path, - description="Test dataset", - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + description="Test dataset", + ) # Should only upload parquet files, not processors folder_calls = mock_hf_api.upload_folder.call_args_list @@ -307,7 +293,9 @@ def test_upload_dataset_without_processors(mock_hf_api: MagicMock, tmp_path: Pat assert len(processor_calls) == 0 # No processor files -def test_upload_dataset_without_sdg_config(mock_hf_api: MagicMock, tmp_path: Path) -> None: +def test_upload_dataset_without_sdg_config( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, tmp_path: Path +) -> None: """Test upload_dataset when sdg.json doesn't exist.""" base_path = tmp_path / "dataset" base_path.mkdir() @@ -323,12 +311,11 @@ def test_upload_dataset_without_sdg_config(mock_hf_api: MagicMock, tmp_path: Pat client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=base_path, - description="Test dataset", - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + description="Test dataset", + ) # Should only upload metadata.json, not sdg.json file_calls = mock_hf_api.upload_file.call_args_list @@ -339,16 +326,17 @@ def test_upload_dataset_without_sdg_config(mock_hf_api: MagicMock, tmp_path: Pat assert "sdg.json" not in uploaded_files -def test_upload_dataset_multiple_processors(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_multiple_processors( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test that multiple processor outputs are uploaded correctly.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) # Check that both processors were uploaded folder_calls = mock_hf_api.upload_folder.call_args_list From 9b99aed18becc5022dc5e98d28a7bec5935e4c1d Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:46:49 -0700 Subject: [PATCH 12/25] Update packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- .../src/data_designer/integrations/huggingface/dataset_card.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 166750fc..04bc6324 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -91,7 +91,7 @@ def from_metadata( "column_statistics": column_stats, "num_columns_configured": num_columns_configured, "config_types": config_types, - "percent_complete": 100 * actual_num_records / (target_num_records + 1e-10), + "percent_complete": 100 * 
actual_num_records / target_num_records if target_num_records > 0 else 0, "current_year": datetime.now().year, "has_processors": len(processor_names) > 0, "processor_names": processor_names, From ce05fa18b40cf4086179b9cd6d2ee8d15a39fc56 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:51:54 -0700 Subject: [PATCH 13/25] use consistent indentaion --- .../data_designer/engine/dataset_builders/artifact_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py index 905b0350..0d22bb89 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py @@ -259,7 +259,7 @@ def write_metadata(self, metadata: dict) -> Path: """ self.mkdir_if_needed(self.base_dataset_path) with open(self.metadata_file_path, "w") as file: - json.dump(metadata, file, indent=4, sort_keys=True) + json.dump(metadata, file, indent=2, sort_keys=True) return self.metadata_file_path def update_metadata(self, updates: dict) -> Path: From 243c08739844f871d7cfa9b84a5aa4777367df1d Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:54:33 -0700 Subject: [PATCH 14/25] fix temp file clean up --- .../integrations/huggingface/client.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index e0a2a266..b1670598 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -219,24 +219,25 @@ def _upload_config_files(self, repo_id: str, base_dataset_path: Path) -> None: metadata_path = base_dataset_path / "metadata.json" if metadata_path.exists(): + tmp_path = None try: updated_metadata = self._update_metadata_paths(metadata_path) with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp_file: json.dump(updated_metadata, tmp_file, indent=2) tmp_path = tmp_file.name - try: - self._api.upload_file( - repo_id=repo_id, - path_or_fileobj=tmp_path, - path_in_repo="metadata.json", - repo_type="dataset", - commit_message="Upload metadata.json", - ) - finally: - Path(tmp_path).unlink() + self._api.upload_file( + repo_id=repo_id, + path_or_fileobj=tmp_path, + path_in_repo="metadata.json", + repo_type="dataset", + commit_message="Upload metadata.json", + ) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload metadata.json: {e}") from e + finally: + if tmp_path and Path(tmp_path).exists(): + Path(tmp_path).unlink() def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, description: str) -> None: """Generate and upload dataset card from metadata.json. 
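The cleanup fix in PATCH 14 above boils down to one pattern: create the temporary file with delete=False, record its path, and remove it in a finally block that tolerates both early failures (tmp_path still None) and an upload that has already consumed the file. A minimal standalone sketch of that pattern follows; the helper name and the `upload` callable are illustrative stand-ins (the real code passes the path to HfApi.upload_file), not part of the package's API:

    import json
    import tempfile
    from pathlib import Path


    def upload_json_via_tempfile(payload: dict, upload) -> None:
        # Write payload to a temp file, hand the path to `upload`, and always clean up.
        tmp_path = None
        try:
            # delete=False keeps the file on disk after the `with` block so the
            # uploader can reopen it by path.
            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp_file:
                json.dump(payload, tmp_file, indent=2)
                tmp_path = tmp_file.name
            upload(tmp_path)  # stand-in for HfApi.upload_file(path_or_fileobj=tmp_path, ...)
        finally:
            # tmp_path is still None if NamedTemporaryFile itself failed; guard the unlink.
            if tmp_path and Path(tmp_path).exists():
                Path(tmp_path).unlink()

Compared to the previous nested try/finally, this keeps a single cleanup site that runs whether the JSON dump, the upload, or the error-wrapping raise fails, which is what the patch above is after.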
From de61805b4f9c2a6b2f216c6395562e083a290e55 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 2 Feb 2026 10:24:12 -0700 Subject: [PATCH 15/25] huggingface hub already a dep in engine --- packages/data-designer/pyproject.toml | 1 - uv.lock | 2 -- 2 files changed, 3 deletions(-) diff --git a/packages/data-designer/pyproject.toml b/packages/data-designer/pyproject.toml index 883e18ea..31076704 100644 --- a/packages/data-designer/pyproject.toml +++ b/packages/data-designer/pyproject.toml @@ -22,7 +22,6 @@ classifiers = [ dependencies = [ "data-designer-config", "data-designer-engine", - "huggingface-hub>=1.0.1,<2", "prompt-toolkit>=3.0.0,<4", "typer>=0.12.0,<1", ] diff --git a/uv.lock b/uv.lock index 46716729..279f21de 100644 --- a/uv.lock +++ b/uv.lock @@ -690,7 +690,6 @@ source = { editable = "packages/data-designer" } dependencies = [ { name = "data-designer-config" }, { name = "data-designer-engine" }, - { name = "huggingface-hub" }, { name = "prompt-toolkit" }, { name = "typer" }, ] @@ -699,7 +698,6 @@ dependencies = [ requires-dist = [ { name = "data-designer-config", editable = "packages/data-designer-config" }, { name = "data-designer-engine", editable = "packages/data-designer-engine" }, - { name = "huggingface-hub", specifier = ">=1.0.1,<2" }, { name = "prompt-toolkit", specifier = ">=3.0.0,<4" }, { name = "typer", specifier = ">=0.12.0,<1" }, ] From f0e3fcb5f6cbe4bad73d6da465c25b08d317240d Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 2 Feb 2026 10:28:30 -0700 Subject: [PATCH 16/25] add missing spaces --- .../integrations/huggingface/client.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index b1670598..949a390b 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -81,7 +81,7 @@ def upload_dataset( self._create_or_get_repo(repo_id, private=private) - logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") + logger.info(f" |-- {RandomEmoji.data()} Uploading dataset card...") try: self._upload_dataset_card(repo_id, base_dataset_path, description) except Exception as e: @@ -92,7 +92,7 @@ def upload_dataset( self._upload_config_files(repo_id, base_dataset_path) url = f"https://huggingface.co/datasets/{repo_id}" - logger.info(f"|-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") + logger.info(f" |-- {RandomEmoji.success()} Dataset uploaded successfully! 
View at: {url}") return url def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: @@ -105,13 +105,13 @@ def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: Raises: HuggingFaceUploadError: If repository creation fails """ - logger.info(f"|-- {RandomEmoji.working()} Checking if repository exists...") + logger.info(f" |-- {RandomEmoji.working()} Checking if repository exists...") try: repo_exists = self._api.repo_exists(repo_id=repo_id, repo_type="dataset") if repo_exists: - logger.info(f"|-- {RandomEmoji.success()} Repository already exists, updating content...") + logger.info(f" |-- {RandomEmoji.success()} Repository already exists, updating content...") else: - logger.info(f"|-- {RandomEmoji.working()} Creating new repository...") + logger.info(f" |-- {RandomEmoji.working()} Creating new repository...") self._api.create_repo( repo_id=repo_id, @@ -146,7 +146,7 @@ def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path) -> N Raises: HuggingFaceUploadError: If upload fails """ - logger.info(f"|-- {RandomEmoji.loading()} Uploading main dataset files...") + logger.info(f" |-- {RandomEmoji.loading()} Uploading main dataset files...") parquet_folder = base_dataset_path / "parquet-files" try: self._api.upload_folder( @@ -177,7 +177,7 @@ def _upload_processor_files(self, repo_id: str, base_dataset_path: Path) -> None if not processor_dirs: return - logger.info(f"|-- {RandomEmoji.loading()} Uploading processor outputs ({len(processor_dirs)} processors)...") + logger.info(f" |-- {RandomEmoji.loading()} Uploading processor outputs ({len(processor_dirs)} processors)...") for processor_dir in processor_dirs: try: self._api.upload_folder( @@ -202,7 +202,7 @@ def _upload_config_files(self, repo_id: str, base_dataset_path: Path) -> None: Raises: HuggingFaceUploadError: If upload fails """ - logger.info(f"|-- {RandomEmoji.loading()} Uploading configuration files...") + logger.info(f" |-- {RandomEmoji.loading()} Uploading configuration files...") sdg_path = base_dataset_path / "sdg.json" if sdg_path.exists(): From 99c61fedafb4289d89aa57288d90b25aa44d627c Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 2 Feb 2026 11:36:17 -0700 Subject: [PATCH 17/25] reuse vars from artifact_storage.py --- .../dataset_builders/artifact_storage.py | 9 +- .../integrations/huggingface/client.py | 92 ++++++++++-------- .../integrations/huggingface/test_client.py | 97 ++++++++++--------- 3 files changed, 111 insertions(+), 87 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py index 0d22bb89..e2daaca6 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py @@ -25,6 +25,9 @@ BATCH_FILE_NAME_FORMAT = "batch_{batch_number:05d}.parquet" SDG_CONFIG_FILENAME = "sdg.json" +METADATA_FILENAME = "metadata.json" +FINAL_DATASET_FOLDER_NAME = "parquet-files" +PROCESSORS_OUTPUTS_FOLDER_NAME = "processors-files" class BatchStage(StrEnum): @@ -37,10 +40,10 @@ class BatchStage(StrEnum): class ArtifactStorage(BaseModel): artifact_path: Path | str dataset_name: str = "dataset" - final_dataset_folder_name: str = "parquet-files" + final_dataset_folder_name: str = FINAL_DATASET_FOLDER_NAME partial_results_folder_name: str = "tmp-partial-parquet-files" 
dropped_columns_folder_name: str = "dropped-columns-parquet-files" - processors_outputs_folder_name: str = "processors-files" + processors_outputs_folder_name: str = PROCESSORS_OUTPUTS_FOLDER_NAME @property def artifact_path_exists(self) -> bool: @@ -72,7 +75,7 @@ def final_dataset_path(self) -> Path: @property def metadata_file_path(self) -> Path: - return self.base_dataset_path / "metadata.json" + return self.base_dataset_path / METADATA_FILENAME @property def partial_results_path(self) -> Path: diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 949a390b..df8e26e7 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -12,6 +12,12 @@ from huggingface_hub import HfApi from huggingface_hub.utils import HfHubHTTPError +from data_designer.engine.dataset_builders.artifact_storage import ( + FINAL_DATASET_FOLDER_NAME, + METADATA_FILENAME, + PROCESSORS_OUTPUTS_FOLDER_NAME, + SDG_CONFIG_FILENAME, +) from data_designer.errors import DataDesignerError from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard from data_designer.logging import RandomEmoji @@ -76,20 +82,30 @@ def upload_dataset( """ logger.info(f"🤗 Uploading dataset to Hugging Face Hub: {repo_id}") - self._validate_repo_id(repo_id) - self._validate_dataset_path(base_dataset_path) - - self._create_or_get_repo(repo_id, private=private) + self._validate_repo_id(repo_id=repo_id) + self._validate_dataset_path(base_dataset_path=base_dataset_path) + self._create_or_get_repo(repo_id=repo_id, private=private) logger.info(f" |-- {RandomEmoji.data()} Uploading dataset card...") try: - self._upload_dataset_card(repo_id, base_dataset_path, description) + self._upload_dataset_card( + repo_id=repo_id, + metadata_path=base_dataset_path / METADATA_FILENAME, + sdg_path=base_dataset_path / SDG_CONFIG_FILENAME, + description=description, + ) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e - self._upload_main_dataset_files(repo_id, base_dataset_path) - self._upload_processor_files(repo_id, base_dataset_path) - self._upload_config_files(repo_id, base_dataset_path) + self._upload_main_dataset_files(repo_id=repo_id, parquet_folder=base_dataset_path / FINAL_DATASET_FOLDER_NAME) + self._upload_processor_files( + repo_id=repo_id, processors_folder=base_dataset_path / PROCESSORS_OUTPUTS_FOLDER_NAME + ) + self._upload_config_files( + repo_id=repo_id, + metadata_path=base_dataset_path / METADATA_FILENAME, + sdg_path=base_dataset_path / SDG_CONFIG_FILENAME, + ) url = f"https://huggingface.co/datasets/{repo_id}" logger.info(f" |-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") @@ -136,18 +152,17 @@ def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: except Exception as e: raise HuggingFaceUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e - def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path) -> None: + def _upload_main_dataset_files(self, repo_id: str, parquet_folder: Path) -> None: """Upload main parquet dataset files. 
Args: repo_id: Hugging Face dataset repo ID - base_dataset_path: Path to dataset directory + parquet_folder: Path to folder containing parquet files Raises: HuggingFaceUploadError: If upload fails """ logger.info(f" |-- {RandomEmoji.loading()} Uploading main dataset files...") - parquet_folder = base_dataset_path / "parquet-files" try: self._api.upload_folder( repo_id=repo_id, @@ -159,17 +174,16 @@ def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path) -> N except Exception as e: raise HuggingFaceUploadError(f"Failed to upload parquet files: {e}") from e - def _upload_processor_files(self, repo_id: str, base_dataset_path: Path) -> None: + def _upload_processor_files(self, repo_id: str, processors_folder: Path) -> None: """Upload processor output files. Args: repo_id: Hugging Face dataset repo ID - base_dataset_path: Path to dataset directory + processors_folder: Path to folder containing processor output directories Raises: HuggingFaceUploadError: If upload fails """ - processors_folder = base_dataset_path / "processors-files" if not processors_folder.exists(): return @@ -192,32 +206,31 @@ def _upload_processor_files(self, repo_id: str, base_dataset_path: Path) -> None f"Failed to upload processor outputs for '{processor_dir.name}': {e}" ) from e - def _upload_config_files(self, repo_id: str, base_dataset_path: Path) -> None: + def _upload_config_files(self, repo_id: str, metadata_path: Path, sdg_path: Path) -> None: """Upload configuration files (sdg.json and metadata.json). Args: repo_id: Hugging Face dataset repo ID - base_dataset_path: Path to dataset directory + metadata_path: Path to metadata.json file + sdg_path: Path to sdg.json file Raises: HuggingFaceUploadError: If upload fails """ logger.info(f" |-- {RandomEmoji.loading()} Uploading configuration files...") - sdg_path = base_dataset_path / "sdg.json" if sdg_path.exists(): try: self._api.upload_file( repo_id=repo_id, path_or_fileobj=str(sdg_path), - path_in_repo="sdg.json", + path_in_repo=SDG_CONFIG_FILENAME, repo_type="dataset", commit_message="Upload sdg.json", ) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload sdg.json: {e}") from e - metadata_path = base_dataset_path / "metadata.json" if metadata_path.exists(): tmp_path = None try: @@ -229,37 +242,36 @@ def _upload_config_files(self, repo_id: str, base_dataset_path: Path) -> None: self._api.upload_file( repo_id=repo_id, path_or_fileobj=tmp_path, - path_in_repo="metadata.json", + path_in_repo=METADATA_FILENAME, repo_type="dataset", - commit_message="Upload metadata.json", + commit_message=f"Upload {METADATA_FILENAME}", ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload metadata.json: {e}") from e + raise HuggingFaceUploadError(f"Failed to upload {METADATA_FILENAME}: {e}") from e finally: if tmp_path and Path(tmp_path).exists(): Path(tmp_path).unlink() - def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, description: str) -> None: + def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path, description: str) -> None: """Generate and upload dataset card from metadata.json. 
Args: repo_id: Hugging Face dataset repo ID - base_dataset_path: Path to dataset artifacts + metadata_path: Path to metadata.json file + sdg_path: Path to sdg.json file description: Custom description text for dataset card Raises: HuggingFaceUploadError: If dataset card generation or upload fails """ - metadata_path = base_dataset_path / "metadata.json" try: with open(metadata_path) as f: metadata = json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Failed to parse metadata.json: {e}") from e + raise HuggingFaceUploadError(f"Failed to parse {METADATA_FILENAME}: {e}") from e except Exception as e: - raise HuggingFaceUploadError(f"Failed to read metadata.json: {e}") from e + raise HuggingFaceUploadError(f"Failed to read {METADATA_FILENAME}: {e}") from e - sdg_path = base_dataset_path / "sdg.json" sdg_config = None if sdg_path.exists(): try: @@ -327,16 +339,20 @@ def _update_metadata_paths(metadata_path: Path) -> dict: if "file_paths" in metadata: updated_file_paths = {} - if "parquet-files" in metadata["file_paths"]: + # Update parquet files path: parquet-files/ → data/ + if FINAL_DATASET_FOLDER_NAME in metadata["file_paths"]: updated_file_paths["data"] = [ - path.replace("parquet-files/", "data/") for path in metadata["file_paths"]["parquet-files"] + path.replace(f"{FINAL_DATASET_FOLDER_NAME}/", "data/") + for path in metadata["file_paths"][FINAL_DATASET_FOLDER_NAME] ] + # Update processor files paths: processors-files/{name}/ → {name}/ if "processor-files" in metadata["file_paths"]: updated_file_paths["processor-files"] = {} for processor_name, paths in metadata["file_paths"]["processor-files"].items(): updated_file_paths["processor-files"][processor_name] = [ - path.replace(f"processors-files/{processor_name}/", f"{processor_name}/") for path in paths + path.replace(f"{PROCESSORS_OUTPUTS_FOLDER_NAME}/{processor_name}/", f"{processor_name}/") + for path in paths ] metadata["file_paths"] = updated_file_paths @@ -359,14 +375,14 @@ def _validate_dataset_path(base_dataset_path: Path) -> None: if not base_dataset_path.is_dir(): raise HuggingFaceUploadError(f"Dataset path is not a directory: {base_dataset_path}") - metadata_path = base_dataset_path / "metadata.json" + metadata_path = base_dataset_path / METADATA_FILENAME if not metadata_path.exists(): raise HuggingFaceUploadError(f"Required file not found: {metadata_path}") if not metadata_path.is_file(): - raise HuggingFaceUploadError(f"metadata.json is not a file: {metadata_path}") + raise HuggingFaceUploadError(f"{METADATA_FILENAME} is not a file: {metadata_path}") - parquet_dir = base_dataset_path / "parquet-files" + parquet_dir = base_dataset_path / FINAL_DATASET_FOLDER_NAME if not parquet_dir.exists(): raise HuggingFaceUploadError( f"Required directory not found: {parquet_dir}. 
" @@ -385,14 +401,14 @@ def _validate_dataset_path(base_dataset_path: Path) -> None: with open(metadata_path) as f: json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Invalid JSON in metadata.json: {e}") + raise HuggingFaceUploadError(f"Invalid JSON in {METADATA_FILENAME}: {e}") - sdg_path = base_dataset_path / "sdg.json" + sdg_path = base_dataset_path / SDG_CONFIG_FILENAME if sdg_path.exists(): if not sdg_path.is_file(): - raise HuggingFaceUploadError(f"sdg.json is not a file: {sdg_path}") + raise HuggingFaceUploadError(f"{SDG_CONFIG_FILENAME} is not a file: {sdg_path}") try: with open(sdg_path) as f: json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Invalid JSON in sdg.json: {e}") + raise HuggingFaceUploadError(f"Invalid JSON in {SDG_CONFIG_FILENAME}: {e}") diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 6ba24647..0c87ffa6 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -235,26 +235,34 @@ def test_upload_dataset_with_private_repo( def test_upload_dataset_card_missing_metadata(tmp_path: Path) -> None: - """Test _upload_dataset_card raises error when metadata.json is missing.""" + """Test upload fails when metadata.json is missing.""" client = HuggingFaceHubClient(token="test-token") # Create directory without metadata.json base_path = tmp_path / "dataset" base_path.mkdir() - with pytest.raises(HuggingFaceUploadError, match="Failed to read metadata.json"): - client._upload_dataset_card("test/dataset", base_path, "Test description") + with pytest.raises(HuggingFaceUploadError, match="Required file not found"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + description="Test description", + ) -def test_upload_dataset_card_calls_push_to_hub(sample_dataset_path: Path) -> None: - """Test _upload_dataset_card generates card and pushes to hub.""" +def test_upload_dataset_card_calls_push_to_hub(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset generates and pushes dataset card.""" client = HuggingFaceHubClient(token="test-token") with patch("data_designer.integrations.huggingface.client.DataDesignerDatasetCard") as mock_card_class: mock_card = MagicMock() mock_card_class.from_metadata.return_value = mock_card - client._upload_dataset_card("test/dataset", sample_dataset_path, "Test description") + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test description", + ) # Verify card was created and pushed mock_card_class.from_metadata.assert_called_once() @@ -351,81 +359,69 @@ def test_upload_dataset_multiple_processors( # Error handling and validation tests -def test_validate_repo_id_invalid_format() -> None: - """Test repo_id validation with invalid formats.""" +def test_validate_repo_id_invalid_format(sample_dataset_path: Path) -> None: + """Test upload fails with invalid repo_id formats.""" client = HuggingFaceHubClient(token="test-token") # Missing slash with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): - client._validate_repo_id("my-dataset") + client.upload_dataset("my-dataset", sample_dataset_path, "Test") # Too many slashes (caught by regex) with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): - client._validate_repo_id("user/org/dataset") 
+ client.upload_dataset("user/org/dataset", sample_dataset_path, "Test") # Invalid characters (space) with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): - client._validate_repo_id("user/my dataset") + client.upload_dataset("user/my dataset", sample_dataset_path, "Test") # Empty string with pytest.raises(HuggingFaceUploadError, match="must be a non-empty string"): - client._validate_repo_id("") - - -def test_validate_repo_id_valid_formats() -> None: - """Test repo_id validation with valid formats.""" - client = HuggingFaceHubClient(token="test-token") - - # Valid formats should not raise - client._validate_repo_id("username/dataset") - client._validate_repo_id("org/my-dataset") - client._validate_repo_id("user/dataset_name") - client._validate_repo_id("user123/dataset-123") - client._validate_repo_id("user/dataset.v2") + client.upload_dataset("", sample_dataset_path, "Test") def test_validate_dataset_path_not_exists(tmp_path: Path) -> None: - """Test validation fails when dataset path doesn't exist.""" + """Test upload fails when dataset path doesn't exist.""" client = HuggingFaceHubClient(token="test-token") non_existent = tmp_path / "does-not-exist" with pytest.raises(HuggingFaceUploadError, match="does not exist"): - client._validate_dataset_path(non_existent) + client.upload_dataset("test/dataset", non_existent, "Test") def test_validate_dataset_path_is_file(tmp_path: Path) -> None: - """Test validation fails when dataset path is a file.""" + """Test upload fails when dataset path is a file.""" client = HuggingFaceHubClient(token="test-token") file_path = tmp_path / "file.txt" file_path.write_text("not a directory") with pytest.raises(HuggingFaceUploadError, match="not a directory"): - client._validate_dataset_path(file_path) + client.upload_dataset("test/dataset", file_path, "Test") def test_validate_dataset_path_missing_metadata(tmp_path: Path) -> None: - """Test validation fails when metadata.json is missing.""" + """Test upload fails when metadata.json is missing.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() - with pytest.raises(HuggingFaceUploadError, match="Required file not found.*metadata.json"): - client._validate_dataset_path(base_path) + with pytest.raises(HuggingFaceUploadError, match="Required file not found"): + client.upload_dataset("test/dataset", base_path, "Test") def test_validate_dataset_path_missing_parquet_folder(tmp_path: Path) -> None: - """Test validation fails when parquet-files directory is missing.""" + """Test upload fails when parquet-files directory is missing.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() (base_path / "metadata.json").write_text('{"target_num_records": 10}') - with pytest.raises(HuggingFaceUploadError, match="Required directory not found.*parquet-files"): - client._validate_dataset_path(base_path) + with pytest.raises(HuggingFaceUploadError, match="Required directory not found"): + client.upload_dataset("test/dataset", base_path, "Test") def test_validate_dataset_path_empty_parquet_folder(tmp_path: Path) -> None: - """Test validation fails when parquet-files directory is empty.""" + """Test upload fails when parquet-files directory is empty.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() @@ -433,12 +429,12 @@ def test_validate_dataset_path_empty_parquet_folder(tmp_path: Path) -> None: parquet_dir = base_path / "parquet-files" 
parquet_dir.mkdir() - with pytest.raises(HuggingFaceUploadError, match="parquet-files directory is empty"): - client._validate_dataset_path(base_path) + with pytest.raises(HuggingFaceUploadError, match="directory is empty"): + client.upload_dataset("test/dataset", base_path, "Test") def test_validate_dataset_path_invalid_metadata_json(tmp_path: Path) -> None: - """Test validation fails when metadata.json contains invalid JSON.""" + """Test upload fails when metadata.json contains invalid JSON.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() @@ -447,12 +443,12 @@ def test_validate_dataset_path_invalid_metadata_json(tmp_path: Path) -> None: parquet_dir.mkdir() (parquet_dir / "batch_00000.parquet").write_text("data") - with pytest.raises(HuggingFaceUploadError, match="Invalid JSON in metadata.json"): - client._validate_dataset_path(base_path) + with pytest.raises(HuggingFaceUploadError, match="Invalid JSON"): + client.upload_dataset("test/dataset", base_path, "Test") def test_validate_dataset_path_invalid_sdg_json(tmp_path: Path) -> None: - """Test validation fails when sdg.json contains invalid JSON.""" + """Test upload fails when sdg.json contains invalid JSON.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() @@ -462,8 +458,8 @@ def test_validate_dataset_path_invalid_sdg_json(tmp_path: Path) -> None: parquet_dir.mkdir() (parquet_dir / "batch_00000.parquet").write_text("data") - with pytest.raises(HuggingFaceUploadError, match="Invalid JSON in sdg.json"): - client._validate_dataset_path(base_path) + with pytest.raises(HuggingFaceUploadError, match="Invalid JSON"): + client.upload_dataset("test/dataset", base_path, "Test") def test_upload_dataset_invalid_repo_id(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: @@ -513,14 +509,23 @@ def test_upload_dataset_permission_error(mock_hf_api: MagicMock, sample_dataset_ def test_upload_dataset_card_invalid_json(tmp_path: Path) -> None: - """Test _upload_dataset_card handles corrupted metadata.json.""" + """Test upload fails when metadata.json contains invalid JSON.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() (base_path / "metadata.json").write_text("invalid json") - with pytest.raises(HuggingFaceUploadError, match="Failed to parse metadata.json"): - client._upload_dataset_card("test/dataset", base_path, "Test description") + # Create parquet directory so validation reaches the metadata JSON check + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("data") + + with pytest.raises(HuggingFaceUploadError, match="Invalid JSON"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + description="Test description", + ) def test_update_metadata_paths(tmp_path: Path) -> None: From 3270332e9554358a2ea92ce833ed956f44d4971b Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 2 Feb 2026 11:40:52 -0700 Subject: [PATCH 18/25] pull put hf hub datasets url to constants --- .../src/data_designer/config/utils/constants.py | 2 ++ .../src/data_designer/integrations/huggingface/client.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/constants.py b/packages/data-designer-config/src/data_designer/config/utils/constants.py index 1a838f47..5c4cd38d 100644 --- 
a/packages/data-designer-config/src/data_designer/config/utils/constants.py +++ b/packages/data-designer-config/src/data_designer/config/utils/constants.py @@ -363,3 +363,5 @@ class NordColor(Enum): LOCALES_WITH_MANAGED_DATASETS = list[str](NEMOTRON_PERSONAS_DATASET_SIZES.keys()) NEMOTRON_PERSONAS_DATASET_PREFIX = "nemotron-personas-dataset-" + +HUGGINGFACE_HUB_DATASET_URL_PREFIX = "https://huggingface.co/datasets/" diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index df8e26e7..2afc27b7 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -12,6 +12,7 @@ from huggingface_hub import HfApi from huggingface_hub.utils import HfHubHTTPError +from data_designer.config.utils.constants import HUGGINGFACE_HUB_DATASET_URL_PREFIX from data_designer.engine.dataset_builders.artifact_storage import ( FINAL_DATASET_FOLDER_NAME, METADATA_FILENAME, @@ -107,7 +108,7 @@ def upload_dataset( sdg_path=base_dataset_path / SDG_CONFIG_FILENAME, ) - url = f"https://huggingface.co/datasets/{repo_id}" + url = f"{HUGGINGFACE_HUB_DATASET_URL_PREFIX}{repo_id}" logger.info(f" |-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") return url From ead52f5d871dbf0b1354bb9dd8b5e3355b1c9c9e Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 3 Feb 2026 09:08:28 -0700 Subject: [PATCH 19/25] HuggingfaceUploadError -> HuggingFaceHubClientUploadError --- .../integrations/huggingface/__init__.py | 4 +- .../integrations/huggingface/client.py | 56 +++++++++---------- .../integrations/huggingface/test_client.py | 34 +++++------ 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py index bbdaddff..9db42156 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceUploadError +from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceHubClientUploadError from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard -__all__ = ["HuggingFaceHubClient", "HuggingFaceUploadError", "DataDesignerDatasetCard"] +__all__ = ["HuggingFaceHubClient", "HuggingFaceHubClientUploadError", "DataDesignerDatasetCard"] diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 2afc27b7..8442597f 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) -class HuggingFaceUploadError(DataDesignerError): +class HuggingFaceHubClientUploadError(DataDesignerError): """Error during Hugging Face dataset upload.""" @@ -96,7 +96,7 @@ def upload_dataset( description=description, ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to upload dataset card: {e}") from e self._upload_main_dataset_files(repo_id=repo_id, parquet_folder=base_dataset_path / FINAL_DATASET_FOLDER_NAME) self._upload_processor_files( @@ -138,20 +138,20 @@ def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: ) except HfHubHTTPError as e: if e.response.status_code == 401: - raise HuggingFaceUploadError( + raise HuggingFaceHubClientUploadError( "Authentication failed. Please provide a valid Hugging Face token. " "You can set it via the token parameter or HF_TOKEN environment variable, " "or run 'huggingface-cli login'." ) from e elif e.response.status_code == 403: - raise HuggingFaceUploadError( + raise HuggingFaceHubClientUploadError( f"Permission denied. You don't have access to create repository '{repo_id}'. " "Check your token permissions or repository ownership." ) from e else: - raise HuggingFaceUploadError(f"Failed to create repository '{repo_id}': {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to create repository '{repo_id}': {e}") from e except Exception as e: - raise HuggingFaceUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e + raise HuggingFaceHubClientUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e def _upload_main_dataset_files(self, repo_id: str, parquet_folder: Path) -> None: """Upload main parquet dataset files. @@ -173,7 +173,7 @@ def _upload_main_dataset_files(self, repo_id: str, parquet_folder: Path) -> None commit_message="Upload main dataset files", ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload parquet files: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to upload parquet files: {e}") from e def _upload_processor_files(self, repo_id: str, processors_folder: Path) -> None: """Upload processor output files. 
@@ -203,7 +203,7 @@ def _upload_processor_files(self, repo_id: str, processors_folder: Path) -> None commit_message=f"Upload {processor_dir.name} processor outputs", ) except Exception as e: - raise HuggingFaceUploadError( + raise HuggingFaceHubClientUploadError( f"Failed to upload processor outputs for '{processor_dir.name}': {e}" ) from e @@ -230,7 +230,7 @@ def _upload_config_files(self, repo_id: str, metadata_path: Path, sdg_path: Path commit_message="Upload sdg.json", ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload sdg.json: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to upload sdg.json: {e}") from e if metadata_path.exists(): tmp_path = None @@ -248,7 +248,7 @@ def _upload_config_files(self, repo_id: str, metadata_path: Path, sdg_path: Path commit_message=f"Upload {METADATA_FILENAME}", ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload {METADATA_FILENAME}: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to upload {METADATA_FILENAME}: {e}") from e finally: if tmp_path and Path(tmp_path).exists(): Path(tmp_path).unlink() @@ -269,9 +269,9 @@ def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path with open(metadata_path) as f: metadata = json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Failed to parse {METADATA_FILENAME}: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to parse {METADATA_FILENAME}: {e}") from e except Exception as e: - raise HuggingFaceUploadError(f"Failed to read {METADATA_FILENAME}: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to read {METADATA_FILENAME}: {e}") from e sdg_config = None if sdg_path.exists(): @@ -279,9 +279,9 @@ def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path with open(sdg_path) as f: sdg_config = json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Failed to parse sdg.json: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to parse sdg.json: {e}") from e except Exception as e: - raise HuggingFaceUploadError(f"Failed to read sdg.json: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to read sdg.json: {e}") from e try: card = DataDesignerDatasetCard.from_metadata( @@ -291,12 +291,12 @@ def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path description=description, ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to generate dataset card: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to generate dataset card: {e}") from e try: card.push_to_hub(repo_id, repo_type="dataset") except Exception as e: - raise HuggingFaceUploadError(f"Failed to push dataset card to hub: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to push dataset card to hub: {e}") from e @staticmethod def _validate_repo_id(repo_id: str) -> None: @@ -309,12 +309,12 @@ def _validate_repo_id(repo_id: str) -> None: HuggingFaceUploadError: If repo_id format is invalid """ if not repo_id or not isinstance(repo_id, str): - raise HuggingFaceUploadError("repo_id must be a non-empty string") + raise HuggingFaceHubClientUploadError("repo_id must be a non-empty string") pattern = r"^[a-zA-Z0-9][-a-zA-Z0-9._]*/[a-zA-Z0-9][-a-zA-Z0-9._]*$" if not re.match(pattern, repo_id): - raise HuggingFaceUploadError( + raise HuggingFaceHubClientUploadError( f"Invalid repo_id format: '{repo_id}'. " "Expected format: 'username/dataset-name' or 'organization/dataset-name'. 
" "Names can contain alphanumeric characters, dashes, underscores, and dots." @@ -371,30 +371,30 @@ def _validate_dataset_path(base_dataset_path: Path) -> None: HuggingFaceUploadError: If directory structure is invalid """ if not base_dataset_path.exists(): - raise HuggingFaceUploadError(f"Dataset path does not exist: {base_dataset_path}") + raise HuggingFaceHubClientUploadError(f"Dataset path does not exist: {base_dataset_path}") if not base_dataset_path.is_dir(): - raise HuggingFaceUploadError(f"Dataset path is not a directory: {base_dataset_path}") + raise HuggingFaceHubClientUploadError(f"Dataset path is not a directory: {base_dataset_path}") metadata_path = base_dataset_path / METADATA_FILENAME if not metadata_path.exists(): - raise HuggingFaceUploadError(f"Required file not found: {metadata_path}") + raise HuggingFaceHubClientUploadError(f"Required file not found: {metadata_path}") if not metadata_path.is_file(): - raise HuggingFaceUploadError(f"{METADATA_FILENAME} is not a file: {metadata_path}") + raise HuggingFaceHubClientUploadError(f"{METADATA_FILENAME} is not a file: {metadata_path}") parquet_dir = base_dataset_path / FINAL_DATASET_FOLDER_NAME if not parquet_dir.exists(): - raise HuggingFaceUploadError( + raise HuggingFaceHubClientUploadError( f"Required directory not found: {parquet_dir}. " "Dataset must contain parquet-files directory with batch files." ) if not parquet_dir.is_dir(): - raise HuggingFaceUploadError(f"parquet-files is not a directory: {parquet_dir}") + raise HuggingFaceHubClientUploadError(f"parquet-files is not a directory: {parquet_dir}") if not any(parquet_dir.glob("*.parquet")): - raise HuggingFaceUploadError( + raise HuggingFaceHubClientUploadError( f"parquet-files directory is empty: {parquet_dir}. At least one .parquet file is required." 
) @@ -402,14 +402,14 @@ def _validate_dataset_path(base_dataset_path: Path) -> None: with open(metadata_path) as f: json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Invalid JSON in {METADATA_FILENAME}: {e}") + raise HuggingFaceHubClientUploadError(f"Invalid JSON in {METADATA_FILENAME}: {e}") sdg_path = base_dataset_path / SDG_CONFIG_FILENAME if sdg_path.exists(): if not sdg_path.is_file(): - raise HuggingFaceUploadError(f"{SDG_CONFIG_FILENAME} is not a file: {sdg_path}") + raise HuggingFaceHubClientUploadError(f"{SDG_CONFIG_FILENAME} is not a file: {sdg_path}") try: with open(sdg_path) as f: json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Invalid JSON in {SDG_CONFIG_FILENAME}: {e}") + raise HuggingFaceHubClientUploadError(f"Invalid JSON in {SDG_CONFIG_FILENAME}: {e}") diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 0c87ffa6..75d25f6c 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -10,7 +10,7 @@ import pytest from huggingface_hub.utils import HfHubHTTPError -from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceUploadError +from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceHubClientUploadError @pytest.fixture @@ -242,7 +242,7 @@ def test_upload_dataset_card_missing_metadata(tmp_path: Path) -> None: base_path = tmp_path / "dataset" base_path.mkdir() - with pytest.raises(HuggingFaceUploadError, match="Required file not found"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Required file not found"): client.upload_dataset( repo_id="test/dataset", base_dataset_path=base_path, @@ -364,19 +364,19 @@ def test_validate_repo_id_invalid_format(sample_dataset_path: Path) -> None: client = HuggingFaceHubClient(token="test-token") # Missing slash - with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid repo_id format"): client.upload_dataset("my-dataset", sample_dataset_path, "Test") # Too many slashes (caught by regex) - with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid repo_id format"): client.upload_dataset("user/org/dataset", sample_dataset_path, "Test") # Invalid characters (space) - with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid repo_id format"): client.upload_dataset("user/my dataset", sample_dataset_path, "Test") # Empty string - with pytest.raises(HuggingFaceUploadError, match="must be a non-empty string"): + with pytest.raises(HuggingFaceHubClientUploadError, match="must be a non-empty string"): client.upload_dataset("", sample_dataset_path, "Test") @@ -385,7 +385,7 @@ def test_validate_dataset_path_not_exists(tmp_path: Path) -> None: client = HuggingFaceHubClient(token="test-token") non_existent = tmp_path / "does-not-exist" - with pytest.raises(HuggingFaceUploadError, match="does not exist"): + with pytest.raises(HuggingFaceHubClientUploadError, match="does not exist"): client.upload_dataset("test/dataset", non_existent, "Test") @@ -395,7 +395,7 @@ def test_validate_dataset_path_is_file(tmp_path: Path) -> None: file_path = tmp_path / 
"file.txt" file_path.write_text("not a directory") - with pytest.raises(HuggingFaceUploadError, match="not a directory"): + with pytest.raises(HuggingFaceHubClientUploadError, match="not a directory"): client.upload_dataset("test/dataset", file_path, "Test") @@ -405,7 +405,7 @@ def test_validate_dataset_path_missing_metadata(tmp_path: Path) -> None: base_path = tmp_path / "dataset" base_path.mkdir() - with pytest.raises(HuggingFaceUploadError, match="Required file not found"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Required file not found"): client.upload_dataset("test/dataset", base_path, "Test") @@ -416,7 +416,7 @@ def test_validate_dataset_path_missing_parquet_folder(tmp_path: Path) -> None: base_path.mkdir() (base_path / "metadata.json").write_text('{"target_num_records": 10}') - with pytest.raises(HuggingFaceUploadError, match="Required directory not found"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Required directory not found"): client.upload_dataset("test/dataset", base_path, "Test") @@ -429,7 +429,7 @@ def test_validate_dataset_path_empty_parquet_folder(tmp_path: Path) -> None: parquet_dir = base_path / "parquet-files" parquet_dir.mkdir() - with pytest.raises(HuggingFaceUploadError, match="directory is empty"): + with pytest.raises(HuggingFaceHubClientUploadError, match="directory is empty"): client.upload_dataset("test/dataset", base_path, "Test") @@ -443,7 +443,7 @@ def test_validate_dataset_path_invalid_metadata_json(tmp_path: Path) -> None: parquet_dir.mkdir() (parquet_dir / "batch_00000.parquet").write_text("data") - with pytest.raises(HuggingFaceUploadError, match="Invalid JSON"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid JSON"): client.upload_dataset("test/dataset", base_path, "Test") @@ -458,7 +458,7 @@ def test_validate_dataset_path_invalid_sdg_json(tmp_path: Path) -> None: parquet_dir.mkdir() (parquet_dir / "batch_00000.parquet").write_text("data") - with pytest.raises(HuggingFaceUploadError, match="Invalid JSON"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid JSON"): client.upload_dataset("test/dataset", base_path, "Test") @@ -466,7 +466,7 @@ def test_upload_dataset_invalid_repo_id(mock_hf_api: MagicMock, sample_dataset_p """Test upload_dataset fails with invalid repo_id.""" client = HuggingFaceHubClient(token="test-token") - with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid repo_id format"): client.upload_dataset( repo_id="invalid-repo-id", # Missing slash base_dataset_path=sample_dataset_path, @@ -483,7 +483,7 @@ def test_upload_dataset_authentication_error(mock_hf_api: MagicMock, sample_data error_response.status_code = 401 mock_hf_api.create_repo.side_effect = HfHubHTTPError("Unauthorized", response=error_response) - with pytest.raises(HuggingFaceUploadError, match="Authentication failed"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Authentication failed"): client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, @@ -500,7 +500,7 @@ def test_upload_dataset_permission_error(mock_hf_api: MagicMock, sample_dataset_ error_response.status_code = 403 mock_hf_api.create_repo.side_effect = HfHubHTTPError("Forbidden", response=error_response) - with pytest.raises(HuggingFaceUploadError, match="Permission denied"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Permission denied"): client.upload_dataset( repo_id="test/dataset", 
base_dataset_path=sample_dataset_path, @@ -520,7 +520,7 @@ def test_upload_dataset_card_invalid_json(tmp_path: Path) -> None: parquet_dir.mkdir() (parquet_dir / "batch_00000.parquet").write_text("data") - with pytest.raises(HuggingFaceUploadError, match="Invalid JSON"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid JSON"): client.upload_dataset( repo_id="test/dataset", base_dataset_path=base_path, From bc90dcb6895b925968b646b99063637f7efbff12 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 3 Feb 2026 11:01:06 -0700 Subject: [PATCH 20/25] defer to hfhub repo validation --- .../integrations/huggingface/client.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 8442597f..fe789785 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -5,12 +5,12 @@ import json import logging -import re import tempfile from pathlib import Path from huggingface_hub import HfApi -from huggingface_hub.utils import HfHubHTTPError +from huggingface_hub.errors import HFValidationError +from huggingface_hub.utils import HfHubHTTPError, validate_repo_id from data_designer.config.utils.constants import HUGGINGFACE_HUB_DATASET_URL_PREFIX from data_designer.engine.dataset_builders.artifact_storage import ( @@ -306,20 +306,24 @@ def _validate_repo_id(repo_id: str) -> None: repo_id: Repository ID to validate Raises: - HuggingFaceUploadError: If repo_id format is invalid + HuggingFaceHubClientUploadError: If repo_id format is invalid """ - if not repo_id or not isinstance(repo_id, str): + # Check if repo_id is empty + if not repo_id or not repo_id.strip(): raise HuggingFaceHubClientUploadError("repo_id must be a non-empty string") - pattern = r"^[a-zA-Z0-9][-a-zA-Z0-9._]*/[a-zA-Z0-9][-a-zA-Z0-9._]*$" - - if not re.match(pattern, repo_id): + # Check for exactly one slash (username/dataset-name format). This is not enforced by huggingface_hub's validator. + if repo_id.count("/") != 1: raise HuggingFaceHubClientUploadError( - f"Invalid repo_id format: '{repo_id}'. " - "Expected format: 'username/dataset-name' or 'organization/dataset-name'. " - "Names can contain alphanumeric characters, dashes, underscores, and dots." + f"Invalid repo_id format: '{repo_id}'. Expected format: 'username/dataset-name'" ) + # Use huggingface_hub's validator for additional checks (characters, length, etc.) + try: + validate_repo_id(repo_id) + except HFValidationError as e: + raise HuggingFaceHubClientUploadError(f"Invalid repo_id format: '{repo_id}': {e}") from e + @staticmethod def _update_metadata_paths(metadata_path: Path) -> dict: """Update file paths in metadata.json to match Hugging Face dataset repository structure. 
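(Illustrative aside, not part of the patch series: a minimal sketch of how the two-step repo_id validation introduced in the patch above behaves, assuming huggingface_hub is installed. The helper name check_repo_id is hypothetical; in the actual patch the logic lives in HuggingFaceHubClient._validate_repo_id and raises HuggingFaceHubClientUploadError rather than returning strings.)

    # Sketch of the validation order used by the patch: an explicit emptiness
    # and single-slash check first, then deferral to huggingface_hub's validator.
    from huggingface_hub.errors import HFValidationError
    from huggingface_hub.utils import validate_repo_id

    def check_repo_id(repo_id: str) -> str:
        if not repo_id or not repo_id.strip():
            return "rejected: repo_id must be a non-empty string"
        # The patch enforces exactly one slash because validate_repo_id also
        # accepts bare names such as "my-dataset".
        if repo_id.count("/") != 1:
            return "rejected: expected 'username/dataset-name'"
        # Character and length rules are deferred to huggingface_hub.
        try:
            validate_repo_id(repo_id)
        except HFValidationError as e:
            return f"rejected by huggingface_hub: {e}"
        return "accepted"

    print(check_repo_id("user/my-dataset"))   # accepted
    print(check_repo_id("my-dataset"))        # rejected: expected 'username/dataset-name'
    print(check_repo_id("user/my dataset"))   # rejected by huggingface_hub (space not allowed)

This mirrors the test expectations in test_validate_repo_id_invalid_format: missing or extra slashes are caught locally, while invalid characters are caught by huggingface_hub's validator and re-raised as HuggingFaceHubClientUploadError.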
From 4f8c4a0ddaf0ba467203c17a83f0d3569c09e897 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 4 Feb 2026 09:55:19 -0700 Subject: [PATCH 21/25] Update packages/data-designer/src/data_designer/integrations/huggingface/client.py Co-authored-by: Daniel van Strien --- .../src/data_designer/integrations/huggingface/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index fe789785..5e69ce81 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -141,7 +141,7 @@ def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: raise HuggingFaceHubClientUploadError( "Authentication failed. Please provide a valid Hugging Face token. " "You can set it via the token parameter or HF_TOKEN environment variable, " - "or run 'huggingface-cli login'." + "or run 'hf auth login'." ) from e elif e.response.status_code == 403: raise HuggingFaceHubClientUploadError( From d2fd6413ed72f810712c58b8d5f8c882d56576bd Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 4 Feb 2026 09:55:52 -0700 Subject: [PATCH 22/25] Update packages/data-designer/src/data_designer/interface/results.py Co-authored-by: Daniel van Strien --- packages/data-designer/src/data_designer/interface/results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index a37a5483..de66849c 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -121,7 +121,7 @@ def push_to_hub( Appears after the title. token: HuggingFace API token. If None, the token is automatically resolved from HF_TOKEN environment variable or cached credentials - from `huggingface-cli login`. + from `hf auth login`. private: Create private repo Returns: From afbdac740e8d08165d35ad93c664e92208facc7e Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 4 Feb 2026 09:56:25 -0700 Subject: [PATCH 23/25] Update packages/data-designer/src/data_designer/integrations/huggingface/client.py Co-authored-by: Daniel van Strien --- .../src/data_designer/integrations/huggingface/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 5e69ce81..35723b04 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -39,7 +39,7 @@ def __init__(self, token: str | None = None): Args: token: Hugging Face API token. If None, the token is automatically resolved from HF_TOKEN environment variable or cached credentials - from `huggingface-cli login`. + from `hf auth login`. 
""" self._token = token self._api = HfApi(token=token) From e56c846c3af25a463a9d2ec04d8b88178b626b67 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 4 Feb 2026 11:18:34 -0700 Subject: [PATCH 24/25] allow custom tags --- .../integrations/huggingface/client.py | 9 +- .../integrations/huggingface/dataset_card.py | 11 +- .../src/data_designer/interface/results.py | 5 +- .../huggingface/test_dataset_card.py | 258 ++++++++++-------- 4 files changed, 165 insertions(+), 118 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 35723b04..7d8c54e4 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -60,6 +60,7 @@ def upload_dataset( description: str, *, private: bool = False, + tags: list[str] | None = None, ) -> str: """Upload dataset to Hugging Face Hub. @@ -74,6 +75,7 @@ def upload_dataset( base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) description: Custom description text for dataset card private: Whether to create private repo + tags: Additional custom tags for the dataset Returns: URL to the uploaded dataset @@ -94,6 +96,7 @@ def upload_dataset( metadata_path=base_dataset_path / METADATA_FILENAME, sdg_path=base_dataset_path / SDG_CONFIG_FILENAME, description=description, + tags=tags, ) except Exception as e: raise HuggingFaceHubClientUploadError(f"Failed to upload dataset card: {e}") from e @@ -253,7 +256,9 @@ def _upload_config_files(self, repo_id: str, metadata_path: Path, sdg_path: Path if tmp_path and Path(tmp_path).exists(): Path(tmp_path).unlink() - def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path, description: str) -> None: + def _upload_dataset_card( + self, repo_id: str, metadata_path: Path, sdg_path: Path, description: str, tags: list[str] | None = None + ) -> None: """Generate and upload dataset card from metadata.json. 
Args: @@ -261,6 +266,7 @@ def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path metadata_path: Path to metadata.json file sdg_path: Path to sdg.json file description: Custom description text for dataset card + tags: Additional custom tags for the dataset Raises: HuggingFaceUploadError: If dataset card generation or upload fails @@ -289,6 +295,7 @@ def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path sdg_config=sdg_config, repo_id=repo_id, description=description, + tags=tags, ) except Exception as e: raise HuggingFaceHubClientUploadError(f"Failed to generate dataset card: {e}") from e diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 04bc6324..ba5d2f0e 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -9,6 +9,7 @@ from huggingface_hub import CardData, DatasetCard TEMPLATE_DATA_DESIGNER_DATASET_CARD_PATH = Path(__file__).parent / "dataset_card_template.md" +DEFAULT_DATASET_CARD_TAGS = ["synthetic", "datadesigner"] class DataDesignerDatasetCard(DatasetCard): @@ -28,6 +29,7 @@ def from_metadata( sdg_config: dict | None, repo_id: str, description: str, + tags: list[str] | None = None, ) -> DataDesignerDatasetCard: """Create dataset card from metadata.json and sdg.json. @@ -36,6 +38,7 @@ def from_metadata( sdg_config: Contents of sdg.json (optional) repo_id: Hugging Face dataset repo ID description: Custom description text + tags: Additional custom tags for the dataset. Returns: DataDesignerDatasetCard instance ready to upload @@ -71,13 +74,13 @@ def from_metadata( if "file_paths" in metadata and "processor-files" in metadata["file_paths"]: processor_names = list(metadata["file_paths"]["processor-files"].keys()) - # Prepare tags - tags = ["synthetic", "datadesigner"] + # Prepare tags: default tags + custom tags + all_tags = DEFAULT_DATASET_CARD_TAGS + (tags or []) # Prepare CardData (metadata for YAML frontmatter) card_data = CardData( size_categories=size_categories, - tags=tags, + tags=all_tags, ) # Prepare template variables @@ -95,7 +98,7 @@ def from_metadata( "current_year": datetime.now().year, "has_processors": len(processor_names) > 0, "processor_names": processor_names, - "tags": tags, + "tags": all_tags, "custom_description": description, } diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index de66849c..5a071469 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -105,6 +105,7 @@ def push_to_hub( *, token: str | None = None, private: bool = False, + tags: list[str] | None = None, ) -> str: """Push dataset to HuggingFace Hub. @@ -123,6 +124,7 @@ def push_to_hub( resolved from HF_TOKEN environment variable or cached credentials from `hf auth login`. private: Create private repo + tags: Additional custom tags for the dataset. Returns: URL to the uploaded dataset @@ -130,7 +132,7 @@ def push_to_hub( Example: >>> results = data_designer.create(config, num_records=1000) >>> description = "This dataset contains synthetic conversations for training chatbots." 
- >>> results.push_to_hub("username/my-synthetic-dataset", description) + >>> results.push_to_hub("username/my-synthetic-dataset", description, tags=["chatbot", "conversation"]) 'https://huggingface.co/datasets/username/my-synthetic-dataset' """ client = HuggingFaceHubClient(token=token) @@ -139,4 +141,5 @@ def push_to_hub( base_dataset_path=self.artifact_storage.base_dataset_path, private=private, description=description, + tags=tags, ) diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py index aa573cd6..a6342b0f 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -3,24 +3,17 @@ from __future__ import annotations -from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard - +import pytest -def test_compute_size_category() -> None: - """Test size category computation for various dataset sizes.""" - assert DataDesignerDatasetCard._compute_size_category(500) == "n<1K" - assert DataDesignerDatasetCard._compute_size_category(5000) == "1K10M" +from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard -def test_from_metadata_minimal() -> None: - """Test creating dataset card from minimal metadata.""" - metadata = { +@pytest.fixture +def stub_metadata() -> dict: + """Stub metadata fixture with single column that can be used/modified by most tests.""" + return { "target_num_records": 100, - "schema": {"col1": "string", "col2": "int64"}, + "schema": {"col1": "string"}, "column_statistics": [ { "column_name": "col1", @@ -33,8 +26,24 @@ def test_from_metadata_minimal() -> None: ], } + +def test_compute_size_category() -> None: + """Test size category computation for various dataset sizes.""" + assert DataDesignerDatasetCard._compute_size_category(500) == "n<1K" + assert DataDesignerDatasetCard._compute_size_category(5000) == "1K10M" + + +def test_from_metadata_minimal(stub_metadata: dict) -> None: + """Test creating dataset card from minimal metadata.""" + # Add second column for this test + stub_metadata["schema"]["col2"] = "int64" + card = DataDesignerDatasetCard.from_metadata( - metadata=metadata, + metadata=stub_metadata, sdg_config=None, repo_id="test/dataset", description="Test dataset for unit testing.", @@ -48,32 +57,31 @@ def test_from_metadata_minimal() -> None: assert "2" in str(card) # Number of columns -def test_from_metadata_with_sdg_config() -> None: +def test_from_metadata_with_sdg_config(stub_metadata: dict) -> None: """Test creating dataset card with sdg config.""" - metadata = { - "target_num_records": 50, - "schema": {"name": "string", "age": "int64"}, - "column_statistics": [ - { - "column_name": "name", - "num_records": 50, - "num_unique": 50, - "num_null": 0, - "simple_dtype": "string", - "column_type": "sampler", - "sampler_type": "person", - }, - { - "column_name": "age", - "num_records": 50, - "num_unique": 30, - "num_null": 0, - "simple_dtype": "int64", - "column_type": "sampler", - "sampler_type": "uniform", - }, - ], - } + # Customize for this test + stub_metadata["target_num_records"] = 50 + stub_metadata["schema"] = {"name": "string", "age": "int64"} + stub_metadata["column_statistics"] = [ + { + "column_name": "name", + "num_records": 50, + "num_unique": 50, + "num_null": 0, + "simple_dtype": "string", + "column_type": "sampler", + "sampler_type": "person", + }, + { + "column_name": 
"age", + "num_records": 50, + "num_unique": 30, + "num_null": 0, + "simple_dtype": "int64", + "column_type": "sampler", + "sampler_type": "uniform", + }, + ] sdg_config = { "data_designer": { @@ -85,7 +93,7 @@ def test_from_metadata_with_sdg_config() -> None: } card = DataDesignerDatasetCard.from_metadata( - metadata=metadata, + metadata=stub_metadata, sdg_config=sdg_config, repo_id="test/dataset-with-config", description="Test dataset with SDG config.", @@ -97,27 +105,26 @@ def test_from_metadata_with_sdg_config() -> None: assert "2 column" in str(card) -def test_from_metadata_with_llm_columns() -> None: +def test_from_metadata_with_llm_columns(stub_metadata: dict) -> None: """Test creating dataset card with LLM column statistics.""" - metadata = { - "target_num_records": 10, - "schema": {"prompt": "string", "response": "string"}, - "column_statistics": [ - { - "column_name": "response", - "num_records": 10, - "num_unique": 10, - "num_null": 0, - "simple_dtype": "string", - "column_type": "llm-text", - "output_tokens_mean": 50.5, - "input_tokens_mean": 20.3, - } - ], - } + # Customize for LLM test + stub_metadata["target_num_records"] = 10 + stub_metadata["schema"] = {"prompt": "string", "response": "string"} + stub_metadata["column_statistics"] = [ + { + "column_name": "response", + "num_records": 10, + "num_unique": 10, + "num_null": 0, + "simple_dtype": "string", + "column_type": "llm-text", + "output_tokens_mean": 50.5, + "input_tokens_mean": 20.3, + } + ] card = DataDesignerDatasetCard.from_metadata( - metadata=metadata, + metadata=stub_metadata, sdg_config=None, repo_id="test/llm-dataset", description="Test dataset with LLM columns.", @@ -128,32 +135,19 @@ def test_from_metadata_with_llm_columns() -> None: assert "Tokens:" in str(card) and "out" in str(card) and "in" in str(card) -def test_from_metadata_with_processors() -> None: +def test_from_metadata_with_processors(stub_metadata: dict) -> None: """Test creating dataset card with processor outputs includes loading examples.""" - metadata = { - "target_num_records": 100, - "schema": {"col1": "string"}, - "file_paths": { - "parquet-files": ["parquet-files/batch_00000.parquet"], - "processor-files": { - "processor1": ["processors-files/processor1/batch_00000.parquet"], - "processor2": ["processors-files/processor2/batch_00000.parquet"], - }, + # Add processor files for this test + stub_metadata["file_paths"] = { + "parquet-files": ["parquet-files/batch_00000.parquet"], + "processor-files": { + "processor1": ["processors-files/processor1/batch_00000.parquet"], + "processor2": ["processors-files/processor2/batch_00000.parquet"], }, - "column_statistics": [ - { - "column_name": "col1", - "num_records": 100, - "num_unique": 100, - "num_null": 0, - "simple_dtype": "string", - "column_type": "sampler", - } - ], } card = DataDesignerDatasetCard.from_metadata( - metadata=metadata, + metadata=stub_metadata, sdg_config=None, repo_id="test/dataset-with-processors", description="Test dataset with processor outputs.", @@ -168,27 +162,15 @@ def test_from_metadata_with_processors() -> None: assert "Load processor outputs" in card_str -def test_from_metadata_with_custom_description() -> None: +def test_from_metadata_with_custom_description(stub_metadata: dict) -> None: """Test creating dataset card with custom description.""" - metadata = { - "target_num_records": 100, - "schema": {"col1": "string", "col2": "int64"}, - "column_statistics": [ - { - "column_name": "col1", - "num_records": 100, - "num_unique": 100, - "num_null": 0, - "simple_dtype": 
"string", - "column_type": "sampler", - } - ], - } + # Add second column for this test + stub_metadata["schema"]["col2"] = "int64" description = "This dataset contains synthetic data for testing chatbot responses." card = DataDesignerDatasetCard.from_metadata( - metadata=metadata, + metadata=stub_metadata, sdg_config=None, repo_id="test/dataset-with-description", description=description, @@ -199,25 +181,14 @@ def test_from_metadata_with_custom_description() -> None: assert "This dataset contains synthetic data for testing chatbot responses." in card_str -def test_from_metadata_description_placement() -> None: +def test_from_metadata_description_placement(stub_metadata: dict) -> None: """Test that description appears in the correct location.""" - metadata = { - "target_num_records": 50, - "schema": {"col1": "string"}, - "column_statistics": [ - { - "column_name": "col1", - "num_records": 50, - "num_unique": 50, - "num_null": 0, - "simple_dtype": "string", - "column_type": "sampler", - } - ], - } + # Use 50 records for this test + stub_metadata["target_num_records"] = 50 + stub_metadata["column_statistics"][0]["num_records"] = 50 card = DataDesignerDatasetCard.from_metadata( - metadata=metadata, + metadata=stub_metadata, sdg_config=None, repo_id="test/dataset-description-placement", description="Test description placement.", @@ -231,3 +202,66 @@ def test_from_metadata_description_placement() -> None: desc_pos = card_str.find("Test description placement.") summary_pos = card_str.find("Dataset Summary") assert desc_pos < summary_pos + + +def test_from_metadata_default_tags(stub_metadata: dict) -> None: + """Test that default tags are included when no custom tags are provided.""" + card = DataDesignerDatasetCard.from_metadata( + metadata=stub_metadata, + sdg_config=None, + repo_id="test/dataset-default-tags", + description="Test dataset with default tags.", + ) + + card_str = str(card) + assert card is not None + # Check that default tags appear in the YAML frontmatter + assert "- synthetic" in card_str + assert "- datadesigner" in card_str + + +def test_from_metadata_with_custom_tags(stub_metadata: dict) -> None: + """Test that custom tags are added to default tags.""" + custom_tags = ["chatbot", "conversation", "qa"] + + card = DataDesignerDatasetCard.from_metadata( + metadata=stub_metadata, + sdg_config=None, + repo_id="test/dataset-custom-tags", + description="Test dataset with custom tags.", + tags=custom_tags, + ) + + card_str = str(card) + assert card is not None + # Check that both default and custom tags appear in the YAML frontmatter + assert "- synthetic" in card_str + assert "- datadesigner" in card_str + assert "- chatbot" in card_str + assert "- conversation" in card_str + assert "- qa" in card_str + + +def test_from_metadata_tags_in_yaml_frontmatter(stub_metadata: dict) -> None: + """Test that tags appear in the YAML frontmatter section.""" + # Use 50 records for this test + stub_metadata["target_num_records"] = 50 + stub_metadata["column_statistics"][0]["num_records"] = 50 + + card = DataDesignerDatasetCard.from_metadata( + metadata=stub_metadata, + sdg_config=None, + repo_id="test/dataset-tags-frontmatter", + description="Test dataset.", + tags=["custom-tag"], + ) + + card_str = str(card) + assert card is not None + # Tags should appear before the main content (in YAML frontmatter) + tags_section = card_str.find("tags:") + quick_start_section = card_str.find("## 🚀 Quick Start") + assert tags_section < quick_start_section + assert tags_section != -1 # Make sure tags section 
exists + # Verify tags appear before the closing of YAML frontmatter + assert tags_section < card_str.find("---", tags_section) From 081ab2a6a413adc7304a2e766cc047261c034143 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 4 Feb 2026 11:28:35 -0700 Subject: [PATCH 25/25] change sdg.json -> builder_config.json --- .../dataset_builders/artifact_storage.py | 2 +- .../integrations/huggingface/client.py | 55 ++++++++++--------- .../integrations/huggingface/dataset_card.py | 12 ++-- .../huggingface/dataset_card_template.md | 2 +- .../src/data_designer/interface/results.py | 2 +- .../integrations/huggingface/test_client.py | 28 +++++----- .../huggingface/test_dataset_card.py | 26 ++++----- 7 files changed, 66 insertions(+), 61 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py index e2daaca6..35e7d4f8 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py @@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) BATCH_FILE_NAME_FORMAT = "batch_{batch_number:05d}.parquet" -SDG_CONFIG_FILENAME = "sdg.json" +SDG_CONFIG_FILENAME = "builder_config.json" METADATA_FILENAME = "metadata.json" FINAL_DATASET_FOLDER_NAME = "parquet-files" PROCESSORS_OUTPUTS_FOLDER_NAME = "processors-files" diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 7d8c54e4..0812b8de 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -67,12 +67,12 @@ def upload_dataset( Uploads the complete dataset including: - Main parquet batch files from parquet-files/ → data/ - Processor output batch files from processors-files/{name}/ → {name}/ - - Existing sdg.json and metadata.json files + - Existing builder_config.json and metadata.json files - Auto-generated README.md (dataset card) Args: repo_id: Hugging Face dataset repo ID (e.g., "username/dataset-name") - base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) + base_dataset_path: Path to base_dataset_path (contains parquet-files/, builder_config.json, etc.) 
description: Custom description text for dataset card private: Whether to create private repo tags: Additional custom tags for the dataset @@ -94,7 +94,7 @@ def upload_dataset( self._upload_dataset_card( repo_id=repo_id, metadata_path=base_dataset_path / METADATA_FILENAME, - sdg_path=base_dataset_path / SDG_CONFIG_FILENAME, + builder_config_path=base_dataset_path / SDG_CONFIG_FILENAME, description=description, tags=tags, ) @@ -108,7 +108,7 @@ def upload_dataset( self._upload_config_files( repo_id=repo_id, metadata_path=base_dataset_path / METADATA_FILENAME, - sdg_path=base_dataset_path / SDG_CONFIG_FILENAME, + builder_config_path=base_dataset_path / SDG_CONFIG_FILENAME, ) url = f"{HUGGINGFACE_HUB_DATASET_URL_PREFIX}{repo_id}" @@ -210,30 +210,30 @@ def _upload_processor_files(self, repo_id: str, processors_folder: Path) -> None f"Failed to upload processor outputs for '{processor_dir.name}': {e}" ) from e - def _upload_config_files(self, repo_id: str, metadata_path: Path, sdg_path: Path) -> None: - """Upload configuration files (sdg.json and metadata.json). + def _upload_config_files(self, repo_id: str, metadata_path: Path, builder_config_path: Path) -> None: + """Upload configuration files (builder_config.json and metadata.json). Args: repo_id: Hugging Face dataset repo ID metadata_path: Path to metadata.json file - sdg_path: Path to sdg.json file + builder_config_path: Path to builder_config.json file Raises: HuggingFaceUploadError: If upload fails """ logger.info(f" |-- {RandomEmoji.loading()} Uploading configuration files...") - if sdg_path.exists(): + if builder_config_path.exists(): try: self._api.upload_file( repo_id=repo_id, - path_or_fileobj=str(sdg_path), + path_or_fileobj=str(builder_config_path), path_in_repo=SDG_CONFIG_FILENAME, repo_type="dataset", - commit_message="Upload sdg.json", + commit_message="Upload builder_config.json", ) except Exception as e: - raise HuggingFaceHubClientUploadError(f"Failed to upload sdg.json: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to upload builder_config.json: {e}") from e if metadata_path.exists(): tmp_path = None @@ -257,14 +257,19 @@ def _upload_config_files(self, repo_id: str, metadata_path: Path, sdg_path: Path Path(tmp_path).unlink() def _upload_dataset_card( - self, repo_id: str, metadata_path: Path, sdg_path: Path, description: str, tags: list[str] | None = None + self, + repo_id: str, + metadata_path: Path, + builder_config_path: Path, + description: str, + tags: list[str] | None = None, ) -> None: """Generate and upload dataset card from metadata.json. 
Args: repo_id: Hugging Face dataset repo ID metadata_path: Path to metadata.json file - sdg_path: Path to sdg.json file + builder_config_path: Path to builder_config.json file description: Custom description text for dataset card tags: Additional custom tags for the dataset @@ -279,20 +284,20 @@ def _upload_dataset_card( except Exception as e: raise HuggingFaceHubClientUploadError(f"Failed to read {METADATA_FILENAME}: {e}") from e - sdg_config = None - if sdg_path.exists(): + builder_config = None + if builder_config_path.exists(): try: - with open(sdg_path) as f: - sdg_config = json.load(f) + with open(builder_config_path) as f: + builder_config = json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceHubClientUploadError(f"Failed to parse sdg.json: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to parse builder_config.json: {e}") from e except Exception as e: - raise HuggingFaceHubClientUploadError(f"Failed to read sdg.json: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to read builder_config.json: {e}") from e try: card = DataDesignerDatasetCard.from_metadata( metadata=metadata, - sdg_config=sdg_config, + builder_config=builder_config, repo_id=repo_id, description=description, tags=tags, @@ -415,12 +420,12 @@ def _validate_dataset_path(base_dataset_path: Path) -> None: except json.JSONDecodeError as e: raise HuggingFaceHubClientUploadError(f"Invalid JSON in {METADATA_FILENAME}: {e}") - sdg_path = base_dataset_path / SDG_CONFIG_FILENAME - if sdg_path.exists(): - if not sdg_path.is_file(): - raise HuggingFaceHubClientUploadError(f"{SDG_CONFIG_FILENAME} is not a file: {sdg_path}") + builder_config_path = base_dataset_path / SDG_CONFIG_FILENAME + if builder_config_path.exists(): + if not builder_config_path.is_file(): + raise HuggingFaceHubClientUploadError(f"{SDG_CONFIG_FILENAME} is not a file: {builder_config_path}") try: - with open(sdg_path) as f: + with open(builder_config_path) as f: json.load(f) except json.JSONDecodeError as e: raise HuggingFaceHubClientUploadError(f"Invalid JSON in {SDG_CONFIG_FILENAME}: {e}") diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index ba5d2f0e..3c57f743 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -26,16 +26,16 @@ class DataDesignerDatasetCard(DatasetCard): def from_metadata( cls, metadata: dict, - sdg_config: dict | None, + builder_config: dict | None, repo_id: str, description: str, tags: list[str] | None = None, ) -> DataDesignerDatasetCard: - """Create dataset card from metadata.json and sdg.json. + """Create dataset card from metadata.json and builder_config.json. Args: metadata: Contents of metadata.json - sdg_config: Contents of sdg.json (optional) + builder_config: Contents of builder_config.json (optional) repo_id: Hugging Face dataset repo ID description: Custom description text tags: Additional custom tags for the dataset. 
@@ -57,11 +57,11 @@ def from_metadata( # Compute size category size_categories = cls._compute_size_category(actual_num_records) - # Extract column types from sdg.json if available + # Extract column types from builder_config.json if available config_types: dict[str, int] = {} num_columns_configured = 0 - if sdg_config: - columns = sdg_config.get("data_designer", {}).get("columns", []) + if builder_config: + columns = builder_config.get("data_designer", {}).get("columns", []) num_columns_configured = len(columns) for col in columns: col_type = col.get("column_type", "unknown") diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md index 89c87e67..f01ce33d 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md @@ -80,7 +80,7 @@ Generated with {{ num_columns_configured }} column configuration(s): {% endfor %} {% endif %} -📄 Full configuration available in [`sdg.json`](sdg.json) and detailed metadata in [`metadata.json`](metadata.json). +📄 Full configuration available in [`builder_config.json`](builder_config.json) and detailed metadata in [`metadata.json`](metadata.json). --- diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index 5a071469..f86acced 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -112,7 +112,7 @@ def push_to_hub( Uploads all artifacts including: - Main parquet batch files (data subset) - Processor output batch files ({processor_name} subsets) - - Configuration (sdg.json) + - Configuration (builder_config.json) - Metadata (metadata.json) - Auto-generated dataset card (README.md) diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 75d25f6c..735ea3bc 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -39,7 +39,7 @@ def sample_dataset_path(tmp_path: Path) -> Path: - parquet-files/: Main dataset batch files - processors-files/{processor_name}/: Processor output batch files (same structure) - metadata.json: Dataset metadata - - sdg.json: Configuration + - builder_config.json: Configuration """ base_path = tmp_path / "dataset" base_path.mkdir() @@ -92,8 +92,8 @@ def sample_dataset_path(tmp_path: Path) -> Path: } (base_path / "metadata.json").write_text(json.dumps(metadata)) - # Create sdg.json with realistic BuilderConfig structure - sdg_config = { + # Create builder_config.json with realistic BuilderConfig structure + builder_config = { "data_designer": { "columns": [ { @@ -109,7 +109,7 @@ def sample_dataset_path(tmp_path: Path) -> Path: "profilers": None, } } - (base_path / "sdg.json").write_text(json.dumps(sdg_config)) + (base_path / "builder_config.json").write_text(json.dumps(builder_config)) return base_path @@ -182,7 +182,7 @@ def test_upload_dataset_uploads_processor_outputs( def test_upload_dataset_uploads_config_files( mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path ) -> None: - """Test that upload_dataset uploads sdg.json and metadata.json.""" + """Test that 
upload_dataset uploads builder_config.json and metadata.json.""" client = HuggingFaceHubClient(token="test-token") client.upload_dataset( @@ -194,7 +194,7 @@ def test_upload_dataset_uploads_config_files( # Check that upload_file was called for config files upload_file_calls = mock_hf_api.upload_file.call_args_list uploaded_files = [call.kwargs["path_in_repo"] for call in upload_file_calls] - assert "sdg.json" in uploaded_files + assert "builder_config.json" in uploaded_files assert "metadata.json" in uploaded_files @@ -301,10 +301,10 @@ def test_upload_dataset_without_processors( assert len(processor_calls) == 0 # No processor files -def test_upload_dataset_without_sdg_config( +def test_upload_dataset_without_builder_config( mock_hf_api: MagicMock, mock_dataset_card: MagicMock, tmp_path: Path ) -> None: - """Test upload_dataset when sdg.json doesn't exist.""" + """Test upload_dataset when builder_config.json doesn't exist.""" base_path = tmp_path / "dataset" base_path.mkdir() @@ -315,7 +315,7 @@ def test_upload_dataset_without_sdg_config( metadata = {"target_num_records": 10, "schema": {"col1": "string"}, "column_statistics": []} (base_path / "metadata.json").write_text(json.dumps(metadata)) - # No sdg.json file + # No builder_config.json file client = HuggingFaceHubClient(token="test-token") @@ -325,13 +325,13 @@ def test_upload_dataset_without_sdg_config( description="Test dataset", ) - # Should only upload metadata.json, not sdg.json + # Should only upload metadata.json, not builder_config.json file_calls = mock_hf_api.upload_file.call_args_list uploaded_files = [call.kwargs["path_in_repo"] for call in file_calls] assert len(uploaded_files) == 1 # Only metadata.json assert "metadata.json" in uploaded_files - assert "sdg.json" not in uploaded_files + assert "builder_config.json" not in uploaded_files def test_upload_dataset_multiple_processors( @@ -447,13 +447,13 @@ def test_validate_dataset_path_invalid_metadata_json(tmp_path: Path) -> None: client.upload_dataset("test/dataset", base_path, "Test") -def test_validate_dataset_path_invalid_sdg_json(tmp_path: Path) -> None: - """Test upload fails when sdg.json contains invalid JSON.""" +def test_validate_dataset_path_invalid_builder_config_json(tmp_path: Path) -> None: + """Test upload fails when builder_config.json contains invalid JSON.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() (base_path / "metadata.json").write_text('{"target_num_records": 10}') - (base_path / "sdg.json").write_text("invalid json {{{") + (base_path / "builder_config.json").write_text("invalid json {{{") parquet_dir = base_path / "parquet-files" parquet_dir.mkdir() (parquet_dir / "batch_00000.parquet").write_text("data") diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py index a6342b0f..ce7b2832 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -44,7 +44,7 @@ def test_from_metadata_minimal(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset", description="Test dataset for unit testing.", ) @@ -57,8 +57,8 @@ def test_from_metadata_minimal(stub_metadata: dict) -> None: assert "2" in str(card) # Number of columns -def test_from_metadata_with_sdg_config(stub_metadata: 
dict) -> None: - """Test creating dataset card with sdg config.""" +def test_from_metadata_with_builder_config(stub_metadata: dict) -> None: + """Test creating dataset card with builder config.""" # Customize for this test stub_metadata["target_num_records"] = 50 stub_metadata["schema"] = {"name": "string", "age": "int64"} @@ -83,7 +83,7 @@ def test_from_metadata_with_sdg_config(stub_metadata: dict) -> None: }, ] - sdg_config = { + builder_config = { "data_designer": { "columns": [ {"name": "name", "column_type": "sampler"}, @@ -94,9 +94,9 @@ def test_from_metadata_with_sdg_config(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=sdg_config, + builder_config=builder_config, repo_id="test/dataset-with-config", - description="Test dataset with SDG config.", + description="Test dataset with builder config.", ) # Verify card includes config info @@ -125,7 +125,7 @@ def test_from_metadata_with_llm_columns(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/llm-dataset", description="Test dataset with LLM columns.", ) @@ -148,7 +148,7 @@ def test_from_metadata_with_processors(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset-with-processors", description="Test dataset with processor outputs.", ) @@ -171,7 +171,7 @@ def test_from_metadata_with_custom_description(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset-with-description", description=description, ) @@ -189,7 +189,7 @@ def test_from_metadata_description_placement(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset-description-placement", description="Test description placement.", ) @@ -208,7 +208,7 @@ def test_from_metadata_default_tags(stub_metadata: dict) -> None: """Test that default tags are included when no custom tags are provided.""" card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset-default-tags", description="Test dataset with default tags.", ) @@ -226,7 +226,7 @@ def test_from_metadata_with_custom_tags(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset-custom-tags", description="Test dataset with custom tags.", tags=custom_tags, @@ -250,7 +250,7 @@ def test_from_metadata_tags_in_yaml_frontmatter(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset-tags-frontmatter", description="Test dataset.", tags=["custom-tag"],