Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
feadff7
feat: add push_to_hub integration for HuggingFace datasets
nabinchha Jan 30, 2026
3ff3aba
feat: improve push_to_hub with logging, path mapping, and config defi…
nabinchha Jan 30, 2026
0cd2dd1
feat: add optional description parameter to push_to_hub
nabinchha Jan 30, 2026
da2acc8
feat: make description required and enhance dataset card design
nabinchha Jan 30, 2026
5b83a1a
fix license headers
nabinchha Jan 30, 2026
0ecba21
remove modality detection
nabinchha Jan 30, 2026
08b8aa6
break up upload_dataset
nabinchha Jan 30, 2026
ddd5629
make token private
nabinchha Jan 30, 2026
4590c7d
HuggingFace -> Hugging Face
nabinchha Jan 30, 2026
5113069
remove inline imports
nabinchha Jan 30, 2026
02182f9
simplify tests + remove create pr option for simplicity
nabinchha Jan 30, 2026
9b99aed
Update packages/data-designer/src/data_designer/integrations/huggingf…
nabinchha Jan 30, 2026
ce05fa1
use consistent indentation
nabinchha Jan 30, 2026
243c087
fix temp file clean up
nabinchha Jan 30, 2026
de61805
huggingface hub already a dep in engine
nabinchha Feb 2, 2026
f0e3fcb
add missing spaces
nabinchha Feb 2, 2026
99c61fe
reuse vars from artifact_storage.py
nabinchha Feb 2, 2026
3270332
pull out hf hub datasets url to constants
nabinchha Feb 2, 2026
ead52f5
HuggingfaceUploadError -> HuggingFaceHubClientUploadError
nabinchha Feb 3, 2026
bc90dcb
defer to hfhub repo validation
nabinchha Feb 3, 2026
6bfd2df
Merge branch 'main' into nmulepati/feat/7-push-to-hf
nabinchha Feb 3, 2026
5b8dc9c
Merge branch 'main' into nmulepati/feat/7-push-to-hf
nabinchha Feb 3, 2026
4f8c4a0
Update packages/data-designer/src/data_designer/integrations/huggingf…
nabinchha Feb 4, 2026
d2fd641
Update packages/data-designer/src/data_designer/interface/results.py
nabinchha Feb 4, 2026
afbdac7
Update packages/data-designer/src/data_designer/integrations/huggingf…
nabinchha Feb 4, 2026
e56c846
allow custom tags
nabinchha Feb 4, 2026
081ab2a
change sdg.json -> builder_config.json
nabinchha Feb 4, 2026
6ab60d4
Merge branch 'main' into nmulepati/feat/7-push-to-hf
nabinchha Feb 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -372,3 +372,5 @@ class NordColor(Enum):
LOCALES_WITH_MANAGED_DATASETS = list[str](NEMOTRON_PERSONAS_DATASET_SIZES.keys())

NEMOTRON_PERSONAS_DATASET_PREFIX = "nemotron-personas-dataset-"

HUGGINGFACE_HUB_DATASET_URL_PREFIX = "https://huggingface.co/datasets/"
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@
logger = logging.getLogger(__name__)

BATCH_FILE_NAME_FORMAT = "batch_{batch_number:05d}.parquet"
SDG_CONFIG_FILENAME = "sdg.json"
SDG_CONFIG_FILENAME = "builder_config.json"
METADATA_FILENAME = "metadata.json"
FINAL_DATASET_FOLDER_NAME = "parquet-files"
PROCESSORS_OUTPUTS_FOLDER_NAME = "processors-files"


class BatchStage(StrEnum):
Expand All @@ -37,10 +40,10 @@ class BatchStage(StrEnum):
class ArtifactStorage(BaseModel):
artifact_path: Path | str
dataset_name: str = "dataset"
final_dataset_folder_name: str = "parquet-files"
final_dataset_folder_name: str = FINAL_DATASET_FOLDER_NAME
partial_results_folder_name: str = "tmp-partial-parquet-files"
dropped_columns_folder_name: str = "dropped-columns-parquet-files"
processors_outputs_folder_name: str = "processors-files"
processors_outputs_folder_name: str = PROCESSORS_OUTPUTS_FOLDER_NAME

@property
def artifact_path_exists(self) -> bool:
Expand Down Expand Up @@ -72,7 +75,7 @@ def final_dataset_path(self) -> Path:

@property
def metadata_file_path(self) -> Path:
return self.base_dataset_path / "metadata.json"
return self.base_dataset_path / METADATA_FILENAME

@property
def partial_results_path(self) -> Path:
Expand Down Expand Up @@ -259,7 +262,7 @@ def write_metadata(self, metadata: dict) -> Path:
"""
self.mkdir_if_needed(self.base_dataset_path)
with open(self.metadata_file_path, "w") as file:
json.dump(metadata, file, indent=4, sort_keys=True)
json.dump(metadata, file, indent=2, sort_keys=True)
return self.metadata_file_path

def update_metadata(self, updates: dict) -> Path:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceHubClientUploadError
from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard

# Names re-exported from the `client` and `dataset_card` modules imported above.
__all__ = [
    "HuggingFaceHubClient",
    "HuggingFaceHubClientUploadError",
    "DataDesignerDatasetCard",
]
Loading