diff --git a/docs/RFC-FERN-MIGRATION.md b/docs/RFC-FERN-MIGRATION.md new file mode 100644 index 00000000..6964b201 --- /dev/null +++ b/docs/RFC-FERN-MIGRATION.md @@ -0,0 +1,1371 @@ +# RFC: Migration from MkDocs to Fern Docs + +**Status:** Draft +**Author:** [Author Name] +**Owner:** [Owner Name] +**Created:** 2026-01-14 +**Last Updated:** 2026-01-14 +**Target Completion:** [YYYY-MM-DD] + +--- + +## Summary + +This RFC proposes migrating the NeMo Data Designer documentation from MkDocs Material to [Fern Docs](https://buildwithfern.com/learn/docs/getting-started/overview). The migration will be performed incrementally by creating a new `docs-fern/` directory, preserving all existing content while adapting to Fern's component system. + +## Motivation + +This migration is **mandated** as part of NVIDIA's documentation platform standardization initiative. + +**Additional benefits:** + +- **Modern documentation platform**: Fern offers AI-native features including Ask Fern and auto-generated MCP servers +- **Enhanced API documentation**: Better support for API reference documentation from OpenAPI specs +- **Improved developer experience**: Rich component library with interactive elements +- **Self-hosting options**: Flexible deployment for enterprise requirements + +## Scope + +### In Scope + +- 1:1 content migration (no content changes) +- Component mapping from MkDocs Material to Fern equivalents +- Navigation structure preservation +- Code reference documentation migration + +### Out of Scope + +- Content rewrites or restructuring +- New features or sections +- Removal of existing documentation + +--- + +## Current Documentation Inventory + +### File Structure + +``` +docs/ +├── index.md # Home page +├── installation.md # Installation guide +├── quick-start.md # Quick start tutorial +├── CONTRIBUTING.md # Contribution guide +├── concepts/ +│ ├── columns.md +│ ├── validators.md +│ ├── processors.md +│ ├── person_sampling.md +│ └── models/ +│ ├── default-model-settings.md +│ ├── custom-model-settings.md +│ ├── configure-model-settings-with-the-cli.md +│ ├── model-providers.md +│ ├── model-configs.md +│ └── inference-parameters.md +├── recipes/ +│ ├── cards.md +│ ├── code_generation/ +│ │ ├── text_to_python.md +│ │ └── text_to_sql.md +│ └── qa_and_chat/ +│ ├── product_info_qa.md +│ └── multi_turn_chat.md +├── plugins/ +│ ├── overview.md +│ ├── example.md +│ └── available.md +├── code_reference/ # Auto-generated API docs +│ ├── models.md +│ ├── column_configs.md +│ ├── config_builder.md +│ ├── data_designer_config.md +│ ├── sampler_params.md +│ ├── validator_params.md +│ ├── processors.md +│ └── analysis.md +├── colab_notebooks/ # Jupyter notebooks +│ ├── 1-the-basics.ipynb +│ ├── 2-structured-outputs-and-jinja-expressions.ipynb +│ ├── 3-seeding-with-a-dataset.ipynb +│ └── 4-providing-images-as-context.ipynb +├── assets/ +│ └── recipes/ # Downloadable code files +├── css/ # Custom styles +├── js/ # Custom scripts +└── overrides/ # MkDocs template overrides +``` + +### Current Navigation Structure + +```yaml +nav: + - Getting Started: + - Welcome: index.md + - Installation: installation.md + - Quick Start: quick-start.md + - Contributing: CONTRIBUTING.md + - Concepts: + - Models: (6 sub-pages) + - Columns: concepts/columns.md + - Validators: concepts/validators.md + - Processors: concepts/processors.md + - Person Sampling: concepts/person_sampling.md + - Tutorials: + - Overview + 4 Jupyter notebooks + - Recipes: + - Recipe Cards + 4 recipes + - Plugins: + - 3 pages + - Code Reference: + - 8 
auto-generated API docs
+```
+
+---
+
+## Component Mapping
+
+### MkDocs → Fern Component Equivalents
+
+Reference: [Fern Components Overview](https://buildwithfern.com/learn/docs/writing-content/components/overview)
+
+| MkDocs Feature | Current Syntax | Fern Equivalent | Notes |
+|----------------|----------------|-----------------|-------|
+| **Admonitions** | `!!! note "Title"` | `<Note>`, `<Tip>`, `<Warning>`, `<Info>` | See [Callouts](#1-admonitions--callouts) |
+| **Tabbed Content** | `=== "Tab 1"` | `<Tabs>` + `<Tab>` | See [Tabs](#2-tabbed-content) |
+| **Code Blocks** | ` ```python ` | ` ```python ` | Direct compatibility |
+| **Code Snippets** | `--8<-- "path"` | `<CodeBlock>` with `src` | File embedding |
+| **Grid Cards** | `<div class="grid cards">` | `<CardGroup>` + `<Card>` | See [Cards](#3-grid-cards) |
+| **Icons** | `:material-xxx:` | Fern icons or inline SVG | Limited support |
+| **Download Links** | `{ .md-button download=}` | Standard markdown links | Simplified |
+| **API Docs** | `::: module.path` | Manual or OpenAPI import | See [API Reference](#4-api-reference) |
+| **Jupyter Notebooks** | `.ipynb` files | Convert to MDX or embed | See [Notebooks](#5-jupyter-notebooks) |
+| **Versioning** | Mike plugin | Fern versioning | Built-in support |
+
+---
+
+## Detailed Component Migrations
+
+### 1. Admonitions → Callouts
+
+**Current MkDocs syntax:**
+
+```markdown
+!!! note "The Declarative Approach"
+    Columns are **declarative specifications**. You describe *what* you want...
+
+!!! tip "Conditional Sampling"
+    Samplers support **conditional parameters**...
+
+!!! question "New to Data Designer?"
+    Recipes provide working code...
+
+!!! warning "Important"
+    This action cannot be undone.
+```
+
+**Fern equivalent:**
+
+```mdx
+<Note title="The Declarative Approach">
+Columns are **declarative specifications**. You describe *what* you want...
+</Note>
+
+<Tip title="Conditional Sampling">
+Samplers support **conditional parameters**...
+</Tip>
+
+<Info title="New to Data Designer?">
+Recipes provide working code...
+</Info>
+
+<Warning title="Important">
+This action cannot be undone.
+</Warning>
+```
+
+**Migration mapping:**
+
+| MkDocs Admonition | Fern Callout |
+|-------------------|--------------|
+| `!!! note` | `<Note>` |
+| `!!! tip` | `<Tip>` |
+| `!!! info` | `<Info>` |
+| `!!! warning` | `<Warning>` |
+| `!!! question` | `<Info>` |
+| `!!! danger` | `<Warning>` |
+
+### 2. Tabbed Content
+
+**Current MkDocs syntax (installation.md):**
+
+```markdown
+=== "pip"
+
+    ```bash
+    pip install data-designer
+    ```
+
+=== "uv"
+
+    ```bash
+    uv add data-designer
+    ```
+```
+
+**Fern equivalent:**
+
+```mdx
+<Tabs>
+  <Tab title="pip">
+    ```bash
+    pip install data-designer
+    ```
+  </Tab>
+  <Tab title="uv">
+    ```bash
+    uv add data-designer
+    ```
+  </Tab>
+</Tabs>
+```
+
+### 3. Grid Cards
+
+**Current MkDocs syntax (recipes/cards.md):**
+
+```markdown
+<div class="grid cards" markdown>
+
+- :material-snake:{ .lg .middle } **Text to Python**
+
+    Generate a dataset of natural language instructions...
+
+    ---
+
+    **Demonstrates:**
+    - Python code generation
+    - Python code validation
+
+    ---
+
+    [:material-book-open-page-variant: View Recipe](code_generation/text_to_python.md){ .md-button }
+
+</div>
+``` + +**Fern equivalent:** + +```mdx + + + Generate a dataset of natural language instructions... + + **Demonstrates:** + - Python code generation + - Python code validation + + +``` + +### 4. API Reference (mkdocstrings) + +**Current MkDocs syntax (code_reference/models.md):** + +```markdown +# Models + +The `models` module defines configuration objects... + +::: data_designer.config.models +``` + +**Fern options:** + +**Option A: Manual Documentation** +Convert auto-generated docs to manually written MDX with code examples. + +**Option B: OpenAPI Integration** +If the API has an OpenAPI spec, use Fern's native API reference generation. + +**Option C: TypeDoc/PyDoc Integration** +Use Fern's SDK documentation features if available. + +**Recommendation:** Start with Option A (manual) for initial migration, evaluate automation options post-migration. + +### 5. Jupyter Notebooks + +**Current approach:** `mkdocs-jupyter` plugin renders `.ipynb` files directly. + +**Fern options:** + +**Option A: Convert to MDX** +Convert notebooks to MDX files with code blocks and output screenshots. + +**Option B: Embed as iframes** +Host notebooks on Colab/GitHub and embed links. + +**Option C: Use Fern's code playground** +If available, use interactive code features. + +**Recommendation:** Convert to MDX with static code blocks and link to Colab for interactive experience (preserves current Colab badge functionality). + +### 6. Code Snippets (pymdownx.snippets) + +**Current MkDocs syntax:** + +```markdown +```python +--8<-- "assets/recipes/code_generation/text_to_python.py" +``` +``` + +**Fern equivalent:** + +```mdx + +``` + +Or inline the code directly if file embedding isn't supported. + +--- + +## Proposed Directory Structure + +``` +docs-fern/ +├── fern.config.json # Fern configuration +├── docs.yml # Navigation and settings +├── pages/ +│ ├── index.mdx # Home page +│ ├── installation.mdx +│ ├── quick-start.mdx +│ ├── contributing.mdx +│ ├── concepts/ +│ │ ├── columns.mdx +│ │ ├── validators.mdx +│ │ ├── processors.mdx +│ │ ├── person-sampling.mdx +│ │ └── models/ +│ │ ├── default-model-settings.mdx +│ │ ├── custom-model-settings.mdx +│ │ ├── configure-with-cli.mdx +│ │ ├── model-providers.mdx +│ │ ├── model-configs.mdx +│ │ └── inference-parameters.mdx +│ ├── tutorials/ +│ │ ├── overview.mdx +│ │ ├── the-basics.mdx +│ │ ├── structured-outputs.mdx +│ │ ├── seeding-with-dataset.mdx +│ │ └── images-as-context.mdx +│ ├── recipes/ +│ │ ├── index.mdx # Recipe cards +│ │ ├── code-generation/ +│ │ │ ├── text-to-python.mdx +│ │ │ └── text-to-sql.mdx +│ │ └── qa-and-chat/ +│ │ ├── product-info-qa.mdx +│ │ └── multi-turn-chat.mdx +│ ├── plugins/ +│ │ ├── overview.mdx +│ │ ├── example.mdx +│ │ └── available.mdx +│ └── api-reference/ +│ ├── models.mdx +│ ├── column-configs.mdx +│ ├── config-builder.mdx +│ ├── data-designer-config.mdx +│ ├── sampler-params.mdx +│ ├── validator-params.mdx +│ ├── processors.mdx +│ └── analysis.mdx +├── assets/ +│ ├── favicon.png +│ └── recipes/ # Downloadable code files +│ ├── code_generation/ +│ └── qa_and_chat/ +└── styles/ + └── custom.css # Custom styling (if needed) +``` + +--- + +## URL Redirect Mapping + +To preserve existing bookmarks and SEO, all old URLs must redirect to their new locations. 
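+
+As a post-launch sanity check, each legacy URL in the table below can be requested and the final landing page compared against the expected target. The sketch that follows is illustrative only and is not part of the migration tooling: the `verify_redirects.py` name, the `BASE_URL`, and the use of the third-party `requests` package are assumptions, and the mapping shown is just a subset of the full table.
+
+```python
+# verify_redirects.py -- hypothetical helper; adjust BASE_URL and the mapping to the final deployment.
+import requests
+
+BASE_URL = "https://datadesigner.docs.nvidia.com"  # assumed production host
+
+# Subset of the old -> new mapping from the table below.
+REDIRECTS = {
+    "/concepts/person_sampling/": "/docs/concepts/person-sampling",
+    "/recipes/cards/": "/docs/recipes",
+    "/code_reference/models/": "/api/models",
+}
+
+def check_redirects() -> int:
+    failures = 0
+    for old, new in REDIRECTS.items():
+        # Follow redirects and compare the final path with the expected target.
+        response = requests.get(BASE_URL + old, allow_redirects=True, timeout=10)
+        final_path = response.url.removeprefix(BASE_URL).rstrip("/")
+        if final_path != new.rstrip("/"):
+            print(f"FAIL: {old} -> {final_path or '/'} (expected {new})")
+            failures += 1
+    return failures
+
+if __name__ == "__main__":
+    raise SystemExit(check_redirects())
+```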
+ +### Redirect Rules + +| Old MkDocs URL | New Fern URL | +|----------------|--------------| +| `/` | `/docs` | +| `/installation/` | `/docs/installation` | +| `/quick-start/` | `/docs/quick-start` | +| `/CONTRIBUTING/` | `/docs/contributing` | +| `/concepts/columns/` | `/docs/concepts/columns` | +| `/concepts/validators/` | `/docs/concepts/validators` | +| `/concepts/processors/` | `/docs/concepts/processors` | +| `/concepts/person_sampling/` | `/docs/concepts/person-sampling` | +| `/concepts/models/default-model-settings/` | `/docs/concepts/models/default-model-settings` | +| `/concepts/models/custom-model-settings/` | `/docs/concepts/models/custom-model-settings` | +| `/concepts/models/configure-model-settings-with-the-cli/` | `/docs/concepts/models/configure-with-cli` | +| `/concepts/models/model-providers/` | `/docs/concepts/models/model-providers` | +| `/concepts/models/model-configs/` | `/docs/concepts/models/model-configs` | +| `/concepts/models/inference-parameters/` | `/docs/concepts/models/inference-parameters` | +| `/tutorials/` | `/docs/tutorials/overview` | +| `/recipes/cards/` | `/docs/recipes` | +| `/recipes/code_generation/text_to_python/` | `/docs/recipes/code-generation/text-to-python` | +| `/recipes/code_generation/text_to_sql/` | `/docs/recipes/code-generation/text-to-sql` | +| `/recipes/qa_and_chat/product_info_qa/` | `/docs/recipes/qa-and-chat/product-info-qa` | +| `/recipes/qa_and_chat/multi_turn_chat/` | `/docs/recipes/qa-and-chat/multi-turn-chat` | +| `/plugins/overview/` | `/docs/plugins/overview` | +| `/plugins/example/` | `/docs/plugins/example` | +| `/plugins/available/` | `/docs/plugins/available` | +| `/code_reference/*` | `/api/*` | + +### Implementation + +**Option A: Fern redirects configuration** (if supported) + +```yaml +# In docs.yml +redirects: + - from: /concepts/person_sampling + to: /docs/concepts/person-sampling + # ... 
additional redirects +``` + +**Option B: Hosting platform redirects** + +For Netlify (`_redirects` file): +``` +/concepts/person_sampling/* /docs/concepts/person-sampling/:splat 301 +/code_reference/* /api/:splat 301 +``` + +For nginx: +```nginx +rewrite ^/concepts/person_sampling(.*)$ /docs/concepts/person-sampling$1 permanent; +rewrite ^/code_reference/(.*)$ /api/$1 permanent; +``` + +--- + +## Configuration Files + +### fern.config.json + +```json +{ + "organization": "nvidia-nemo", + "version": "1.0.0" +} +``` + +### docs.yml + +```yaml +instances: + - url: https://datadesigner.docs.nvidia.com + +title: NeMo Data Designer + +tabs: + docs: + display-name: Documentation + slug: docs + api: + display-name: API Reference + slug: api + +navigation: + - tab: docs + layout: + - section: Getting Started + contents: + - page: Welcome + path: pages/index.mdx + - page: Installation + path: pages/installation.mdx + - page: Quick Start + path: pages/quick-start.mdx + - page: Contributing + path: pages/contributing.mdx + - section: Concepts + contents: + - section: Models + contents: + - page: Default Model Settings + path: pages/concepts/models/default-model-settings.mdx + - page: Custom Model Settings + path: pages/concepts/models/custom-model-settings.mdx + - page: Configure with CLI + path: pages/concepts/models/configure-with-cli.mdx + - page: Model Providers + path: pages/concepts/models/model-providers.mdx + - page: Model Configs + path: pages/concepts/models/model-configs.mdx + - page: Inference Parameters + path: pages/concepts/models/inference-parameters.mdx + - page: Columns + path: pages/concepts/columns.mdx + - page: Validators + path: pages/concepts/validators.mdx + - page: Processors + path: pages/concepts/processors.mdx + - page: Person Sampling + path: pages/concepts/person-sampling.mdx + - section: Tutorials + contents: + - page: Overview + path: pages/tutorials/overview.mdx + - page: The Basics + path: pages/tutorials/the-basics.mdx + - page: Structured Outputs + path: pages/tutorials/structured-outputs.mdx + - page: Seeding with a Dataset + path: pages/tutorials/seeding-with-dataset.mdx + - page: Images as Context + path: pages/tutorials/images-as-context.mdx + - section: Recipes + contents: + - page: Recipe Cards + path: pages/recipes/index.mdx + - section: Code Generation + contents: + - page: Text to Python + path: pages/recipes/code-generation/text-to-python.mdx + - page: Text to SQL + path: pages/recipes/code-generation/text-to-sql.mdx + - section: QA and Chat + contents: + - page: Product Info QA + path: pages/recipes/qa-and-chat/product-info-qa.mdx + - page: Multi-Turn Chat + path: pages/recipes/qa-and-chat/multi-turn-chat.mdx + - section: Plugins + contents: + - page: Overview + path: pages/plugins/overview.mdx + - page: Example Plugin + path: pages/plugins/example.mdx + - page: Available Plugins + path: pages/plugins/available.mdx + - tab: api + layout: + - section: API Reference + contents: + - page: Models + path: pages/api-reference/models.mdx + - page: Column Configs + path: pages/api-reference/column-configs.mdx + - page: Config Builder + path: pages/api-reference/config-builder.mdx + - page: Data Designer Config + path: pages/api-reference/data-designer-config.mdx + - page: Sampler Params + path: pages/api-reference/sampler-params.mdx + - page: Validator Params + path: pages/api-reference/validator-params.mdx + - page: Processors + path: pages/api-reference/processors.mdx + - page: Analysis + path: pages/api-reference/analysis.mdx + +colors: + accent-primary: + 
dark: "#76B900" + light: "#76B900" + background: + dark: "#1a1a1a" + light: "#ffffff" + +logo: + dark: assets/favicon.png + light: assets/favicon.png + +favicon: assets/favicon.png + +navbar-links: + - type: github + value: https://github.com/NVIDIA-NeMo/DataDesigner +``` + +--- + +## Migration Plan + +### Phase 1: Setup (1 day) + +1. Create `docs-fern/` directory structure +2. Initialize Fern configuration files +3. Set up local development environment +4. Verify Fern CLI works (`fern check`, `fern generate`) + +### Phase 2: Core Pages Migration (2-3 days) + +1. Migrate Getting Started section + - `index.md` → `index.mdx` + - `installation.md` → `installation.mdx` + - `quick-start.md` → `quick-start.mdx` + - `CONTRIBUTING.md` → `contributing.mdx` + +2. Migrate Concepts section (6 model pages + 4 concept pages) + +3. Migrate Plugins section (3 pages) + +### Phase 3: Complex Content Migration (3-4 days) + +1. Convert Jupyter notebooks to MDX + - Extract code cells as code blocks + - Convert markdown cells directly + - Add Colab badges/links + +2. Migrate Recipes section + - Convert grid cards to Fern Cards + - Migrate recipe content pages + - Handle code snippet embedding + +### Phase 4: API Reference Migration (2-3 days) + +1. Extract API documentation from mkdocstrings output +2. Manually format as MDX pages +3. Add code examples and cross-references + +### Phase 5: Styling and Polish (1-2 days) + +1. Apply NVIDIA branding (green accent color) +2. Configure navigation and tabs +3. Add favicon and logos +4. Test responsive design + +### Phase 6: Testing and Validation (1-2 days) + +1. Review all pages for rendering issues +2. Verify all links work +3. Test navigation flow +4. Compare against original docs for completeness + +--- + +## CI/CD Pipeline Changes + +### Current MkDocs Pipeline + +```yaml +# Current workflow (to be replaced) +- name: Build docs + run: mkdocs build + +- name: Deploy docs + run: mkdocs gh-deploy +``` + +### New Fern Pipeline + +```yaml +# .github/workflows/docs.yml +name: Documentation + +on: + push: + branches: [main] + paths: + - 'docs-fern/**' + pull_request: + paths: + - 'docs-fern/**' + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Install Fern CLI + run: npm install -g fern-api + + - name: Validate Fern config + run: fern check + working-directory: docs-fern + + - name: Generate docs (PR preview) + if: github.event_name == 'pull_request' + run: fern generate --docs --preview + working-directory: docs-fern + env: + FERN_TOKEN: ${{ secrets.FERN_TOKEN }} + + - name: Deploy docs (production) + if: github.ref == 'refs/heads/main' + run: fern generate --docs + working-directory: docs-fern + env: + FERN_TOKEN: ${{ secrets.FERN_TOKEN }} +``` + +### Required Secrets + +| Secret | Description | +|--------|-------------| +| `FERN_TOKEN` | API token from Fern dashboard for deployments | + +### Local Development + +```bash +# Install Fern CLI +npm install -g fern-api + +# Navigate to docs directory +cd docs-fern + +# Validate configuration +fern check + +# Local preview (starts dev server) +fern docs dev + +# Generate static output +fern generate --docs +``` + +--- + +## Deprecation Timeline + +### Week 1-2: Parallel Operation + +- `docs-fern/` is the primary documentation source +- `docs/` remains for reference and rollback capability +- Both directories exist in repository +- MkDocs config (`mkdocs.yml`) remains but is not used in CI + +### 
Week 3: Soft Deprecation + +- Remove MkDocs from CI/CD pipeline +- Add deprecation notice to `docs/README.md`: + ```markdown + > ⚠️ **DEPRECATED**: This directory is no longer maintained. + > Documentation has moved to `docs-fern/`. + > This directory will be removed on [DATE]. + ``` +- Update `CONTRIBUTING.md` to reference new docs location + +### Week 4: Hard Deprecation + +- Delete `docs/` directory +- Delete `mkdocs.yml` +- Remove MkDocs dependencies from `pyproject.toml`: + - `mkdocs` + - `mkdocs-material` + - `mkdocs-jupyter` + - `mkdocstrings` + - `mkdocstrings-python` +- Update `.gitignore` to remove MkDocs artifacts (`site/`) +- Archive final MkDocs state in git tag: `mkdocs-final` + +### Post-Migration Cleanup + +- Remove custom CSS (`docs/css/`) +- Remove custom JS (`docs/js/`) +- Remove template overrides (`docs/overrides/`) +- Update README.md documentation links + +--- + +## Risks and Mitigations + +| Risk | Impact | Likelihood | Mitigation | Owner | +|------|--------|------------|------------|-------| +| API reference quality loss | High | Medium | Document Python APIs manually with curated examples; add to PR checklist | [Owner] | +| Notebook interactivity loss | Medium | Low | Link to Colab badges at top of each tutorial; keep `.ipynb` files hosted | [Owner] | +| Icon support gaps | Low | High | Replace `:material-xxx:` with emoji or text labels; document in style guide | [Owner] | +| Custom CSS incompatibility | Low | Medium | Use Fern's built-in components; minimal custom CSS only if essential | [Owner] | +| Build/deploy workflow breaks | Medium | Medium | Test CI/CD in separate branch before merging; keep MkDocs as fallback for 2 weeks | [Owner] | +| SEO ranking drop | Medium | Medium | Implement all redirects before deprecating old URLs; submit sitemap to search engines | [Owner] | +| Broken links post-migration | Medium | High | Run automated link checker before go-live; fix all broken links | [Owner] | + +--- + +## Common Pitfalls & Troubleshooting + +### Pitfall 1: Nested Admonitions + +MkDocs supports nested admonitions; Fern callouts do not nest well. + +**Problem:** +```markdown +!!! note + Some text + !!! warning + Nested warning +``` + +**Solution:** Flatten to sequential callouts: +```mdx + +Some text + + + +Nested warning (now separate) + +``` + +### Pitfall 2: Code Blocks Inside Tabs + +Indentation is critical. Fern expects proper nesting. + +**Problem (incorrect indentation):** +```mdx + + +```python +code +``` + + +``` + +**Solution (correct indentation):** +```mdx + + + ```python + code + ``` + + +``` + +### Pitfall 3: MkDocs-Specific Syntax + +These MkDocs features have no direct Fern equivalent: + +| MkDocs Syntax | Action | +|---------------|--------| +| `{ .md-button }` | Remove, use standard links | +| `{ .annotate }` | Remove, use inline notes | +| `[TOC]` | Remove, Fern auto-generates TOC | +| `--8<-- "file"` | Inline the code or use `` | +| `::: module.path` | Convert to manual documentation | + +### Pitfall 4: Image Paths + +MkDocs resolves images relative to the markdown file; Fern resolves from project root. + +**MkDocs:** +```markdown +![Alt](../assets/image.png) +``` + +**Fern:** +```mdx +![Alt](/assets/image.png) +``` + +### Pitfall 5: Front Matter + +Fern uses YAML front matter for page metadata. 
Add to each file: + +```mdx +--- +title: Page Title +description: Optional description for SEO +--- +``` + +### Troubleshooting Commands + +```bash +# Validate all Fern configuration +fern check + +# See detailed errors +fern check --log-level debug + +# Preview locally before deploying +fern docs dev + +# Check for broken internal links +grep -r '](/[^)]*\.mdx)' docs-fern/pages/ | grep -v '^#' +``` + +--- + +## Rollback Plan + +If critical issues are discovered post-migration, follow this rollback procedure: + +### Trigger Conditions + +Initiate rollback if any of these occur within 2 weeks of go-live: + +- [ ] >10% of pages have rendering issues +- [ ] Search functionality broken +- [ ] CI/CD pipeline repeatedly failing +- [ ] Critical content missing or incorrect +- [ ] Stakeholder requests rollback + +### Rollback Steps + +**Step 1: Restore MkDocs CI/CD (15 minutes)** + +```yaml +# Revert .github/workflows/docs.yml to MkDocs version +git revert +git push origin main +``` + +**Step 2: Restore DNS/Hosting (if changed)** + +Point documentation URL back to MkDocs deployment location. + +**Step 3: Communicate** + +Notify team: +> Documentation rollback initiated due to [REASON]. +> MkDocs docs restored at [URL]. +> Fern migration paused pending [ISSUE] resolution. + +**Step 4: Preserve Fern Work** + +```bash +# Don't delete - branch and preserve +git checkout -b fern-migration-paused +git push origin fern-migration-paused +``` + +**Step 5: Post-Mortem** + +Document: +- What triggered the rollback +- Root cause analysis +- Required fixes before retry +- Updated timeline + +### Rollback Window + +- **Weeks 1-2**: Full rollback capability (MkDocs still in repo) +- **Week 3+**: Rollback requires restoring from `mkdocs-final` tag +- **Week 4+**: Rollback requires significant effort (MkDocs deleted) + +--- + +## Pre-Flight Checklist + +Before starting migration, ensure: + +- [ ] Fern account created and `FERN_TOKEN` obtained +- [ ] Hosting decision finalized (Section: Decisions #4) +- [ ] Timeline approved and dates filled in (Section: Decisions #5) +- [ ] Owner assigned in RFC header +- [ ] Team notified of upcoming changes +- [ ] Current docs snapshot archived (`git tag mkdocs-snapshot-pre-migration`) + +--- + +## Conversion Checklist + +### File-by-File Migration Tracker + +Use this checklist during Phase 2-4 to track progress: + +#### Getting Started +- [ ] `index.md` → `pages/index.mdx` +- [ ] `installation.md` → `pages/installation.mdx` +- [ ] `quick-start.md` → `pages/quick-start.mdx` +- [ ] `CONTRIBUTING.md` → `pages/contributing.mdx` + +#### Concepts - Models +- [ ] `concepts/models/default-model-settings.md` → `pages/concepts/models/default-model-settings.mdx` +- [ ] `concepts/models/custom-model-settings.md` → `pages/concepts/models/custom-model-settings.mdx` +- [ ] `concepts/models/configure-model-settings-with-the-cli.md` → `pages/concepts/models/configure-with-cli.mdx` +- [ ] `concepts/models/model-providers.md` → `pages/concepts/models/model-providers.mdx` +- [ ] `concepts/models/model-configs.md` → `pages/concepts/models/model-configs.mdx` +- [ ] `concepts/models/inference-parameters.md` → `pages/concepts/models/inference-parameters.mdx` + +#### Concepts - Other +- [ ] `concepts/columns.md` → `pages/concepts/columns.mdx` +- [ ] `concepts/validators.md` → `pages/concepts/validators.mdx` +- [ ] `concepts/processors.md` → `pages/concepts/processors.mdx` +- [ ] `concepts/person_sampling.md` → `pages/concepts/person-sampling.mdx` + +#### Tutorials (Notebook Conversion) +- [ ] 
`colab_notebooks/1-the-basics.ipynb` → `pages/tutorials/the-basics.mdx` +- [ ] `colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb` → `pages/tutorials/structured-outputs.mdx` +- [ ] `colab_notebooks/3-seeding-with-a-dataset.ipynb` → `pages/tutorials/seeding-with-dataset.mdx` +- [ ] `colab_notebooks/4-providing-images-as-context.ipynb` → `pages/tutorials/images-as-context.mdx` +- [ ] Create `pages/tutorials/overview.mdx` (new index page) + +#### Recipes +- [ ] `recipes/cards.md` → `pages/recipes/index.mdx` +- [ ] `recipes/code_generation/text_to_python.md` → `pages/recipes/code-generation/text-to-python.mdx` +- [ ] `recipes/code_generation/text_to_sql.md` → `pages/recipes/code-generation/text-to-sql.mdx` +- [ ] `recipes/qa_and_chat/product_info_qa.md` → `pages/recipes/qa-and-chat/product-info-qa.mdx` +- [ ] `recipes/qa_and_chat/multi_turn_chat.md` → `pages/recipes/qa-and-chat/multi-turn-chat.mdx` + +#### Plugins +- [ ] `plugins/overview.md` → `pages/plugins/overview.mdx` +- [ ] `plugins/example.md` → `pages/plugins/example.mdx` +- [ ] `plugins/available.md` → `pages/plugins/available.mdx` + +#### API Reference +- [ ] `code_reference/models.md` → `pages/api-reference/models.mdx` +- [ ] `code_reference/column_configs.md` → `pages/api-reference/column-configs.mdx` +- [ ] `code_reference/config_builder.md` → `pages/api-reference/config-builder.mdx` +- [ ] `code_reference/data_designer_config.md` → `pages/api-reference/data-designer-config.mdx` +- [ ] `code_reference/sampler_params.md` → `pages/api-reference/sampler-params.mdx` +- [ ] `code_reference/validator_params.md` → `pages/api-reference/validator-params.mdx` +- [ ] `code_reference/processors.md` → `pages/api-reference/processors.mdx` +- [ ] `code_reference/analysis.md` → `pages/api-reference/analysis.mdx` + +#### Assets +- [ ] Copy `assets/palette-favicon.png` → `assets/favicon.png` +- [ ] Copy `assets/recipes/` → `assets/recipes/` + +--- + +## Success Criteria + +- [ ] All existing documentation pages migrated (32 pages total) +- [ ] Navigation structure preserved +- [ ] All code examples render correctly +- [ ] All internal links functional (automated check) +- [ ] All external links functional (automated check) +- [ ] NVIDIA branding applied (green accent: #76B900) +- [ ] Local development workflow documented +- [ ] CI/CD pipeline deployed and tested +- [ ] URL redirects configured and tested +- [ ] PR preview deployments working +- [ ] Page load time < 3 seconds + +--- + +## Decisions + +The following decisions have been made to ensure smooth execution: + +### 1. API Reference Approach + +**Decision:** Manual documentation with code examples (Option A) + +**Rationale:** +- Fastest path to migration completion +- Allows curated examples rather than raw docstring dumps +- Fern's Python SDK autodoc is not mature enough for our needs + +**Maintenance commitment:** +- API reference pages will be updated alongside code changes +- Add to PR checklist: "Update API docs if public interfaces changed" +- Revisit automation options in Q2 2026 + +### 2. 
Notebook Handling + +**Decision:** Convert to MDX with Colab links + +**Implementation:** +- Extract code cells as fenced code blocks +- Convert markdown cells directly to MDX +- Preserve Colab badge at top of each tutorial +- Link to hosted `.ipynb` files for interactive experience + +**Example header for converted notebooks:** +```mdx +--- +title: The Basics +--- + + +Run this tutorial interactively in [Google Colab](https://colab.research.google.com/github/NVIDIA-NeMo/DataDesigner/blob/main/docs/colab_notebooks/1-the-basics.ipynb). + +``` + +### 3. Versioning + +**Decision:** Single version initially, evaluate multi-version post-launch + +**Rationale:** +- Current MkDocs setup is single-version +- No immediate need for versioned docs +- Fern supports versioning when needed + +### 4. Hosting + +**Decision:** [Fern-hosted | Self-hosted] _(fill in)_ + +**If Fern-hosted:** +- URL: `https://datadesigner.docs.buildwithfern.com` or custom domain +- Zero infrastructure management +- Built-in CDN and SSL + +**If self-hosted:** +- Deploy to existing NVIDIA infrastructure +- Use `fern generate --docs` to produce static output +- Configure redirects on hosting platform + +### 5. Timeline + +**Decision:** [X weeks] from RFC approval + +| Milestone | Target Date | +|-----------|-------------| +| Phase 1 (Setup) complete | [DATE] | +| Phase 2-3 (Content migration) complete | [DATE] | +| Phase 4 (API reference) complete | [DATE] | +| Phase 5-6 (Polish & testing) complete | [DATE] | +| Go-live | [DATE] | +| Old docs deprecated | [DATE + 2 weeks] | + +--- + +## Helper Scripts + +The following scripts can assist with automated conversion: + +### 1. Admonition Converter + +```python +#!/usr/bin/env python3 +"""Convert MkDocs admonitions to Fern callouts.""" +import re +import sys + +ADMONITION_MAP = { + "note": "Note", + "tip": "Tip", + "info": "Info", + "warning": "Warning", + "danger": "Warning", + "question": "Info", + "example": "Info", + "abstract": "Note", + "success": "Tip", + "failure": "Warning", + "bug": "Warning", +} + +def convert_admonitions(content: str) -> str: + """Convert !!! admonitions to components.""" + pattern = r'!!! (\w+)(?: "([^"]*)")?\n((?: .*\n?)*)' + + def replace(match: re.Match) -> str: + admon_type = match.group(1).lower() + title = match.group(2) or "" + body = match.group(3) + # Remove 4-space indent from body + body = re.sub(r'^ ', '', body, flags=re.MULTILINE).strip() + fern_type = ADMONITION_MAP.get(admon_type, "Note") + if title: + return f'<{fern_type} title="{title}">\n{body}\n\n' + return f'<{fern_type}>\n{body}\n\n' + + return re.sub(pattern, replace, content) + +if __name__ == "__main__": + content = sys.stdin.read() + print(convert_admonitions(content)) +``` + +**Usage:** +```bash +cat docs/concepts/columns.md | python scripts/convert_admonitions.py > docs-fern/pages/concepts/columns.mdx +``` + +### 2. 
Tabs Converter
+
+```python
+#!/usr/bin/env python3
+"""Convert MkDocs tabs to Fern Tabs components."""
+import re
+import sys
+
+def convert_tabs(content: str) -> str:
+    """Convert === tabs to <Tabs> components."""
+    # Match tab groups
+    pattern = r'((?:=== "([^"]+)"\n((?:    .*\n?)*)\n?)+)'
+
+    def replace_group(match: re.Match) -> str:
+        group = match.group(0)
+        tabs = re.findall(r'=== "([^"]+)"\n((?:    .*\n?)*)', group)
+        result = ["<Tabs>"]
+        for title, body in tabs:
+            body = re.sub(r'^    ', '', body, flags=re.MULTILINE).strip()
+            result.append(f'  <Tab title="{title}">')
+            result.append(f'    {body}')
+            result.append('  </Tab>')
+        result.append("</Tabs>")
+        return '\n'.join(result) + '\n'
+
+    return re.sub(pattern, replace_group, content)
+
+if __name__ == "__main__":
+    content = sys.stdin.read()
+    print(convert_tabs(content))
+```
+
+### 3. Notebook to MDX Converter
+
+```python
+#!/usr/bin/env python3
+"""Convert Jupyter notebook to MDX."""
+import json
+import sys
+from pathlib import Path
+
+def notebook_to_mdx(notebook_path: str, colab_url: str) -> str:
+    """Convert a Jupyter notebook to MDX format."""
+    with open(notebook_path) as f:
+        nb = json.load(f)
+
+    lines = [
+        "---",
+        f"title: {Path(notebook_path).stem.replace('-', ' ').title()}",
+        "---",
+        "",
+        "<Note>",
+        f"Run this tutorial interactively in [Google Colab]({colab_url}).",
+        "</Note>",
+        "",
+    ]
+
+    for cell in nb.get("cells", []):
+        cell_type = cell.get("cell_type")
+        source = "".join(cell.get("source", []))
+
+        if cell_type == "markdown":
+            lines.append(source)
+            lines.append("")
+        elif cell_type == "code":
+            lines.append("```python")
+            lines.append(source)
+            lines.append("```")
+            lines.append("")
+
+    return "\n".join(lines)
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("Usage: notebook_to_mdx.py <notebook.ipynb> <colab_url>")
+        sys.exit(1)
+    print(notebook_to_mdx(sys.argv[1], sys.argv[2]))
+```
+
+**Usage:**
+```bash
+python scripts/notebook_to_mdx.py \
+    docs/colab_notebooks/1-the-basics.ipynb \
+    "https://colab.research.google.com/github/NVIDIA-NeMo/DataDesigner/blob/main/docs/colab_notebooks/1-the-basics.ipynb" \
+    > docs-fern/pages/tutorials/the-basics.mdx
+```
+
+### 4. Link Checker
+
+```bash
+#!/bin/bash
+# Check all links in Fern docs
+cd docs-fern
+
+# Internal links
+grep -roh '\[.*\]([^)]*\.mdx)' pages/ | sort | uniq
+
+# External links
+grep -roh 'https://[^)]*' pages/ | sort | uniq | while read url; do
+    if ! curl -s --head "$url" | head -1 | grep -q "200\|301\|302"; then
+        echo "BROKEN: $url"
+    fi
+done
+```
+
+### 5. Batch Conversion Script
+
+```bash
+#!/bin/bash
+# batch_convert.sh - Run all conversions
+
+set -e
+
+SCRIPTS_DIR="scripts"
+DOCS_DIR="docs"
+FERN_DIR="docs-fern/pages"
+
+# Create directory structure
+mkdir -p "$FERN_DIR"/{concepts/models,tutorials,recipes/{code-generation,qa-and-chat},plugins,api-reference}
+
+# Convert simple pages (admonitions + tabs)
+for file in index installation quick-start CONTRIBUTING; do
+    src="$DOCS_DIR/$file.md"
+    if [ -f "$src" ]; then
+        dst="$FERN_DIR/${file,,}.mdx"
+        cat "$src" | python "$SCRIPTS_DIR/convert_admonitions.py" | python "$SCRIPTS_DIR/convert_tabs.py" > "$dst"
+        echo "Converted: $src -> $dst"
+    fi
+done
+
+echo "Batch conversion complete. Manual review required."
+``` + +--- + +## References + +- [Fern Docs Getting Started](https://buildwithfern.com/learn/docs/getting-started/overview) +- [Fern Components Overview](https://buildwithfern.com/learn/docs/writing-content/components/overview) +- [Fern Configuration](https://buildwithfern.com/learn/docs/configuration/site-level-settings) +- [Current MkDocs Configuration](../mkdocs.yml) + +--- + +## Appendix: Sample Migration + +### Before (MkDocs - columns.md excerpt) + +```markdown +# Columns + +Columns are the fundamental building blocks in Data Designer. + +!!! note "The Declarative Approach" + Columns are **declarative specifications**. You describe *what* you want... + +## Column Types + +### 🎲 Sampler Columns + +Sampler columns generate data using numerical sampling... + +!!! tip "Conditional Sampling" + Samplers support **conditional parameters**... +``` + +### After (Fern - columns.mdx excerpt) + +```mdx +# Columns + +Columns are the fundamental building blocks in Data Designer. + + +Columns are **declarative specifications**. You describe *what* you want... + + +## Column Types + +### 🎲 Sampler Columns + +Sampler columns generate data using numerical sampling... + + +Samplers support **conditional parameters**... + +``` diff --git a/fern/README.md b/fern/README.md new file mode 100644 index 00000000..5ad2221c --- /dev/null +++ b/fern/README.md @@ -0,0 +1,160 @@ +# Fern Documentation Cheat Sheet + +This folder contains the Fern Docs configuration for NeMo Data Designer. + +## 📦 Installation + +```bash +# Install Fern CLI globally +npm install -g fern-api + +# Or use npx (no install needed) +npx fern-api --version +``` + +## 🔍 Local Preview + +```bash +# From the fern/ directory +cd fern/ +fern docs dev + +# Or from project root +fern docs dev --project ./fern +``` + +The docs will be available at `http://localhost:3000`. + +## 📁 Folder Structure + +``` +fern/ +├── docs.yml # Global config (title, colors, versions) +├── fern.config.json # Fern CLI config (org name) +├── versions/ +│ ├── v0.3.3.yml # Navigation for v0.3.3 +│ └── v0.4.0.yml # Navigation for v0.4.0 +├── v0.3.3/ +│ └── pages/ # MDX content for v0.3.3 +├── v0.4.0/ +│ └── pages/ # MDX content for v0.4.0 +└── assets/ # Shared images, favicons +``` + +## 🔄 Bumping the Version + +When releasing a new version (e.g., v0.5.0): + +### 1. Copy the previous version's content +```bash +cp -r fern/v0.4.0 fern/v0.5.0 +``` + +### 2. Create the navigation file +```bash +cp fern/versions/v0.4.0.yml fern/versions/v0.5.0.yml +``` + +### 3. Update paths in `versions/v0.5.0.yml` +Change all `../v0.4.0/pages/` → `../v0.5.0/pages/` + +### 4. Add the new version to `docs.yml` +```yaml +versions: + - display-name: v0.5.0 + path: versions/v0.5.0.yml + slug: v0.5.0 + - display-name: v0.4.0 + path: versions/v0.4.0.yml + slug: v0.4.0 + # ... older versions +``` + +### 5. Make your content changes +Edit files in `fern/v0.5.0/pages/` + +## ✏️ Editing Content + +### Adding a new page + +1. Create the MDX file in the appropriate version folder: + ```bash + touch fern/v0.3.3/pages/concepts/new-feature.mdx + ``` + +2. Add frontmatter: + ```mdx + --- + title: New Feature + description: Description for SEO. + --- + + Content starts here... + ``` + +3. 
Add to navigation in `versions/v0.3.3.yml`: + ```yaml + - page: New Feature + path: ../v0.3.3/pages/concepts/new-feature.mdx + ``` + +### MDX Components + +```mdx +# Callouts +Informational note +Helpful tip +Warning message +Info callout + +# Tabs + + + ```python + print("hello") + ``` + + + ```javascript + console.log("hello") + ``` + + + +# Cards + + + Description + + +``` + +## 🚀 Deploying + +```bash +# Generate static docs (for CI/CD) +fern generate --docs + +# Deploy to Fern hosting +fern docs deploy +``` + +## 🔗 Useful Links + +- [Fern Docs](https://buildwithfern.com/learn/docs) +- [MDX Components Reference](https://buildwithfern.com/learn/docs/components) +- [Versioning Guide](https://buildwithfern.com/learn/docs/configuration/versions) +- [Navigation Configuration](https://buildwithfern.com/learn/docs/configuration/navigation) + +## ⚠️ Common Issues + +### "EISDIR: illegal operation on a directory" +- Check that all `path:` values point to `.mdx` files, not directories + +### Page not showing +- Verify the page is listed in the version's navigation file +- Check the path is correct (relative to the versions/ folder) + +### Version selector not appearing +- Ensure `versions:` is defined in `docs.yml` +- Each version needs a valid `.yml` file in `versions/` diff --git a/fern/assets/favicon.png b/fern/assets/favicon.png new file mode 100644 index 00000000..11c795d3 Binary files /dev/null and b/fern/assets/favicon.png differ diff --git a/fern/assets/recipes/code_generation/text_to_python.py b/fern/assets/recipes/code_generation/text_to_python.py new file mode 100644 index 00000000..b5cb88d3 --- /dev/null +++ b/fern/assets/recipes/code_generation/text_to_python.py @@ -0,0 +1,318 @@ +from pathlib import Path + +from data_designer.essentials import ( + CategorySamplerParams, + CodeLang, + CodeValidatorParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMCodeColumnConfig, + LLMJudgeColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, + Score, + SubcategorySamplerParams, + ValidationColumnConfig, + ValidatorType, +) +from data_designer.interface.results import DatasetCreationResults + + +def build_config(model_alias: str) -> DataDesignerConfigBuilder: + config_builder = DataDesignerConfigBuilder() + + config_builder.add_column( + SamplerColumnConfig( + name="industry_sector", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Healthcare", + "Finance", + "Technology", + ], + ), + ), + ) + + config_builder.add_column( + SamplerColumnConfig( + name="topic", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="industry_sector", + values={ + "Healthcare": [ + "Electronic Health Records (EHR) Systems", + "Telemedicine Platforms", + "AI-Powered Diagnostic Tools", + ], + "Finance": [ + "Fraud Detection Software", + "Automated Trading Systems", + "Personal Finance Apps", + ], + "Technology": [ + "Cloud Computing Platforms", + "Artificial Intelligence and Machine Learning Platforms", + "DevOps and CI/CD Tools", + ], + }, + ), + ), + ) + + config_builder.add_column( + SamplerColumnConfig( + name="code_complexity", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Beginner", + "Intermediate", + "Advanced", + ], + ), + ), + ) + + config_builder.add_column( + SamplerColumnConfig( + name="code_concept", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="code_complexity", + values={ + "Beginner": [ + "Variables", + "Data Types", + "Functions", + 
"Loops", + "Classes", + ], + "Intermediate": [ + "List Comprehensions", + "Object-oriented programming", + "Lambda Functions", + "Web frameworks", + "Pandas", + ], + "Advanced": [ + "Multithreading", + "Context Managers", + "Generators", + ], + }, + ), + ), + ) + + config_builder.add_column( + SamplerColumnConfig( + name="instruction_phrase", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Write a function that", + "Create a class that", + "Implement a script", + "Can you create a function", + "Develop a module that", + ], + ), + ), + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="instruction", + model_alias=model_alias, + system_prompt=("You are an expert at generating clear and specific programming tasks."), + prompt=( + "Generate an instruction to create Python code that solves a specific problem.\n" + "Each instruction should begin with one of the following phrases: {{ instruction_phrase }}.\n\n" + "Important Guidelines:\n" + "* Industry Relevance: Ensure the instruction pertains to the {{ industry_sector }} sector and {{ topic }} topic.\n" + "* Code Complexity: Tailor the instruction to the {{ code_complexity }} level. Utilize relevant {{ code_concept }} where appropriate to match the complexity level.\n" + "* Clarity and Specificity: Make the problem statement clear and unambiguous. Provide sufficient context to understand the requirements without being overly verbose.\n" + "* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n" + ), + ) + ) + + config_builder.add_column( + LLMCodeColumnConfig( + name="code_implementation", + model_alias=model_alias, + code_lang=CodeLang.PYTHON, + system_prompt=( + "You are an expert Python programmer who writes clean, efficient, and well-documented code." + ), + prompt=( + "Write Python code for the following instruction:\n" + "Instruction: {{ instruction }}\n\n" + "Important Guidelines:\n" + "* Code Quality: Your code should be clean, complete, self-contained, and accurate.\n" + "* Code Validity: Please ensure that your Python code is executable and does not contain any errors.\n" + "* Packages: Remember to import any necessary libraries, and to use all libraries you import.\n" + "* Complexity & Concepts: The code should be written at a {{ code_complexity }} level, making use of concepts such as {{code_concept}}.\n" + ), + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="instruction", + model_alias=model_alias, + system_prompt=("You are an expert at generating clear and specific programming tasks."), + prompt=( + "Generate an instruction to create Python code that solves a specific problem.\n" + "Each instruction should begin with one of the following phrases: {{ instruction_phrase }}.\n\n" + "Important Guidelines:\n" + "* Industry Relevance: Ensure the instruction pertains to the {{ industry_sector }} sector and {{ topic }} topic.\n" + "* Code Complexity: Tailor the instruction to the {{ code_complexity }} level. Utilize relevant {{ code_concept }} where appropriate to match the complexity level.\n" + "* Clarity and Specificity: Make the problem statement clear and unambiguous. 
Provide sufficient context to understand the requirements without being overly verbose.\n" + "* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n" + ), + ) + ) + + config_builder.add_column( + LLMCodeColumnConfig( + name="code_implementation", + model_alias=model_alias, + code_lang=CodeLang.PYTHON, + system_prompt=( + "You are an expert Python programmer who writes clean, efficient, and well-documented code." + ), + prompt=( + "Write Python code for the following instruction:\n" + "Instruction: {{ instruction }}\n\n" + "Important Guidelines:\n" + "* Code Quality: Your code should be clean, complete, self-contained, and accurate.\n" + "* Code Validity: Please ensure that your Python code is executable and does not contain any errors.\n" + "* Packages: Remember to import any necessary libraries, and to use all libraries you import.\n" + "* Complexity & Concepts: The code should be written at a {{ code_complexity }} level, making use of concepts such as {{ code_concept }}.\n" + ), + ) + ) + + config_builder.add_column( + LLMJudgeColumnConfig( + name="code_judge_result", + model_alias=model_alias, + prompt=TEXT_TO_PYTHON_JUDGE_TEMPLATE, + scores=python_scoring, + ) + ) + + config_builder.add_column( + ValidationColumnConfig( + name="code_validity_result", + validator_type=ValidatorType.CODE, + target_columns=["code_implementation"], + validator_params=CodeValidatorParams( + code_lang=CodeLang.PYTHON, + ), + batch_size=100, + ) + ) + + return config_builder + + +def create_dataset( + config_builder: DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +TEXT_TO_PYTHON_JUDGE_TEMPLATE = """\ +You are an expert in Python programming, with specialized knowledge in software engineering, data science, and algorithmic problem-solving. + +You think about potential flaws and errors in the code. You are a tough critic, but a fair one. + +Take a deep breath and use the Python Code Quality Rubric below to score the **Generated Python Code** based on the INSTRUCTIONS. 
+ +#### INSTRUCTIONS +The Generated Python Code should be a valid response to the Natural Language Prompt below + +Natural Language Prompt: +{{ instruction }} + +Generated Python Code +{{ code_implementation }} +""" + + +python_scoring = [ + Score( + name="Relevance", + description="Adherence to INSTRUCTIONS and CONTEXT", + options={ + 4: "Perfectly meets all specified requirements.", + 3: "Meets most requirements with minor deviations.", + 2: "Moderate deviation from the instructions.", + 1: "Significant deviations from the instructions.", + 0: "Does not adhere to the instructions.", + }, + ), + Score( + name="Pythonic", + description="Pythonic Code and Best Practices (Does the code follow Python conventions and best practices?)", + options={ + 4: "The code exemplifies Pythonic principles, making excellent use of Python-specific constructs, standard library modules and programming idioms; follows all relevant PEPs.", + 3: "The code closely follows Python conventions and adheres to many best practices; good use of Python-specific constructs, standard library modules and programming idioms.", + 2: "The code generally follows Python conventions but has room for better alignment with Pythonic practices.", + 1: "The code loosely follows Python conventions, with several deviations from best practices.", + 0: "The code does not follow Python conventions or best practices, using non-Pythonic approaches.", + }, + ), + Score( + name="Readability", + description="Readability and Maintainability (Is the Python code easy to understand and maintain?)", + options={ + 4: ( + "The code is excellently formatted, follows PEP 8 guidelines, is elegantly concise and clear, uses meaningful variable names, " + "ensuring high readability and ease of maintenance; organizes complex logic well. Docstrings are given in a Google Docstring format." 
+ ), + 3: "The code is well-formatted in the sense of code-as-documentation, making it relatively easy to understand and maintain; uses descriptive names and organizes logic clearly.", + 2: "The code is somewhat readable with basic formatting and some comments, but improvements are needed; needs better use of descriptive names and organization.", + 1: "The code has minimal formatting, making it hard to understand; lacks meaningful names and organization.", + 0: "The code is unreadable, with no attempt at formatting or description.", + }, + ), + Score( + name="Efficiency", + description="Efficiency and Performance (Is the code optimized for performance?)", + options={ + 4: "The solution is highly efficient, using appropriate data structures and algorithms; avoids unnecessary computations and optimizes for both time and space complexity.", + 3: "The solution is efficient, with good use of Python's built-in functions and libraries; minor areas for optimization.", + 2: "The solution is moderately efficient, but misses some opportunities for optimization; uses some inefficient patterns.", + 1: "The solution shows poor efficiency, with notable performance issues; lacks effective optimization techniques.", + 0: "The solution is highly inefficient; overlooks fundamental optimization practices, resulting in significant performance issues.", + }, + ), +] + + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser() + parser.add_argument("--model-alias", type=str, default="openai-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder = build_config(model_alias=args.model_alias) + results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + + results.load_analysis().to_report() diff --git a/fern/assets/recipes/code_generation/text_to_sql.py b/fern/assets/recipes/code_generation/text_to_sql.py new file mode 100644 index 00000000..a0fbf6e5 --- /dev/null +++ b/fern/assets/recipes/code_generation/text_to_sql.py @@ -0,0 +1,323 @@ +from pathlib import Path + +from data_designer.essentials import ( + CategorySamplerParams, + CodeLang, + CodeValidatorParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMCodeColumnConfig, + LLMJudgeColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, + Score, + SubcategorySamplerParams, + ValidationColumnConfig, + ValidatorType, +) +from data_designer.interface.results import DatasetCreationResults + + +def build_config(model_alias: str) -> DataDesignerConfigBuilder: + config_builder = DataDesignerConfigBuilder() + + config_builder.add_column( + SamplerColumnConfig( + name="industry_sector", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=["Healthcare", "Finance", "Technology"], + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="topic", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="industry_sector", + values={ + "Healthcare": [ + "Electronic Health Records (EHR) Systems", + "Telemedicine Platforms", + "AI-Powered Diagnostic Tools", + ], + "Finance": [ + "Fraud Detection Software", + "Automated Trading Systems", + "Personal Finance Apps", + ], + "Technology": [ + "Cloud Computing Platforms", + "Artificial Intelligence and Machine Learning Platforms", + "DevOps and CI/CD Tools", 
+ ], + }, + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="sql_complexity", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=["Beginner", "Intermediate", "Advanced"], + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="sql_concept", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="sql_complexity", + values={ + "Beginner": [ + "Basic SELECT Statements", + "WHERE Clauses", + "Basic JOINs", + "INSERT, UPDATE, DELETE", + ], + "Intermediate": [ + "Aggregation Functions", + "Multiple JOINs", + "Subqueries", + "Views", + ], + "Advanced": [ + "Window Functions", + "Common Table Expressions (CTEs)", + "Stored Procedures", + "Query Optimization", + ], + }, + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="sql_task_type", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Data Retrieval", + "Data Manipulation", + "Analytics and Reporting", + "Data Transformation", + ], + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="instruction_phrase", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Write an SQL query that", + "Create an SQL statement to", + "Develop an SQL query to", + "Can you write SQL that", + "Formulate an SQL query that", + ], + ), + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="sql_prompt", + model_alias=model_alias, + system_prompt="You are an expert at generating clear and specific SQL tasks.", + prompt=SQL_PROMPT_TEXT, + ) + ) + + config_builder.add_column( + LLMCodeColumnConfig( + name="sql_context", + model_alias=model_alias, + code_lang=CodeLang.SQL_ANSI, + system_prompt=( + "You are an expert SQL database designer who creates clean, efficient, and " + "well-structured database schemas." + ), + prompt=SQL_CONTEXT_TEXT, + ) + ) + + config_builder.add_column( + LLMCodeColumnConfig( + name="sql", + model_alias=model_alias, + code_lang=CodeLang.SQL_ANSI, + system_prompt="You are an expert SQL programmer who writes clean, efficient, and well-structured queries.", + prompt=SQL_CODE_TEXT, + ) + ) + + config_builder.add_column( + ValidationColumnConfig( + name="code_validity_result", + validator_type=ValidatorType.CODE, + target_columns=["sql"], + validator_params=CodeValidatorParams( + code_lang=CodeLang.SQL_ANSI, + ), + batch_size=100, + ) + ) + + config_builder.add_column( + LLMJudgeColumnConfig( + name="code_judge_result", + model_alias=model_alias, + prompt=TEXT_TO_SQL_JUDGE_TEMPLATE, + scores=sql_scoring, + ) + ) + + return config_builder + + +def create_dataset( + config_builder: DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +SQL_PROMPT_TEXT = ( + "Generate an instruction to create SQL code that solves a specific problem.\n" + "Each instruction should begin with one of the following phrases: {{instruction_phrase}}.\n\n" + "Important Guidelines:\n" + "* Industry Relevance: Ensure the instruction pertains to the {{industry_sector}} sector and {{topic}} topic.\n" + "* SQL Complexity: Tailor the instruction to the {{sql_complexity}} level. 
Utilize relevant {{sql_concept}} " + "where appropriate to match the complexity level.\n" + "* Task Type: The instruction should involve a {{sql_task_type}} task.\n" + "* Clarity and Specificity: Make the problem statement clear and unambiguous. Provide sufficient context to " + "understand the requirements without being overly verbose.\n" + "* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n" +) + +SQL_CONTEXT_TEXT = ( + "Generate the SQL for creating database tables that would be relevant for the following instruction:\n" + "Instruction: {{sql_prompt}}\n\n" + "Important Guidelines:\n" + "* Relevance: Ensure all tables are directly related to the {{industry_sector}} sector and {{topic}} topic.\n" + "* Completeness: Include all essential columns with appropriate data types, primary/foreign keys, and necessary constraints.\n" + "* Realism: Use realistic table structures typical for the specified industry.\n" + "* Executable SQL: Provide complete CREATE TABLE statements that can be run without modification.\n" + "* Consistency: Use consistent naming conventions (e.g., snake_case for table and column names).\n" + "* Sample Data: Include INSERT statements with sample data that makes sense for the tables (at least 5-10 rows per table)." +) + +SQL_CODE_TEXT = ( + "Write SQL code for the following instruction based on the provided database context:\n" + "Instruction: {{sql_prompt}}\n\n" + "Database Context:\n" + "{{sql_context}}\n\n" + "Important Guidelines:\n" + "* Code Quality: Your SQL should be clean, complete, self-contained and accurate.\n" + "* Code Validity: Please ensure that your SQL code is executable and does not contain any errors.\n" + "* Context: Base your query on the provided database context. Only reference tables and columns that " + "exist in the context.\n" + "* Complexity & Concepts: The SQL should be written at a {{sql_complexity}} level, making use of " + "concepts such as {{sql_concept}}.\n" + "* Task Type: Ensure your solution implements the appropriate {{sql_task_type}} operation.\n" + "* Comments: Include brief comments explaining the key parts of your query.\n" +) + + +TEXT_TO_SQL_JUDGE_TEMPLATE = """\ +You are an expert in SQL with deep knowledge of relational modeling, query semantics, +and performance tuning across common dialects (e.g., PostgreSQL, MySQL, SQLite, SQL Server). +You think critically about correctness, readability, and efficiency. + +Use the SQL Query Quality Rubric below to score the **Generated SQL Query** based on the INSTRUCTIONS. 
+ +#### INSTRUCTIONS +The Generated SQL Query should be a valid response to the Natural Language Prompt below + +Natural Language Prompt: +{{ sql_prompt }} + +Database Context: +{{ sql_context }} + +Generated SQL Query +{{ sql }} +""" + + +sql_scoring = [ + Score( + name="Relevance", + description="Adherence to INSTRUCTIONS and CONTEXT", + options={ + 4: "Perfectly meets all specified requirements.", + 3: "Meets most requirements with minor deviations.", + 2: "Moderate deviation from the instructions.", + 1: "Significant deviations from the instructions.", + 0: "Does not adhere to the instructions.", + }, + ), + Score( + name="SQL Correctness", + description="Syntax and semantic correctness; returns the intended result", + options={ + 4: "Valid SQL with correct joins, filters, grouping/aggregation, and NULL handling; produces the intended result set under the stated/implicit dialect.", + 3: "Generally correct with minor issues (e.g., edge-case NULLs, minor grouping detail) but still likely yields the intended result.", + 2: "Partially correct; noticeable semantic mistakes (joins, grouping, filters) that may change results or fail in edge cases.", + 1: "Largely incorrect; major semantic or syntactic errors likely causing failure or wrong results.", + 0: "Invalid SQL or unrelated to the task; will not run or cannot produce a meaningful result.", + }, + ), + Score( + name="Readability", + description="Formatting, clarity, and maintainability", + options={ + 4: "Cleanly formatted (keywords/clauses consistently styled), clear structure (CTEs/subqueries where helpful), meaningful table/column aliases, and concise.", + 3: "Generally readable with consistent formatting and understandable aliases; could be organized slightly better.", + 2: "Somewhat readable but inconsistent formatting or confusing aliasing; structure is harder to follow.", + 1: "Poorly formatted and hard to read; unclear structure and aliasing.", + 0: "Unreadable or chaotic; no meaningful structure or styling.", + }, + ), + Score( + name="Efficiency", + description="Query performance best practices", + options={ + 4: "Uses sargable predicates, appropriate joins, selective filters early, avoids SELECT *, unnecessary DISTINCT, and wasteful subqueries; likely to use indexes effectively.", + 3: "Mostly efficient; minor opportunities for improvement (e.g., simplifying expressions, reducing data early).", + 2: "Moderate inefficiencies (e.g., non-sargable filters, unnecessary nested subqueries, broad SELECT *).", + 1: "Notably inefficient patterns likely causing large scans or poor plans.", + 0: "Highly inefficient; ignores basic best practices and likely to perform very poorly.", + }, + ), +] + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser() + parser.add_argument("--model-alias", type=str, default="openai-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder = build_config(model_alias=args.model_alias) + results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + + results.load_analysis().to_report() diff --git a/fern/assets/recipes/qa_and_chat/multi_turn_chat.py b/fern/assets/recipes/qa_and_chat/multi_turn_chat.py new file mode 100644 index 00000000..b4debed7 --- /dev/null +++ b/fern/assets/recipes/qa_and_chat/multi_turn_chat.py @@ -0,0 
+1,204 @@ +from pathlib import Path +from typing import Literal + +from pydantic import BaseModel, Field + +from data_designer.essentials import ( + CategorySamplerParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMJudgeColumnConfig, + LLMStructuredColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, + Score, + SubcategorySamplerParams, +) +from data_designer.interface.results import DatasetCreationResults + + +def build_config(model_alias: str) -> DataDesignerConfigBuilder: + config_builder = DataDesignerConfigBuilder() + + config_builder.add_column( + SamplerColumnConfig( + name="domain", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=["Tech Support", "Personal Finances", "Educational Guidance"]), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="topic", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="domain", + values={ + "Tech Support": [ + "Troubleshooting a Laptop", + "Setting Up a Home Wi-Fi Network", + "Installing Software Updates", + ], + "Personal Finances": [ + "Budgeting Advice", + "Understanding Taxes", + "Investment Strategies", + ], + "Educational Guidance": [ + "Choosing a College Major", + "Effective Studying Techniques", + "Learning a New Language", + ], + }, + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="complexity", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=["Basic", "Intermediate", "Advanced"]), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="conversation_length", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=[2, 4, 6, 8]), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="user_mood", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=["happy", "silly", "sarcastic", "combative", "disappointed", "toxic"]), + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="assistant_system_prompt", + prompt=( + "Write a reasonable system prompt for a helpful AI assistant with expertise in " + "{{domain}} and {{topic}}. The AI assistant must not engage in harmful behaviors." + ), + model_alias=model_alias, + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="user_task", + prompt="Define a simple task related to {{topic}} of {{complexity}} complexity for the user.", + model_alias=model_alias, + ) + ) + + config_builder.add_column( + LLMStructuredColumnConfig( + name="conversation", + prompt=( + "\n{{user_task}}\n\n\n" + "{{assistant_system_prompt}}\n\n" + "Generate a conversation between a user and an AI assistant with about .\n" + "User is asking the assistant for advice and is in a {{user_mood}} mood.\n" + "The conversation must be {{conversation_length}} messages in length.\n" + "The conversation must come to a natural end in {{conversation_length}} messages and if the assistant is unable " + "to solve the user's needs by then, they should offer to continue the conversation later or redirect them to additional resources.\n" + "The conversation must be realistic and natural, incorporating details from both the user's and assistant's profiles.\n" + "The AI assistant role has the system prompt defined in . 
All of its responses must conform to :\n" + ), + output_format=ChatConversation, + model_alias=model_alias, + ) + ) + + config_builder.add_column( + LLMJudgeColumnConfig( + name="toxicity_evaluation", + prompt=TOXICITY_PROMPT_TEMPLATE, + scores=[toxicity_score], + model_alias=model_alias, + ) + ) + + return config_builder + + +def create_dataset( + config_builder: DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +toxicity_score = Score( + name="Toxicity", + description="Evaluation of harmful, harassing, or hateful language in conversations", + options={ + "4": "No toxicity detected; conversation is respectful and appropriate.", + "3": "Mild toxicity; slightly rude or sarcastic but not hateful or harmful.", + "2": "Moderate toxicity; contains disrespectful or harassing language.", + "1": "Severe toxicity; contains overt hate, harassment, or harmful content.", + "0": "Extreme toxicity; contains dangerous, threatening, or abusive content.", + }, +) + +TOXICITY_PROMPT_TEMPLATE = ( + "You are an expert in content moderation and digital communication ethics. " + "You are evaluating the toxicity level in user messages.\n" + "Take a deep breath and carefully analyze the conversation provided. \n" + "Use the rubric to score the level of toxicity in the USER'S messages only.\n" + "Consider:\n" + "- Focus primarily on harassment, hate, or harmful language\n" + "- Evaluate the full context of the conversation\n" + "- Be objective and consistent in your assessment\n" + "- If any single message exhibits toxicity, it should influence the final score\n" + "## CONVERSATION\n" + "{{ conversation }}" +) + + +class Message(BaseModel): + """A single message turn in the conversation.""" + + role: Literal["user", "assistant"] = Field(..., description="Which role is writing the message.") + content: str = Field(..., description="Message contents.") + + +class ChatConversation(BaseModel): + """A chat conversation between a specific user and an AI assistant. + * All conversations are initiated by the user role. + * The assistant role always responds to the user message. + * Turns alternate between user and assistant roles. + * The last message is always from the assistant role. + * Message content can be long or short. + * All assistant messages are faithful responses and must be answered fully. 
+ """ + + conversation: list[Message] = Field(..., description="List of all messages in the conversation.") + + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser() + parser.add_argument("--model-alias", type=str, default="openai-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder = build_config(model_alias=args.model_alias) + results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + + results.load_analysis().to_report() diff --git a/fern/assets/recipes/qa_and_chat/product_info_qa.py b/fern/assets/recipes/qa_and_chat/product_info_qa.py new file mode 100644 index 00000000..59a0110f --- /dev/null +++ b/fern/assets/recipes/qa_and_chat/product_info_qa.py @@ -0,0 +1,224 @@ +import string +from pathlib import Path + +from pydantic import BaseModel, Field + +from data_designer.essentials import ( + BernoulliSamplerParams, + CategorySamplerParams, + DataDesigner, + DataDesignerConfigBuilder, + ExpressionColumnConfig, + LLMJudgeColumnConfig, + LLMStructuredColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, + Score, + UniformSamplerParams, +) +from data_designer.interface.results import DatasetCreationResults + + +def build_config(model_alias: str) -> DataDesignerConfigBuilder: + config_builder = DataDesignerConfigBuilder() + config_builder.add_column( + SamplerColumnConfig( + name="category", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Electronics", + "Clothing", + "Home Appliances", + "Groceries", + "Toiletries", + "Sports Equipment", + "Toys", + "Books", + "Pet Supplies", + "Tools & Home Improvement", + "Beauty", + "Health & Wellness", + "Outdoor Gear", + "Automotive", + "Jewelry", + "Watches", + "Office Supplies", + "Gifts", + "Arts & Crafts", + "Baby & Kids", + "Music", + "Video Games", + "Movies", + "Software", + "Tech Devices", + ] + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="price_tens_of_dollars", + sampler_type=SamplerType.UNIFORM, + params=UniformSamplerParams(low=1, high=200), + ) + ) + + config_builder.add_column( + ExpressionColumnConfig( + name="product_price", + expr="{{ (price_tens_of_dollars * 10) - 0.01 | round(2) }}", + dtype="float", + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="first_letter", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=list(string.ascii_uppercase)), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="is_hallucination", + sampler_type=SamplerType.BERNOULLI, + params=BernoulliSamplerParams(p=0.5), + ) + ) + + config_builder.add_column( + LLMStructuredColumnConfig( + name="product_info", + model_alias=model_alias, + prompt=( + "Generate a realistic product description for a product in the {{ category }} " + "category that costs {{ product_price }}.\n" + "The name of the product MUST start with the letter {{ first_letter }}.\n" + ), + output_format=ProductInfo, + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="question", + model_alias=model_alias, + prompt=("Ask a question about the following product:\n\n {{ product_info }}"), + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="answer", + model_alias=model_alias, + prompt=( + "{%- if is_hallucination == 0 -%}\n" + "\n" + 
"{{ product_info }}\n" + "\n" + "{%- endif -%}\n" + "User Question: {{ question }}\n" + "Directly and succinctly answer the user's question.\n" + "{%- if is_hallucination == 1 -%}\n" + "Make up whatever information you need to in order to answer the user's request.\n" + "{%- endif -%}" + ), + ) + ) + + # Evaluate answer quality + config_builder.add_column( + LLMJudgeColumnConfig( + name="llm_answer_metrics", + model_alias=model_alias, + prompt=( + "\n" + "{{ product_info }}\n" + "\n" + "User Question: {{question }}\n" + "AI Assistant Answer: {{ answer }}\n" + "Judge the AI assistant's response to the user's question about the product described in ." + ), + scores=answer_quality_scores, + ) + ) + + config_builder.add_column( + ExpressionColumnConfig( + name="completeness_result", + expr="{{ llm_answer_metrics.Completeness.score }}", + ) + ) + + config_builder.add_column( + ExpressionColumnConfig( + name="accuracy_result", + expr="{{ llm_answer_metrics.Accuracy.score }}", + ) + ) + + return config_builder + + +def create_dataset( + config_builder: DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +class ProductInfo(BaseModel): + product_name: str = Field(..., description="A realistic product name for the market.") + key_features: list[str] = Field(..., min_length=1, max_length=3, description="Key product features.") + description: str = Field( + ..., + description="A short, engaging description of what the product does, highlighting a unique but believable feature.", + ) + price_usd: float = Field(..., description="The price of the product", ge=10, le=1000, decimal_places=2) + + +completeness_score = Score( + name="Completeness", + description="Evaluation of AI assistant's thoroughness in addressing all aspects of the user's query.", + options={ + "Complete": "The response thoroughly covers all key points requested in the question, providing sufficient detail to satisfy the user's information needs.", + "PartiallyComplete": "The response addresses the core question but omits certain important details or fails to elaborate on relevant aspects that were requested.", + "Incomplete": "The response significantly lacks necessary information, missing major components of what was asked and leaving the query largely unanswered.", + }, +) + +accuracy_score = Score( + name="Accuracy", + description="Evaluation of how factually correct the AI assistant's response is relative to the product information.", + options={ + "Accurate": "The information provided aligns perfectly with the product specifications without introducing any misleading or incorrect details.", + "PartiallyAccurate": "While some information is correctly stated, the response contains minor factual errors or potentially misleading statements about the product.", + "Inaccurate": "The response presents significantly wrong information about the product, with claims that contradict the actual product details.", + }, +) + +answer_quality_scores = [completeness_score, accuracy_score] + + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser() + parser.add_argument("--model-alias", type=str, default="openai-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder = 
build_config(model_alias=args.model_alias) + results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + + results.load_analysis().to_report() diff --git a/fern/docs.yml b/fern/docs.yml new file mode 100644 index 00000000..fedb5508 --- /dev/null +++ b/fern/docs.yml @@ -0,0 +1,27 @@ +instances: + - url: https://nemo-datadesigner.docs.buildwithfern.com + +title: NeMo Data Designer + +versions: + - display-name: v0.3.3 + path: versions/v0.3.3.yml + slug: v0.3.3 + +colors: + accent-primary: + dark: "#76B900" + light: "#4a7300" + background: + dark: "#1a1a1a" + light: "#ffffff" + +logo: + dark: assets/favicon.png + light: assets/favicon.png + +favicon: assets/favicon.png + +navbar-links: + - type: github + value: https://github.com/NVIDIA-NeMo/DataDesigner diff --git a/fern/fern.config.json b/fern/fern.config.json new file mode 100644 index 00000000..9f0a3e5d --- /dev/null +++ b/fern/fern.config.json @@ -0,0 +1,4 @@ +{ + "organization": "nvidia", + "version": "3.40.1" +} \ No newline at end of file diff --git a/fern/v0.3.3/pages/api-reference/analysis.mdx b/fern/v0.3.3/pages/api-reference/analysis.mdx new file mode 100644 index 00000000..a912d2b9 --- /dev/null +++ b/fern/v0.3.3/pages/api-reference/analysis.mdx @@ -0,0 +1,160 @@ +--- +title: Analysis +description: API reference for dataset analysis and profiling. +--- + +The `analysis` modules provide tools for profiling and analyzing generated datasets. It includes statistics tracking, column profiling, and reporting capabilities. + +## Column Statistics + +Column statistics are automatically computed for every column after generation. They provide basic metrics specific to the column type. For example, LLM columns track token usage statistics, sampler columns track distribution information, and validation columns track validation success rates. 
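+
+As a quick sketch of how these statistics can be consumed, the loop below branches on the metrics each statistics object exposes. It assumes a `results` object returned by `DataDesigner.create()` and relies only on the fields shown on this page.
+
+```python
+# Load the analysis produced alongside the generated dataset.
+analysis = results.load_analysis()
+
+# Walk the per-column statistics and print a type-appropriate summary.
+for column_name, stats in analysis.column_statistics.items():
+    if hasattr(stats, "pass_rate"):  # validation columns
+        print(f"{column_name}: pass rate {stats.pass_rate:.1f}%")
+    elif hasattr(stats, "total_output_tokens"):  # LLM columns
+        print(f"{column_name}: {stats.total_output_tokens} output tokens")
+    elif hasattr(stats, "unique_values"):  # sampler and expression columns
+        print(f"{column_name}: {stats.unique_values} unique values")
+```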
+ +### LLMColumnStatistics + +```python +class LLMColumnStatistics(BaseModel): + """Statistics for LLM-generated columns.""" + + total_input_tokens: int # Total prompt tokens across all generations + total_output_tokens: int # Total completion tokens + avg_input_tokens: float # Average prompt tokens per generation + avg_output_tokens: float # Average completion tokens per generation + generation_time_seconds: float # Total generation time + generations_per_second: float # Generation throughput +``` + +### SamplerColumnStatistics + +```python +class SamplerColumnStatistics(BaseModel): + """Statistics for sampler columns.""" + + unique_values: int # Number of unique values generated + value_counts: dict[str, int] # Counts per value (for categorical) + min_value: float | None # Minimum value (for numerical) + max_value: float | None # Maximum value (for numerical) + mean_value: float | None # Mean value (for numerical) + std_value: float | None # Standard deviation (for numerical) +``` + +### ValidationColumnStatistics + +```python +class ValidationColumnStatistics(BaseModel): + """Statistics for validation columns.""" + + total_validated: int # Total records validated + valid_count: int # Number of valid records + invalid_count: int # Number of invalid records + null_count: int # Number of null results + pass_rate: float # Percentage of valid records +``` + +### ExpressionColumnStatistics + +```python +class ExpressionColumnStatistics(BaseModel): + """Statistics for expression columns.""" + + unique_values: int # Number of unique values + null_count: int # Number of null results + evaluation_time_seconds: float # Time to evaluate expressions +``` + +## Column Profilers + +Column profilers are optional analysis tools that provide deeper insights into specific column types. Currently, the only column profiler available is the Judge Score Profiler. + +### JudgeScoreProfilerResults + +```python +class JudgeScoreProfilerResults(BaseModel): + """Profiling results for LLM judge columns.""" + + score_name: str # Name of the score dimension + score_distribution: dict[str, int] # Distribution of scores + avg_score: float | None # Average score (for numeric scores) + score_counts: dict[str | int, int] # Counts per score value +``` + +## Dataset Profiler + +The `DatasetProfilerResults` class contains complete profiling results for a generated dataset. It aggregates column-level statistics, metadata, and profiler results. + +### DatasetProfilerResults + +```python +class DatasetProfilerResults(BaseModel): + """Complete profiling results for a generated dataset.""" + + dataset_name: str # Name of the dataset + total_records: int # Total records generated + generation_time_seconds: float # Total generation time + column_statistics: dict[str, ColumnStatistics] # Per-column stats + column_profiler_results: dict[str, list[ProfilerResults]] # Profiler results + + def to_report( + self, + output_format: Literal["console", "html", "svg"] = "console", + ) -> None: + """Generate a formatted analysis report. + + Args: + output_format: Output format for the report. + """ + ... + + def get_column_statistics( + self, + column_name: str, + ) -> ColumnStatistics: + """Get statistics for a specific column. + + Args: + column_name: Name of the column. + + Returns: + Column statistics object. + """ + ... + + def filter_by_column_type( + self, + column_type: str, + ) -> dict[str, ColumnStatistics]: + """Filter statistics by column type. + + Args: + column_type: Type of columns to filter (e.g., "llm-text"). 
+ + Returns: + Dictionary of column statistics for matching columns. + """ + ... +``` + +### Example: Accessing Analysis Results + +```python +from data_designer.essentials import DataDesigner, DataDesignerConfigBuilder + +# Generate a dataset +data_designer = DataDesigner() +builder = DataDesignerConfigBuilder() +# ... add columns ... + +results = data_designer.create(builder, num_records=100) + +# Load and display analysis +analysis = results.load_analysis() +analysis.to_report() + +# Access specific column statistics +llm_stats = analysis.get_column_statistics("generated_text") +print(f"Average output tokens: {llm_stats.avg_output_tokens}") + +# Filter by column type +all_llm_stats = analysis.filter_by_column_type("llm-text") +for col_name, stats in all_llm_stats.items(): + print(f"{col_name}: {stats.generations_per_second:.2f} gen/sec") +``` diff --git a/fern/v0.3.3/pages/api-reference/column-configs.mdx b/fern/v0.3.3/pages/api-reference/column-configs.mdx new file mode 100644 index 00000000..a692166f --- /dev/null +++ b/fern/v0.3.3/pages/api-reference/column-configs.mdx @@ -0,0 +1,183 @@ +--- +title: Column Configurations +description: API reference for column configuration objects. +--- + +The `column_configs` module defines configuration objects for all Data Designer column types. Each configuration inherits from `SingleColumnConfig`, which provides shared arguments like the column `name`, whether to `drop` the column after generation, and the `column_type`. + + +The `column_type` argument is used to identify column types when deserializing the Data Designer Config from JSON/YAML. It acts as the discriminator in a [discriminated union](https://docs.pydantic.dev/latest/concepts/unions/#discriminated-unions), allowing Pydantic to automatically determine which column configuration class to instantiate. + + +## SingleColumnConfig (Base Class) + +```python +class SingleColumnConfig(BaseModel): + """Base configuration for all column types.""" + + name: str # Column name (unique identifier) + drop: bool = False # Whether to drop column from final output + column_type: str # Discriminator field for column type + + @property + def required_columns(self) -> list[str]: + """Columns that must be generated before this one.""" + ... + + @property + def side_effect_columns(self) -> list[str]: + """Columns created as side effects (e.g., reasoning traces).""" + ... 
+``` + +## SamplerColumnConfig + +```python +class SamplerColumnConfig(SingleColumnConfig): + """Configuration for sampler-based columns.""" + + column_type: Literal["sampler"] = "sampler" + sampler_type: SamplerType # Type of sampler to use + params: SamplerParams | None = None # Sampler-specific parameters + conditional_params: dict[str, SamplerParams] | None = None # Condition-based params + convert_to: Literal["int", "float", "str"] | None = None # Type conversion +``` + +## LLMTextColumnConfig + +```python +class LLMTextColumnConfig(SingleColumnConfig): + """Configuration for LLM text generation columns.""" + + column_type: Literal["llm-text"] = "llm-text" + model_alias: str # Reference to model configuration + prompt: str # Jinja2 template for the prompt + system_prompt: str | None = None # Optional system prompt + multi_modal_context: list[ImageContext] | None = None # Image inputs +``` + +## LLMCodeColumnConfig + +```python +class LLMCodeColumnConfig(SingleColumnConfig): + """Configuration for LLM code generation columns.""" + + column_type: Literal["llm-code"] = "llm-code" + model_alias: str # Reference to model configuration + prompt: str # Jinja2 template for the prompt + code_lang: CodeLang # Target programming language + system_prompt: str | None = None # Optional system prompt +``` + +## LLMStructuredColumnConfig + +```python +class LLMStructuredColumnConfig(SingleColumnConfig): + """Configuration for LLM structured output columns.""" + + column_type: Literal["llm-structured"] = "llm-structured" + model_alias: str # Reference to model configuration + prompt: str # Jinja2 template for the prompt + output_format: type[BaseModel] | dict # Pydantic model or JSON schema + system_prompt: str | None = None # Optional system prompt +``` + +## LLMJudgeColumnConfig + +```python +class LLMJudgeColumnConfig(SingleColumnConfig): + """Configuration for LLM judge/scoring columns.""" + + column_type: Literal["llm-judge"] = "llm-judge" + model_alias: str # Reference to model configuration + prompt: str # Jinja2 template for the judge prompt + scores: list[Score] # Scoring rubrics + system_prompt: str | None = None # Optional system prompt + +class Score(BaseModel): + """Scoring rubric definition.""" + + name: str # Score dimension name + description: str # Description of what's being evaluated + options: dict[str | int, str] # Score options with descriptions +``` + +## LLMEmbeddingColumnConfig + +```python +class LLMEmbeddingColumnConfig(SingleColumnConfig): + """Configuration for embedding generation columns.""" + + column_type: Literal["llm-embedding"] = "llm-embedding" + model_alias: str # Reference to model configuration + target_column: str # Column containing text to embed +``` + +## ExpressionColumnConfig + +```python +class ExpressionColumnConfig(SingleColumnConfig): + """Configuration for Jinja2 expression columns.""" + + column_type: Literal["expression"] = "expression" + expr: str # Jinja2 expression + dtype: Literal["str", "int", "float", "bool"] | None = None # Output type +``` + +## ValidationColumnConfig + +```python +class ValidationColumnConfig(SingleColumnConfig): + """Configuration for validation columns.""" + + column_type: Literal["validation"] = "validation" + validator_type: ValidatorType # Type of validator + target_columns: list[str] # Columns to validate + validator_params: ValidatorParams # Validator-specific parameters + batch_size: int = 10 # Number of records per validation batch +``` + +## SeedDatasetColumnConfig + +```python +class 
SeedDatasetColumnConfig(SingleColumnConfig): + """Configuration for seed dataset columns.""" + + column_type: Literal["seed-dataset"] = "seed-dataset" + source_column: str # Column name in the seed dataset +``` + +## CodeLang Enum + +```python +class CodeLang(str, Enum): + """Supported programming languages for code generation.""" + + PYTHON = "python" + JAVASCRIPT = "javascript" + TYPESCRIPT = "typescript" + JAVA = "java" + KOTLIN = "kotlin" + GO = "go" + RUST = "rust" + RUBY = "ruby" + SCALA = "scala" + SWIFT = "swift" + SQL_ANSI = "sql_ansi" + SQL_POSTGRES = "sql_postgres" + SQL_MYSQL = "sql_mysql" + SQL_SQLITE = "sql_sqlite" + SQL_TSQL = "sql_tsql" + SQL_BIGQUERY = "sql_bigquery" +``` + +## ValidatorType Enum + +```python +class ValidatorType(str, Enum): + """Supported validator types.""" + + CODE = "code" + LOCAL_CALLABLE = "local_callable" + REMOTE = "remote" +``` diff --git a/fern/v0.3.3/pages/api-reference/config-builder.mdx b/fern/v0.3.3/pages/api-reference/config-builder.mdx new file mode 100644 index 00000000..a301e35b --- /dev/null +++ b/fern/v0.3.3/pages/api-reference/config-builder.mdx @@ -0,0 +1,170 @@ +--- +title: Data Designer's Config Builder +description: API reference for the DataDesignerConfigBuilder. +--- + +The `config_builder` module provides a high-level interface for constructing Data Designer configurations through the `DataDesignerConfigBuilder` class, enabling programmatic creation of `DataDesignerConfig` objects by incrementally adding column configurations, constraints, processors, and profilers. + +You can use the builder to create Data Designer configurations from scratch or from existing configurations stored in YAML/JSON files via `from_config()`. The builder includes validation capabilities to catch configuration errors early and can work with seed datasets from local sources or external datastores. Once configured, use `build()` to generate the final configuration object or `write_config()` to serialize it to disk. + + +`DataDesignerConfigBuilder` requires a list of model configurations at initialization. This tells the builder which model aliases can be referenced by LLM-generated columns (such as `LLMTextColumnConfig`, `LLMCodeColumnConfig`, `LLMStructuredColumnConfig`, and `LLMJudgeColumnConfig`). Each model configuration specifies the model alias, model provider, model ID, and inference parameters that will be used during data generation. + + +## DataDesignerConfigBuilder + +```python +class DataDesignerConfigBuilder: + """Builder for constructing Data Designer configurations.""" + + def __init__( + self, + model_configs: list[ModelConfig] | None = None, + ) -> None: + """Initialize the config builder. + + Args: + model_configs: List of model configurations. If None, loads defaults. + """ + ... + + @classmethod + def from_config( + cls, + config_path: str | Path, + model_configs: list[ModelConfig] | None = None, + ) -> "DataDesignerConfigBuilder": + """Create a builder from an existing configuration file. + + Args: + config_path: Path to YAML/JSON configuration file. + model_configs: Optional model configurations (overrides config file). + + Returns: + Configured DataDesignerConfigBuilder instance. + """ + ... + + def add_column( + self, + config: SingleColumnConfig | None = None, + **kwargs: Any, + ) -> "DataDesignerConfigBuilder": + """Add a column configuration to the builder. + + Args: + config: Column configuration object, OR + **kwargs: Keyword arguments to construct a column config. + + Returns: + Self for method chaining. + """ + ... 
+ + def add_model_config( + self, + model_config: ModelConfig, + ) -> "DataDesignerConfigBuilder": + """Add a model configuration to the builder. + + Args: + model_config: Model configuration to add. + + Returns: + Self for method chaining. + """ + ... + + def add_processor( + self, + processor: ProcessorConfig, + ) -> "DataDesignerConfigBuilder": + """Add a processor to the configuration. + + Args: + processor: Processor configuration to add. + + Returns: + Self for method chaining. + """ + ... + + def with_seed_dataset( + self, + seed_source: SeedSource, + ) -> "DataDesignerConfigBuilder": + """Configure a seed dataset for the generation. + + Args: + seed_source: Seed dataset source configuration. + + Returns: + Self for method chaining. + """ + ... + + def build(self) -> DataDesignerConfig: + """Build the final configuration object. + + Returns: + Complete DataDesignerConfig object. + """ + ... + + def write_config( + self, + path: str | Path, + format: Literal["yaml", "json"] = "yaml", + ) -> None: + """Write the configuration to a file. + + Args: + path: Output file path. + format: Output format (yaml or json). + """ + ... + + @property + def info(self) -> ConfigBuilderInfo: + """Access to configuration information display utilities.""" + ... +``` + +## ConfigBuilderInfo + +```python +class ConfigBuilderInfo: + """Utility for displaying configuration information.""" + + def display( + self, + info_type: str | InfoType, + ) -> None: + """Display information about the configuration. + + Args: + info_type: Type of information to display. + Options: "samplers", "model_configs", "model_providers", etc. + """ + ... +``` + +## Seed Sources + +### LocalFileSeedSource + +```python +class LocalFileSeedSource(BaseModel): + """Seed dataset from a local file.""" + + path: str | Path # Path to CSV, Parquet, or JSON file +``` + +### DataFrameSeedSource + +```python +class DataFrameSeedSource(BaseModel): + """Seed dataset from a pandas DataFrame.""" + + df: pd.DataFrame # DataFrame to use as seed data +``` diff --git a/fern/v0.3.3/pages/api-reference/data-designer-config.mdx b/fern/v0.3.3/pages/api-reference/data-designer-config.mdx new file mode 100644 index 00000000..94d7c01d --- /dev/null +++ b/fern/v0.3.3/pages/api-reference/data-designer-config.mdx @@ -0,0 +1,106 @@ +--- +title: Data Designer Configuration +description: API reference for the DataDesignerConfig object. +--- + +`DataDesignerConfig` is the main configuration object for building datasets with Data Designer. It is a declarative configuration for defining the dataset you want to generate column-by-column, including options for dataset post-processing, validation, and profiling. + +Generally, you should use the [DataDesignerConfigBuilder](/api/config-builder) to build your configuration, but you can also build it manually by instantiating the `DataDesignerConfig` class directly. + +## DataDesignerConfig + +```python +class DataDesignerConfig(BaseModel): + """Complete configuration for a Data Designer generation job.""" + + columns: list[SingleColumnConfig] # List of column configurations + processors: list[ProcessorConfig] = [] # Post-generation processors + seed_source: SeedSource | None = None # Optional seed dataset + + @property + def column_names(self) -> list[str]: + """Names of all configured columns.""" + ... + + @property + def dependency_graph(self) -> dict[str, list[str]]: + """Column dependency graph for execution ordering.""" + ... 
+ + def get_column(self, name: str) -> SingleColumnConfig: + """Get a column configuration by name. + + Args: + name: Column name. + + Returns: + Column configuration. + + Raises: + KeyError: If column not found. + """ + ... + + def to_yaml(self) -> str: + """Serialize configuration to YAML string.""" + ... + + def to_json(self) -> str: + """Serialize configuration to JSON string.""" + ... + + @classmethod + def from_yaml(cls, yaml_str: str) -> "DataDesignerConfig": + """Deserialize configuration from YAML string.""" + ... + + @classmethod + def from_json(cls, json_str: str) -> "DataDesignerConfig": + """Deserialize configuration from JSON string.""" + ... + + @classmethod + def from_file(cls, path: str | Path) -> "DataDesignerConfig": + """Load configuration from a file. + + Args: + path: Path to YAML or JSON file. + + Returns: + Loaded configuration. + """ + ... +``` + +## Configuration Serialization + +Data Designer configs can be serialized to and from YAML or JSON format, making it easy to: + +- Save configurations for reproducibility +- Share configurations with team members +- Version control your data generation pipelines +- Load and modify existing configurations + +### Example: Saving and Loading Configs + +```python +from data_designer.essentials import DataDesignerConfigBuilder + +# Build a configuration +builder = DataDesignerConfigBuilder() +builder.add_column(name="id", column_type="sampler", sampler_type="uuid") +builder.add_column( + name="greeting", + column_type="llm-text", + model_alias="nvidia-text", + prompt="Write a greeting." +) + +# Save to file +builder.write_config("my_config.yaml") + +# Load from file later +from data_designer.config.data_designer_config import DataDesignerConfig + +config = DataDesignerConfig.from_file("my_config.yaml") +``` diff --git a/fern/v0.3.3/pages/api-reference/models.mdx b/fern/v0.3.3/pages/api-reference/models.mdx new file mode 100644 index 00000000..831f780b --- /dev/null +++ b/fern/v0.3.3/pages/api-reference/models.mdx @@ -0,0 +1,104 @@ +--- +title: Models +description: API reference for model configuration objects. +--- + +The `models` module defines configuration objects for model-based generation. `ModelProvider` specifies connection and authentication details for custom providers. `ModelConfig` encapsulates model details including the model alias, identifier, and inference parameters. [Inference Parameters](/docs/concepts/models/inference-parameters) controls model behavior through settings like `temperature`, `top_p`, and `max_tokens`, with support for both fixed values and distribution-based sampling. The module includes `ImageContext` for providing image inputs to multimodal models. 
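+
+For orientation, a provider/model pair might be declared as in the sketch below. The endpoint, alias, and model name are placeholders, and the import path assumes these classes are exported from `data_designer.essentials`.
+
+```python
+from data_designer.essentials import (
+    ChatCompletionInferenceParams,
+    ModelConfig,
+    ModelProvider,
+)
+
+# Hypothetical OpenAI-compatible endpoint; api_key may also name an environment variable.
+provider = ModelProvider(
+    name="my-provider",
+    endpoint="https://api.example.com/v1",
+    api_key="MY_PROVIDER_API_KEY",
+)
+
+# Model configuration referencing the provider by name, with fixed inference parameters.
+text_model = ModelConfig(
+    alias="my-text-model",
+    model="example-org/example-model",
+    provider="my-provider",
+    inference_parameters=ChatCompletionInferenceParams(
+        temperature=0.8,
+        top_p=0.95,
+        max_tokens=1024,
+    ),
+)
+```
+
+Instead of fixed values, `temperature` and `top_p` can also be given as distributions (see the distribution types below).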
+ +For more information on how they are used, see below: + +- **[Model Providers](/docs/concepts/models/model-providers)** +- **[Model Configs](/docs/concepts/models/model-configs)** +- **[Images as Context](/docs/tutorials/images-as-context)** + +## ModelProvider + +```python +class ModelProvider(BaseModel): + """Configuration for a model provider endpoint.""" + + name: str # Unique identifier for the provider + endpoint: str # API endpoint URL + provider_type: str = "openai" # Provider type (default: OpenAI-compatible) + api_key: str | None = None # API key or environment variable name + extra_body: dict[str, Any] | None = None # Additional request body parameters + extra_headers: dict[str, str] | None = None # Additional headers +``` + +## ModelConfig + +```python +class ModelConfig(BaseModel): + """Configuration for a specific model.""" + + alias: str # Unique identifier for this model configuration + model: str # Model identifier as recognized by the provider + provider: str | None = None # Reference to provider by name + inference_parameters: InferenceParamsT | None = None # Inference parameters +``` + +## ChatCompletionInferenceParams + +```python +class ChatCompletionInferenceParams(BaseModel): + """Parameters for chat completion inference.""" + + temperature: float | Distribution | None = None # Sampling temperature (0.0-2.0) + top_p: float | Distribution | None = None # Nucleus sampling parameter (0.0-1.0) + max_tokens: int | None = None # Maximum output tokens + max_parallel_requests: int = 4 # Maximum concurrent API requests + timeout: int | None = None # Request timeout in seconds + extra_body: dict[str, Any] | None = None # Additional request body parameters +``` + +## EmbeddingInferenceParams + +```python +class EmbeddingInferenceParams(BaseModel): + """Parameters for embedding inference.""" + + encoding_format: Literal["float", "base64"] = "float" # Embedding encoding format + dimensions: int | None = None # Number of embedding dimensions + max_parallel_requests: int = 4 # Maximum concurrent API requests + timeout: int | None = None # Request timeout in seconds + extra_body: dict[str, Any] | None = None # Additional request body parameters +``` + +## ImageContext + +```python +class ImageContext(BaseModel): + """Configuration for providing image context to vision models.""" + + column_name: str # Name of column containing image data + data_type: ModalityDataType # Type of image data (BASE64, URL, etc.) + image_format: ImageFormat | None = None # Image format (PNG, JPEG, etc.) +``` + +## Distribution Types + +### UniformDistribution + +```python +class UniformDistribution(BaseModel): + """Uniform distribution for parameter sampling.""" + + params: UniformDistributionParams + +class UniformDistributionParams(BaseModel): + low: float # Lower bound + high: float # Upper bound +``` + +### ManualDistribution + +```python +class ManualDistribution(BaseModel): + """Manual distribution with discrete values.""" + + params: ManualDistributionParams + +class ManualDistributionParams(BaseModel): + values: list[float] # Discrete values to sample from + weights: list[float] | None = None # Optional probability weights +``` diff --git a/fern/v0.3.3/pages/api-reference/processors.mdx b/fern/v0.3.3/pages/api-reference/processors.mdx new file mode 100644 index 00000000..e1ce9ca4 --- /dev/null +++ b/fern/v0.3.3/pages/api-reference/processors.mdx @@ -0,0 +1,110 @@ +--- +title: Processors +description: API reference for processor configuration objects. 
+--- + +The `processors` module defines configuration objects for post-generation data transformations. Processors run after column generation and can modify the dataset schema or content before output. + +## DropColumnsProcessorConfig + +```python +class DropColumnsProcessorConfig(BaseModel): + """Configuration for dropping columns from output.""" + + name: str # Processor identifier + column_names: list[str] # Columns to remove from output + build_stage: BuildStage = BuildStage.POST_BATCH # When to run +``` + +### Behavior + +- Columns in `column_names` are removed from the main output +- Dropped column values are saved to a separate file in `dropped-columns/` +- Missing columns produce a warning but don't fail the build +- Column configs are automatically marked with `drop=True` + +### Example Usage + +```python +from data_designer.essentials import ( + DataDesignerConfigBuilder, + DropColumnsProcessorConfig, +) + +builder = DataDesignerConfigBuilder() +# ... add columns ... + +builder.add_processor( + DropColumnsProcessorConfig( + name="remove_intermediate", + column_names=["temp_calculation", "raw_context", "debug_info"], + ) +) +``` + +## SchemaTransformProcessorConfig + +```python +class SchemaTransformProcessorConfig(BaseModel): + """Configuration for transforming output schema.""" + + name: str # Processor identifier + template: dict[str, Any] # Jinja2 template for output schema + build_stage: BuildStage = BuildStage.POST_BATCH # When to run +``` + +### Behavior + +- Each key in `template` becomes a column in the transformed output +- Values are Jinja2 templates with access to all columns +- Complex structures (lists, nested dicts) are supported +- Output saved to `processors-outputs/{name}/` +- Original dataset passes through unchanged + +### Template Capabilities + +- **Variable substitution**: `{{ column_name }}` +- **Filters**: `{{ text | upper }}`, `{{ text | lower }}`, `{{ text | trim }}` +- **Nested structures**: Arbitrarily deep JSON structures +- **Lists**: `["{{ col1 }}", "{{ col2 }}"]` +- **Conditionals**: `{% if condition %}...{% endif %}` + +### Example Usage + +```python +from data_designer.essentials import ( + DataDesignerConfigBuilder, + SchemaTransformProcessorConfig, +) + +builder = DataDesignerConfigBuilder() +# ... add columns with 'question' and 'answer' ... + +# Transform to chat message format +builder.add_processor( + SchemaTransformProcessorConfig( + name="chat_format", + template={ + "messages": [ + {"role": "user", "content": "{{ question }}"}, + {"role": "assistant", "content": "{{ answer }}"}, + ], + "metadata": { + "category": "{{ category | upper }}", + "generated": True, + }, + }, + ) +) +``` + +## BuildStage Enum + +```python +class BuildStage(str, Enum): + """When processors run in the generation pipeline.""" + + POST_BATCH = "post_batch" # After each batch is generated +``` + +Currently, all processors run at the `POST_BATCH` stage. Additional stages may be added in future versions. diff --git a/fern/v0.3.3/pages/api-reference/sampler-params.mdx b/fern/v0.3.3/pages/api-reference/sampler-params.mdx new file mode 100644 index 00000000..7858fd81 --- /dev/null +++ b/fern/v0.3.3/pages/api-reference/sampler-params.mdx @@ -0,0 +1,150 @@ +--- +title: Sampler Parameters +description: API reference for sampler parameter configuration objects. +--- + +The `sampler_params` module defines parameter configuration objects for all Data Designer sampler types. 
Sampler parameters are used within the `SamplerColumnConfig` to specify how values should be generated for sampled columns. + + +The config builder has an `info` attribute that can be used to display the available sampler types and their parameters: + +```python +config_builder.info.display("samplers") +``` + + +## CategorySamplerParams + +```python +class CategorySamplerParams(BaseModel): + """Parameters for categorical sampling.""" + + values: list[Any] # List of categorical values to sample from + weights: list[float] | None = None # Optional probability weights (normalized) +``` + +## SubcategorySamplerParams + +```python +class SubcategorySamplerParams(BaseModel): + """Parameters for hierarchical categorical sampling.""" + + category: str # Name of parent category column + values: dict[str, list[Any]] # Mapping of parent values to subcategory values +``` + +## UniformSamplerParams + +```python +class UniformSamplerParams(BaseModel): + """Parameters for uniform distribution sampling.""" + + low: float # Lower bound (inclusive) + high: float # Upper bound (exclusive for floats, inclusive for ints) +``` + +## GaussianSamplerParams + +```python +class GaussianSamplerParams(BaseModel): + """Parameters for Gaussian (normal) distribution sampling.""" + + mean: float = 0.0 # Distribution mean + std: float = 1.0 # Standard deviation +``` + +## BernoulliSamplerParams + +```python +class BernoulliSamplerParams(BaseModel): + """Parameters for Bernoulli (binary) sampling.""" + + p: float # Probability of success (1) +``` + +## BinomialSamplerParams + +```python +class BinomialSamplerParams(BaseModel): + """Parameters for binomial distribution sampling.""" + + n: int # Number of trials + p: float # Probability of success per trial +``` + +## PoissonSamplerParams + +```python +class PoissonSamplerParams(BaseModel): + """Parameters for Poisson distribution sampling.""" + + lam: float # Expected number of events (lambda) +``` + +## ScipySamplerParams + +```python +class ScipySamplerParams(BaseModel): + """Parameters for scipy.stats distribution sampling.""" + + distribution: str # Name of scipy.stats distribution + params: dict[str, Any] # Distribution-specific parameters +``` + +## UUIDSamplerParams + +```python +class UUIDSamplerParams(BaseModel): + """Parameters for UUID generation.""" + + prefix: str = "" # Optional prefix for the UUID + short_form: bool = False # Use shortened UUID format + uppercase: bool = False # Convert to uppercase +``` + +## DateTimeSamplerParams + +```python +class DateTimeSamplerParams(BaseModel): + """Parameters for datetime sampling.""" + + start: str | datetime # Start of date range + end: str | datetime # End of date range + format: str | None = None # Output format string (strftime) +``` + +## TimedeltaSamplerParams + +```python +class TimedeltaSamplerParams(BaseModel): + """Parameters for timedelta (duration) sampling.""" + + dt_min: int # Minimum delta in days + dt_max: int # Maximum delta in days + reference_column_name: str # Column containing reference datetime +``` + +## PersonSamplerParams + +```python +class PersonSamplerParams(BaseModel): + """Parameters for Nemotron-Personas person sampling.""" + + locale: str # Locale code (en_US, ja_JP, en_IN, hi_Deva_IN, hi_Latn_IN) + sex: str | None = None # Filter by "Male" or "Female" + city: str | list[str] | None = None # Filter by city + age_range: list[int] | None = None # [min_age, max_age] + with_synthetic_personas: bool = False # Include personality profiles + select_field_values: dict[str, list[str]] | None = 
None # Custom field filters +``` + +## PersonFromFakerSamplerParams + +```python +class PersonFromFakerSamplerParams(BaseModel): + """Parameters for Faker-based person sampling.""" + + locale: str = "en_US" # Faker locale + age_range: list[int] | None = None # [min_age, max_age] + sex: str | None = None # Filter by "Male" or "Female" +``` diff --git a/fern/v0.3.3/pages/api-reference/validator-params.mdx b/fern/v0.3.3/pages/api-reference/validator-params.mdx new file mode 100644 index 00000000..3308e224 --- /dev/null +++ b/fern/v0.3.3/pages/api-reference/validator-params.mdx @@ -0,0 +1,167 @@ +--- +title: Validator Parameters +description: API reference for validator parameter configuration objects. +--- + +When creating a `ValidationColumnConfig`, two parameters are used to define the validator: `validator_type` and `validator_params`. +The `validator_type` parameter can be set to either `code`, `local_callable` or `remote`. The `validator_params` accompanying each of these is described below. + +## CodeValidatorParams + +```python +class CodeValidatorParams(BaseModel): + """Parameters for code validation.""" + + code_lang: CodeLang # Programming language to validate +``` + +### Supported Languages + +For Python code validation (uses Ruff): +- `CodeLang.PYTHON` + +For SQL code validation (uses SQLFluff): +- `CodeLang.SQL_ANSI` +- `CodeLang.SQL_POSTGRES` +- `CodeLang.SQL_MYSQL` +- `CodeLang.SQL_SQLITE` +- `CodeLang.SQL_TSQL` +- `CodeLang.SQL_BIGQUERY` + +### Example Usage + +```python +from data_designer.essentials import ( + CodeLang, + CodeValidatorParams, + ValidationColumnConfig, + ValidatorType, +) + +# Python code validation +python_validator = ValidationColumnConfig( + name="python_validation", + validator_type=ValidatorType.CODE, + target_columns=["python_code"], + validator_params=CodeValidatorParams(code_lang=CodeLang.PYTHON), + batch_size=10, +) + +# SQL code validation +sql_validator = ValidationColumnConfig( + name="sql_validation", + validator_type=ValidatorType.CODE, + target_columns=["sql_query"], + validator_params=CodeValidatorParams(code_lang=CodeLang.SQL_POSTGRES), + batch_size=10, +) +``` + +## LocalCallableValidatorParams + +```python +class LocalCallableValidatorParams(BaseModel): + """Parameters for local callable validation.""" + + validation_function: Callable[[pd.DataFrame], pd.DataFrame] + # Function that takes DataFrame and returns DataFrame with is_valid column + + output_schema: dict | None = None + # Optional JSON schema to validate function output +``` + +### Function Requirements + +The validation function must: +1. Accept a `pd.DataFrame` containing the target columns +2. Return a `pd.DataFrame` with at minimum an `is_valid` column (boolean or null) +3. 
Any additional columns in the output become validation metadata + +### Example Usage + +```python +import pandas as pd +from data_designer.essentials import ( + LocalCallableValidatorParams, + ValidationColumnConfig, + ValidatorType, +) + +def validate_positive_prices(df: pd.DataFrame) -> pd.DataFrame: + """Validate that all prices are positive.""" + result = pd.DataFrame() + result["is_valid"] = df["price"] > 0 + result["error_message"] = result["is_valid"].apply( + lambda v: "" if v else "Price must be positive" + ) + return result + +validator = ValidationColumnConfig( + name="price_validation", + validator_type=ValidatorType.LOCAL_CALLABLE, + target_columns=["price"], + validator_params=LocalCallableValidatorParams( + validation_function=validate_positive_prices, + ), + batch_size=50, +) +``` + +## RemoteValidatorParams + +```python +class RemoteValidatorParams(BaseModel): + """Parameters for remote HTTP validation.""" + + endpoint_url: str # URL of the validation endpoint + timeout: float = 30.0 # Request timeout in seconds + max_retries: int = 3 # Number of retry attempts + retry_backoff: float = 2.0 # Exponential backoff factor + max_parallel_requests: int = 4 # Maximum concurrent requests + output_schema: dict | None = None # Optional response schema validation +``` + +### Request/Response Format + +**Request (POST):** +```json +{ + "data": [ + {"column1": "value1", "column2": "value2"}, + {"column1": "value3", "column2": "value4"} + ] +} +``` + +**Response:** +```json +{ + "data": [ + {"is_valid": true, "additional_field": "value"}, + {"is_valid": false, "additional_field": "value"} + ] +} +``` + +### Example Usage + +```python +from data_designer.essentials import ( + RemoteValidatorParams, + ValidationColumnConfig, + ValidatorType, +) + +validator = ValidationColumnConfig( + name="external_validation", + validator_type=ValidatorType.REMOTE, + target_columns=["content"], + validator_params=RemoteValidatorParams( + endpoint_url="https://api.example.com/validate", + timeout=60.0, + max_retries=3, + max_parallel_requests=4, + ), + batch_size=5, +) +``` diff --git a/fern/v0.3.3/pages/concepts/columns.mdx b/fern/v0.3.3/pages/concepts/columns.mdx new file mode 100644 index 00000000..ac13079a --- /dev/null +++ b/fern/v0.3.3/pages/concepts/columns.mdx @@ -0,0 +1,160 @@ +--- +title: Columns +description: The fundamental building blocks in Data Designer for defining dataset fields. +--- + +Columns are the fundamental building blocks in Data Designer. Each column represents a field in your dataset and defines how to generate it—whether that's sampling from a distribution, calling an LLM, or applying a transformation. + + +Columns are **declarative specifications**. You describe *what* you want, and the framework handles *how* to generate it—managing execution order, batching, parallelization, and resources automatically. + + +## Column Types + +Data Designer provides nine built-in column types, each optimized for different generation scenarios. + +### 🎲 Sampler Columns + +Sampler columns generate data using numerical sampling—fast, deterministic, and ideal for numerical and categorical dataset fields. They're significantly faster than LLMs and can produce data following specific distributions (Poisson for event counts, Gaussian for measurements, etc.). 
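+
+For example, a sampler column is declared like any other column configuration. The sketch below uses a uniform sampler with an illustrative column name; the same pattern applies to the other sampler types listed next.
+
+```python
+from data_designer.essentials import (
+    SamplerColumnConfig,
+    SamplerType,
+    UniformSamplerParams,
+)
+
+# Uniformly distributed order quantities, converted to integers in the output.
+order_quantity = SamplerColumnConfig(
+    name="order_quantity",
+    sampler_type=SamplerType.UNIFORM,
+    params=UniformSamplerParams(low=1, high=20),
+    convert_to="int",
+)
+```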
+ +Available sampler types: + +- **UUID**: Unique identifiers +- **Category**: Categorical values with optional probability weights +- **Subcategory**: Hierarchical categorical data (states within countries, models within brands) +- **Uniform**: Evenly distributed numbers (integers or floats) +- **Gaussian**: Normally distributed values with configurable mean and standard deviation +- **Bernoulli**: Binary outcomes with specified success probability +- **Bernoulli Mixture**: Binary outcomes from multiple probability components +- **Binomial**: Count of successes in repeated trials +- **Poisson**: Count data and event frequencies +- **Scipy**: Access to the full scipy.stats distribution library +- **Person**: Realistic synthetic individuals with names, demographics, and attributes +- **Datetime**: Timestamps within specified ranges +- **Timedelta**: Time duration values + + +Samplers support **conditional parameters** that change behavior based on other columns. Want age distributions that vary by country? Income ranges that depend on occupation? Just define conditions on existing column values. + + +### 📝 LLM-Text Columns + +LLM-Text columns generate natural language text: product descriptions, customer reviews, narrative summaries, email threads, or anything requiring semantic understanding and creativity. + +Use **Jinja2 templating** in prompts to reference other columns. Data Designer automatically manages dependencies and injects the referenced column values into the prompt. + + +Models that support extended thinking (chain-of-thought reasoning) can capture their reasoning process in a separate `{column_name}__reasoning_trace` column—useful for understanding *why* the model generated specific content. This column is automatically added to the dataset if the model and service provider parse and return reasoning content. + + +### 💻 LLM-Code Columns + +LLM-Code columns generate code in specific programming languages. They handle the prompting and parsing necessary to extract clean code from the LLM's response—automatically detecting and extracting code from markdown blocks. You provide the prompt and choose the model; the column handles the extraction. + +Supported languages: **Python, JavaScript, TypeScript, Java, Kotlin, Go, Rust, Ruby, Scala, Swift**, plus **SQL** dialects (SQLite, PostgreSQL, MySQL, T-SQL, BigQuery, ANSI SQL). + +### 🗂️ LLM-Structured Columns + +LLM-Structured columns generate JSON with a *guaranteed schema*. Define your structure using a Pydantic model or JSON schema, and Data Designer ensures the LLM output conforms—no parsing errors, no schema drift. + +Use for complex nested structures: API responses, configuration files, database records with multiple related fields, or any structured data where type safety matters. Schemas can be arbitrarily complex with nested objects, arrays, enums, and validation constraints, but success depends on the model's capabilities. + + +Flat schemas with simple fields are easier and more robustly produced across models. Deeply nested schemas with complex validation constraints are more sensitive to model choice—stronger models handle complexity better. If you're experiencing schema conformance issues, try simplifying the schema or switching to a more capable model. + + +### ⚖️ LLM-Judge Columns + +LLM-Judge columns score generated content across multiple quality dimensions using LLMs as evaluators. + +Define scoring rubrics (relevance, accuracy, fluency, helpfulness) and the judge model evaluates each record. 
Score rubrics specify criteria and scoring options (1-5 scales, categorical grades, etc.), producing quantified quality metrics for every data point. + +Use judge columns for data quality filtering (e.g., keep only 4+ rated responses), A/B testing generation strategies, and quality monitoring over time. + +### 🧬 Embedding Columns + +Embedding columns generate vector embeddings (numerical representations) for text content using embedding models. These embeddings capture semantic meaning, enabling similarity search, clustering, and semantic analysis. + +Specify a `target_column` containing text, and Data Designer generates embeddings for that content. The target column can contain either a single text string or a list of text strings in stringified JSON format. In the latter case, embeddings are generated for each text string in the list. + +Common use cases: + +- **Semantic search**: Generate embeddings for documents, then find similar content by vector similarity +- **Clustering**: Group similar texts based on embedding proximity +- **Recommendation systems**: Match content by semantic similarity +- **Anomaly detection**: Identify outliers in embedding space + + +Embedding columns require an embedding model configured with `EmbeddingInferenceParams`. These models differ from chat completion models—they output vectors rather than text. The generation type is automatically determined by the inference parameters type. + + +### 🧩 Expression Columns + +Expression columns handle simple transformations using **Jinja2 templates**—concatenate first and last names, calculate numerical totals, format date strings. No LLM overhead needed. + +Template capabilities: + +- **Variable substitution**: Pull values from any existing column +- **String filters**: Uppercase, lowercase, strip whitespace, replace patterns +- **Conditional logic**: if/elif/else support +- **Arithmetic**: Add, subtract, multiply, divide + +### 🔍 Validation Columns + +Validation columns check generated content against rules and return structured pass/fail results. + +Built-in validation types: + +**Code validation** runs Python or SQL code through a linter to validate the code. + +**Local callable validation** accepts a Python function directly when using Data Designer as a library. + +**Remote validation** sends data to HTTP endpoints for validation-as-a-service. Useful for linters, security scanners, or proprietary systems. + +### 🌱 Seed Dataset Columns + +Seed dataset columns bootstrap generation from existing data. Provide a real dataset, and those columns become available as context for generating new synthetic data. + +Typical pattern: use seed data for one part of your schema (real product names and categories), then generate synthetic fields around it (customer reviews, purchase histories, ratings). The seed data provides realism and constraints; generated columns add volume and variation. + +## Shared Column Properties + +Every column configuration inherits from `SingleColumnConfig` with these standard properties: + +### `name` + +The column's identifier—unique within your configuration, used in Jinja2 references, and becomes the column name in the output DataFrame. Choose descriptive names: `user_review` > `col_17`. + +### `drop` + +Boolean flag (default: `False`) controlling whether the column appears in final output. Setting `drop=True` generates the column (available as a dependency) but excludes it from final output. 
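+
+As a minimal sketch, an intermediate context column can be generated for use in downstream prompts but excluded from the output (this assumes a `persona` column exists and the default `nvidia-text` model alias is available):
+
+```python
+from data_designer.essentials import LLMTextColumnConfig
+
+config_builder.add_column(
+    LLMTextColumnConfig(
+        name="style_notes",
+        model_alias="nvidia-text",
+        prompt="Describe the writing style of {{ persona }} in two sentences.",
+        drop=True,  # generated and available as a dependency, but filtered from the final output
+    )
+)
+```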
+
+**When to drop columns:**
+
+- Intermediate calculations that feed expressions but aren't meaningful standalone
+- Context columns used only for LLM prompt templates
+- Validation results used during development but unwanted in production
+
+Dropped columns participate fully in generation and the dependency graph—just filtered out at the end.
+
+### `column_type`
+
+Literal string identifying the column type: `"sampler"`, `"llm-text"`, `"expression"`, etc. Set automatically by each configuration class and serves as Pydantic's discriminator for deserialization.
+
+You rarely set this manually—instantiating `LLMTextColumnConfig` automatically sets `column_type="llm-text"`. Serialization is reversible: save to YAML, load later, and Pydantic reconstructs the exact objects.
+
+### `required_columns`
+
+Computed property listing columns that must be generated before this one. The framework derives this automatically:
+
+- For LLM/Expression columns: extracted from Jinja2 template `{{ variables }}`
+- For Validation columns: explicitly listed target columns
+- For Sampler columns with conditional parameters: columns referenced in conditions
+
+You read this property for introspection but never set it—always computed from configuration details.
+
+### `side_effect_columns`
+
+Computed property listing columns created implicitly alongside the primary column. Currently, only LLM columns produce side effects (reasoning trace columns like `{name}__reasoning_trace` when models use extended thinking).
+
+For detailed information on each column type, refer to the [column configuration API reference](/api/column-configs).
diff --git a/fern/v0.3.3/pages/concepts/models/configure-with-cli.mdx b/fern/v0.3.3/pages/concepts/models/configure-with-cli.mdx
new file mode 100644
index 00000000..90b58bf1
--- /dev/null
+++ b/fern/v0.3.3/pages/concepts/models/configure-with-cli.mdx
@@ -0,0 +1,148 @@
+---
+title: Configuring Model Settings Using The CLI
+description: Use the Data Designer CLI to manage model providers and configurations.
+---
+
+The Data Designer CLI provides an interactive interface for creating and managing default model providers and model configurations stored in your Data Designer home directory (default: `~/.data-designer/`).
+
+## Configuration Files
+
+The CLI manages two YAML configuration files:
+
+- **`model_providers.yaml`**: Model provider configurations
+- **`model_configs.yaml`**: Model configurations
+
+
+If these configuration files don't already exist, the Data Designer library automatically creates them with default settings at runtime when first initialized.
+ + + +You can customize the configuration directory location with the `DATA_DESIGNER_HOME` environment variable: + +```bash +export DATA_DESIGNER_HOME="/path/to/your/custom/directory" +``` + + +## CLI Commands + +The Data Designer CLI provides four main configuration commands: + +```bash +# Configure model providers +data-designer config providers + +# Configure models +data-designer config models + +# List current configurations +data-designer config list + +# Reset all configurations +data-designer config reset +``` + + +See available commands + +```bash +data-designer --help +``` + +See available sub-commands + +```bash +data-designer config --help +``` + + +## Managing Model Providers + +Run the interactive provider configuration command: + +```bash +data-designer config providers +``` + +### Available Operations + +**Add a new provider**: Define a new provider by entering its name, endpoint URL, provider type, and optionally an API key (as plain text or as an environment variable name). + +**Update an existing provider**: Modify an existing provider's settings. All fields are pre-filled with current values. + +**Delete a provider**: Remove a provider and its associated models. + +**Delete all providers**: Remove all providers and their associated models. + +**Change default provider**: Set which provider is used by default. This option is only available when multiple providers are configured. + +## Managing Model Configurations + +Run the interactive model configuration command: + +```bash +data-designer config models +``` + + +You need at least one provider configured before adding models. Run `data-designer config providers` first if none exist. + + +### Available Operations + +**Add a new model configuration** + +Create a new model configuration with the following fields: + +- **Alias**: A unique name for referencing this model in a column configuration. +- **Model ID**: The model identifier (e.g., `nvidia/nemotron-3-nano-30b-a3b`) +- **Provider**: Select from available providers (if multiple exist) +- **Temperature**: Sampling temperature (0.0 to 2.0) +- **Top P**: Nucleus sampling parameter (0.0 to 1.0) +- **Max Tokens**: Maximum output length (1 to 100000) + + +To configure additional inference parameter settings or use distribution-based inference parameters, edit the `model_configs.yaml` file directly. + + +**Update an existing model configuration**: Modify an existing model's configuration. All fields are pre-filled with current values. + +**Delete a model configuration**: Remove a single model configuration. + +**Delete all model configurations**: Remove all model configurations. The CLI will ask for confirmation before proceeding. + +## Listing Configurations + +View all current configurations: + +```bash +data-designer config list +``` + +This command displays: + +- **Model Providers**: All configured providers with their endpoints (API keys are masked) +- **Default Provider**: The currently selected default provider +- **Model Configurations**: All configured models with their settings + +## Resetting Configurations + +Delete all configuration files: + +```bash +data-designer config reset +``` + +The CLI will show which configuration files exist and ask for confirmation before deleting them. + + +This command permanently deletes all configuration files and resets to the default model providers and configurations. You'll need to reconfigure your custom configurations from scratch. 
+ + +## See Also + +- **[Default Model Settings](/docs/concepts/models/default-model-settings)**: Pre-configured providers and model settings included with Data Designer +- **[Custom Model Settings](/docs/concepts/models/custom-model-settings)**: Learn how to create custom providers and model configurations +- **[Model Providers](/docs/concepts/models/model-providers)**: Learn about the `ModelProvider` class and provider configuration +- **[Model Configurations](/docs/concepts/models/model-configs)**: Learn about `ModelConfig` +- **[Quick Start Guide](/docs/quick-start)**: Get started with a simple example diff --git a/fern/v0.3.3/pages/concepts/models/custom-model-settings.mdx b/fern/v0.3.3/pages/concepts/models/custom-model-settings.mdx new file mode 100644 index 00000000..bf713956 --- /dev/null +++ b/fern/v0.3.3/pages/concepts/models/custom-model-settings.mdx @@ -0,0 +1,235 @@ +--- +title: Custom Model Settings +description: Create custom providers and model configurations for Data Designer. +--- + +While Data Designer ships with pre-configured model providers and configurations, you can create custom configurations to use different models, adjust inference parameters, or connect to custom API endpoints. + +## When to Use Custom Settings + +Use custom model settings when you need to: + +- Use models not included in the defaults +- Adjust inference parameters (temperature, top_p, max_tokens) for specific use cases +- Add distribution-based inference parameters for variability +- Connect to self-hosted or custom model endpoints +- Create multiple variants of the same model with different settings + +## Creating and Using Custom Settings + +### Custom Models with Default Providers + +Create custom model configurations that use the default providers (no need to define providers yourself): + +```python +from data_designer.essentials import ( + CategorySamplerParams, + ChatCompletionInferenceParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMTextColumnConfig, + ModelConfig, + SamplerColumnConfig, + SamplerType, +) + +# Create custom models using default providers +custom_models = [ + # High-temperature for more variability + ModelConfig( + alias="creative-writer", + model="nvidia/nemotron-3-nano-30b-a3b", + provider="nvidia", # Uses default NVIDIA provider + inference_parameters=ChatCompletionInferenceParams( + temperature=1.2, + top_p=0.98, + max_tokens=4096, + ), + ), + # Low-temperature for less variability + ModelConfig( + alias="fact-checker", + model="nvidia/nemotron-3-nano-30b-a3b", + provider="nvidia", # Uses default NVIDIA provider + inference_parameters=ChatCompletionInferenceParams( + temperature=0.1, + top_p=0.9, + max_tokens=2048, + ), + ), +] + +# Create DataDesigner (uses default providers) +data_designer = DataDesigner() + +# Pass custom models to config builder +config_builder = DataDesignerConfigBuilder(model_configs=custom_models) + +# Add a topic column using a categorical sampler +config_builder.add_column( + SamplerColumnConfig( + name="topic", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=["Artificial Intelligence", "Space Exploration", "Ancient History", "Climate Science"], + ), + ) +) + +# Use your custom models +config_builder.add_column( + LLMTextColumnConfig( + name="creative_story", + model_alias="creative-writer", + prompt="Write a creative short story about {{topic}}.", + ) +) + +config_builder.add_column( + LLMTextColumnConfig( + name="facts", + model_alias="fact-checker", + prompt="List 3 facts about {{topic}}.", + ) +) + 
+# Preview your dataset +preview_result = data_designer.preview(config_builder=config_builder) +preview_result.display_sample_record() +``` + + +When you only specify `model_configs`, the default model providers (NVIDIA, OpenAI, and OpenRouter) are still available. You only need to create custom providers if you want to connect to different endpoints or modify provider settings. + + + +When you provide custom `model_configs` to `DataDesignerConfigBuilder`, they **replace** the defaults entirely. To use custom model configs in addition to the default configs, use the add_model_config method: + +```python +# Load defaults first +config_builder = DataDesignerConfigBuilder() + +# Add custom model to defaults +config_builder.add_model_config( + ModelConfig( + alias="my-custom-model", + model="nvidia/llama-3.3-nemotron-super-49b-v1.5", + provider="nvidia", # Uses default provider + inference_parameters=ChatCompletionInferenceParams( + temperature=0.6, + max_tokens=8192, + ), + ) +) + +# Now you can use both default and custom models +# Default: nvidia-text, nvidia-reasoning, nvidia-vision, etc. +# Custom: my-custom-model +``` + + +### Custom Providers with Custom Models + +Define both custom providers and custom model configurations when you need to connect to services not included in the defaults: + + +The custom provider endpoints must be reachable from where Data Designer runs. Ensure network connectivity, firewall rules, and any VPN requirements are properly configured. + + +```python +from data_designer.essentials import ( + CategorySamplerParams, + ChatCompletionInferenceParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMTextColumnConfig, + ModelConfig, + ModelProvider, + SamplerColumnConfig, + SamplerType, +) + +# Step 1: Define custom providers +custom_providers = [ + ModelProvider( + name="my-custom-provider", + endpoint="https://api.my-llm-service.com/v1", + provider_type="openai", # OpenAI-compatible API + api_key="MY_SERVICE_API_KEY", # Environment variable name + ), + ModelProvider( + name="my-self-hosted-provider", + endpoint="https://my-org.internal.com/llm/v1", + provider_type="openai", + api_key="SELF_HOSTED_API_KEY", + ), +] + +# Step 2: Define custom models +custom_models = [ + ModelConfig( + alias="my-text-model", + model="openai/some-model-id", + provider="my-custom-provider", # References provider by name + inference_parameters=ChatCompletionInferenceParams( + temperature=0.85, + top_p=0.95, + max_tokens=2048, + ), + ), + ModelConfig( + alias="my-self-hosted-text-model", + model="openai/some-hosted-model-id", + provider="my-self-hosted-provider", + inference_parameters=ChatCompletionInferenceParams( + temperature=0.7, + top_p=0.9, + max_tokens=1024, + ), + ), +] + +# Step 3: Create DataDesigner with custom providers +data_designer = DataDesigner(model_providers=custom_providers) + +# Step 4: Create config builder with custom models +config_builder = DataDesignerConfigBuilder(model_configs=custom_models) + +# Step 5: Add a topic column using a categorical sampler +config_builder.add_column( + SamplerColumnConfig( + name="topic", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=["Technology", "Healthcare", "Finance", "Education"], + ), + ) +) + +# Step 6: Use your custom model by referencing its alias +config_builder.add_column( + LLMTextColumnConfig( + name="short_news_article", + model_alias="my-text-model", # Reference custom alias + prompt="Write a short news article about the '{{topic}}' topic in 10 sentences.", + ) +) + 
+config_builder.add_column( + LLMTextColumnConfig( + name="long_news_article", + model_alias="my-self-hosted-text-model", # Reference custom alias + prompt="Write a detailed news article about the '{{topic}}' topic.", + ) +) + +# Step 7: Preview your dataset +preview_result = data_designer.preview(config_builder=config_builder) +preview_result.display_sample_record() +``` + +## See Also + +- **[Default Model Settings](/docs/concepts/models/default-model-settings)**: Pre-configured providers and model settings +- **[Configure Model Settings With the CLI](/docs/concepts/models/configure-with-cli)**: CLI-based configuration +- **[Quick Start Guide](/docs/quick-start)**: Basic usage example diff --git a/fern/v0.3.3/pages/concepts/models/default-model-settings.mdx b/fern/v0.3.3/pages/concepts/models/default-model-settings.mdx new file mode 100644 index 00000000..cedcc521 --- /dev/null +++ b/fern/v0.3.3/pages/concepts/models/default-model-settings.mdx @@ -0,0 +1,130 @@ +--- +title: Default Model Settings +description: Pre-configured model providers and configurations included with Data Designer. +--- + +Data Designer ships with pre-configured model providers and model configurations that make it easy to start generating synthetic data without manual setup. + +## Model Providers + +Data Designer includes a few default model providers that are configured automatically: + +### NVIDIA Provider (`nvidia`) + +- **Endpoint**: `https://integrate.api.nvidia.com/v1` +- **API Key**: Set via `NVIDIA_API_KEY` environment variable +- **Models**: Access to NVIDIA's hosted models from [build.nvidia.com](https://build.nvidia.com) +- **Getting Started**: Sign up and get your API key at [build.nvidia.com](https://build.nvidia.com) + +The NVIDIA provider gives you access to state-of-the-art models including Nemotron and other NVIDIA-optimized models. + +### OpenAI Provider (`openai`) + +- **Endpoint**: `https://api.openai.com/v1` +- **API Key**: Set via `OPENAI_API_KEY` environment variable +- **Models**: Access to OpenAI's model catalog +- **Getting Started**: Get your API key from [platform.openai.com/api-keys](https://platform.openai.com/api-keys) + +The OpenAI provider gives you access to GPT models and other OpenAI offerings. + +### OpenRouter Provider (`openrouter`) + +- **Endpoint**: `https://openrouter.ai/api/v1` +- **API Key**: Set via `OPENROUTER_API_KEY` environment variable +- **Models**: Access to a wide variety of models through OpenRouter's unified API +- **Getting Started**: Get your API key from [openrouter.ai](https://openrouter.ai) + +The OpenRouter provider gives you access to a unified interface for many different language models from various providers. + +## Model Configurations + +Data Designer provides pre-configured model aliases for common use cases. When you create a `DataDesignerConfigBuilder` without specifying `model_configs`, these default configurations are automatically available. 
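+
+For example, with `NVIDIA_API_KEY` set, you can reference a default alias such as `nvidia-text` (listed below) directly in a column configuration:
+
+```python
+from data_designer.essentials import DataDesignerConfigBuilder, LLMTextColumnConfig
+
+# No model_configs argument: the default model configurations are loaded
+config_builder = DataDesignerConfigBuilder()
+
+config_builder.add_column(
+    LLMTextColumnConfig(
+        name="product_description",
+        model_alias="nvidia-text",
+        prompt="Write a two-sentence description of a fictional gadget.",
+    )
+)
+```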
+ +### NVIDIA Models + +The following model configurations are automatically available when `NVIDIA_API_KEY` is set: + +| Alias | Model | Use Case | Inference Parameters | +|-------|-------|----------|---------------------| +| `nvidia-text` | `nvidia/nemotron-3-nano-30b-a3b` | General text generation | `temperature=1.0, top_p=1.0` | +| `nvidia-reasoning` | `openai/gpt-oss-20b` | Reasoning and analysis tasks | `temperature=0.35, top_p=0.95` | +| `nvidia-vision` | `nvidia/nemotron-nano-12b-v2-vl` | Vision and image understanding | `temperature=0.85, top_p=0.95` | +| `nvidia-embedding` | `nvidia/llama-3.2-nv-embedqa-1b-v2` | Text embeddings | `encoding_format="float", extra_body={"input_type": "query"}` | + + +### OpenAI Models + +The following model configurations are automatically available when `OPENAI_API_KEY` is set: + +| Alias | Model | Use Case | Inference Parameters | +|-------|-------|----------|---------------------| +| `openai-text` | `gpt-4.1` | General text generation | `temperature=0.85, top_p=0.95` | +| `openai-reasoning` | `gpt-5` | Reasoning and analysis tasks | `temperature=0.35, top_p=0.95` | +| `openai-vision` | `gpt-5` | Vision and image understanding | `temperature=0.85, top_p=0.95` | +| `openai-embedding` | `text-embedding-3-large` | Text embeddings | `encoding_format="float"` | + +### OpenRouter Models + +The following model configurations are automatically available when `OPENROUTER_API_KEY` is set: + +| Alias | Model | Use Case | Inference Parameters | +|-------|-------|----------|---------------------| +| `openrouter-text` | `nvidia/nemotron-3-nano-30b-a3b` | General text generation | `temperature=1.0, top_p=1.0` | +| `openrouter-reasoning` | `openai/gpt-oss-20b` | Reasoning and analysis tasks | `temperature=0.35, top_p=0.95` | +| `openrouter-vision` | `nvidia/nemotron-nano-12b-v2-vl` | Vision and image understanding | `temperature=0.85, top_p=0.95` | +| `openrouter-embedding` | `openai/text-embedding-3-large` | Text embeddings | `encoding_format="float"` | + + +## Using Default Settings + +Default settings work out of the box - no configuration needed! Simply create `DataDesigner` and `DataDesignerConfigBuilder` instances without any arguments, and reference the default model aliases in your column configurations. + +For a complete example showing how to use default model settings, see the **[Quick Start Guide](/docs/quick-start)**. + +### How Default Model Providers and Configurations Work + +When the Data Designer library or the CLI is initialized, default model configurations and providers are stored in the Data Designer home directory for easy access and customization if they do not already exist. These configuration files serve as the single source of truth for model settings. By default they are saved to the following paths: + +- **Model Configs**: `~/.data-designer/model_configs.yaml` +- **Model Providers**: `~/.data-designer/model_providers.yaml` + + +While these files provide a convenient way to specify settings for your model providers and configuration you use most often, they can always be set programmatically in your SDG workflow. + + +You can customize the home directory location by setting the `DATA_DESIGNER_HOME` environment variable: + +```bash +# In your .bashrc, .zshrc, or similar +export DATA_DESIGNER_HOME="/path/to/your/custom/directory" +``` + +These configuration files can be modified in two ways: + +1. **Using the CLI**: Run CLI commands to add, update, or delete model configurations and providers +2. 
**Manual editing**: Directly edit the YAML files with your preferred text editor + +Both methods operate on the same files, ensuring consistency across your entire Data Designer setup. + +## Important Notes + + +While default model configurations are always available, you need to set the appropriate API key environment variable (`NVIDIA_API_KEY`, `OPENAI_API_KEY`, or `OPENROUTER_API_KEY`) to actually use the corresponding models for data generation. Without a valid API key, any attempt to generate data using that provider's models will fail. + + + +Store your API keys in environment variables rather than hardcoding them in your scripts: + +```bash +# In your .bashrc, .zshrc, or similar +export NVIDIA_API_KEY="your-api-key-here" +export OPENAI_API_KEY="your-openai-api-key-here" +export OPENROUTER_API_KEY="your-openrouter-api-key-here" +``` + + +## See Also + +- **[Custom Model Settings](/docs/concepts/models/custom-model-settings)**: Learn how to create custom providers and model configurations +- **[Configure Model Settings With the CLI](/docs/concepts/models/configure-with-cli)**: Learn how to use the CLI to manage model settings +- **[Model Configurations](/docs/concepts/models/model-configs)**: Learn about model configurations diff --git a/fern/v0.3.3/pages/concepts/models/inference-parameters.mdx b/fern/v0.3.3/pages/concepts/models/inference-parameters.mdx new file mode 100644 index 00000000..49046245 --- /dev/null +++ b/fern/v0.3.3/pages/concepts/models/inference-parameters.mdx @@ -0,0 +1,151 @@ +--- +title: Inference Parameters +description: Control model behavior during synthetic data generation. +--- + +Inference parameters control how models generate responses during synthetic data generation. Data Designer provides two types of inference parameters: `ChatCompletionInferenceParams` for text/code/structured generation and `EmbeddingInferenceParams` for embedding generation. + +## Overview + +When you create a `ModelConfig`, you can specify inference parameters to adjust model behavior. These parameters control aspects like randomness (temperature), diversity (top_p), context size (max_tokens), and more. Data Designer supports both static values and dynamic distribution-based sampling for certain parameters. + +## Chat Completion Inference Parameters + +The `ChatCompletionInferenceParams` class controls how models generate text completions (for text, code, and structured data generation). It provides fine-grained control over generation behavior and supports both static values and dynamic distribution-based sampling. + +### Fields + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `temperature` | `float` or `Distribution` | No | Controls randomness in generation (0.0 to 2.0). Higher values = more creative/random | +| `top_p` | `float` or `Distribution` | No | Nucleus sampling parameter (0.0 to 1.0). Controls diversity by filtering low-probability tokens | +| `max_tokens` | `int` | No | Maximum number of tokens to generate in the response (≥ 1) | +| `max_parallel_requests` | `int` | No | Maximum concurrent API requests (default: 4, ≥ 1) | +| `timeout` | `int` | No | API request timeout in seconds (≥ 1) | +| `extra_body` | `dict[str, Any]` | No | Additional parameters to include in the API request body | + + +If `temperature`, `top_p`, or `max_tokens` are not provided, the model provider's default values will be used. Different providers and models may have different defaults. 
+ + + +For gpt-oss models like `gpt-oss-20b` and `gpt-oss-120b`, you can control the reasoning effort using the `extra_body` parameter: + +```python +from data_designer.essentials import ChatCompletionInferenceParams + +# High reasoning effort (more thorough, slower) +inference_parameters = ChatCompletionInferenceParams( + extra_body={"reasoning_effort": "high"} +) + +# Medium reasoning effort (balanced) +inference_parameters = ChatCompletionInferenceParams( + extra_body={"reasoning_effort": "medium"} +) + +# Low reasoning effort (faster, less thorough) +inference_parameters = ChatCompletionInferenceParams( + extra_body={"reasoning_effort": "low"} +) +``` + + +### Temperature and Top P Guidelines + +- **Temperature**: + - `0.0-0.3`: Highly deterministic, focused outputs (ideal for structured/reasoning tasks) + - `0.4-0.7`: Balanced creativity and coherence (general purpose) + - `0.8-1.0`: Creative, diverse outputs (ideal for creative writing) + - `1.0+`: Highly random and experimental + +- **Top P**: + - `0.1-0.5`: Very focused, only most likely tokens + - `0.6-0.9`: Balanced diversity + - `0.95-1.0`: Maximum diversity, including less likely tokens + + +When tuning both parameters simultaneously, consider these combinations: + +- **For deterministic/structured outputs**: Low temperature (`0.0-0.3`) + moderate-to-high top_p (`0.8-0.95`) + - The low temperature ensures focus, while top_p allows some token diversity +- **For balanced generation**: Moderate temperature (`0.5-0.7`) + high top_p (`0.9-0.95`) + - This is a good starting point for most use cases +- **For creative outputs**: Higher temperature (`0.8-1.0`) + high top_p (`0.95-1.0`) + - Both parameters work together to maximize diversity + +**Avoid**: Setting both very low (overly restrictive) or adjusting both dramatically at once. When experimenting, adjust one parameter at a time to understand its individual effect. + + +## Distribution-Based Inference Parameters + +For `temperature` and `top_p` in `ChatCompletionInferenceParams`, you can specify distributions instead of fixed values. This allows Data Designer to sample different values for each generation request, introducing controlled variability into your synthetic data. + +### Uniform Distribution + +Samples values uniformly between a low and high bound: + +```python +from data_designer.essentials import ( + ChatCompletionInferenceParams, + UniformDistribution, + UniformDistributionParams, +) + +inference_params = ChatCompletionInferenceParams( + temperature=UniformDistribution( + params=UniformDistributionParams(low=0.7, high=1.0) + ), +) +``` + +### Manual Distribution + +Samples from a discrete set of values with optional weights: + +```python +from data_designer.essentials import ( + ChatCompletionInferenceParams, + ManualDistribution, + ManualDistributionParams, +) + +# Equal probability for each value +inference_params = ChatCompletionInferenceParams( + temperature=ManualDistribution( + params=ManualDistributionParams(values=[0.5, 0.7, 0.9]) + ), +) + +# Weighted probabilities (normalized automatically) +inference_params = ChatCompletionInferenceParams( + top_p=ManualDistribution( + params=ManualDistributionParams( + values=[0.8, 0.9, 0.95], + weights=[0.2, 0.5, 0.3] # 20%, 50%, 30% probability + ) + ), +) +``` + +## Embedding Inference Parameters + +The `EmbeddingInferenceParams` class controls how models generate embeddings. This is used when working with embedding models for tasks like semantic search or similarity analysis. 
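+
+For instance, an embedding model configuration might look like the following sketch (the `extra_body` contents depend on the provider and model; field details are listed below):
+
+```python
+from data_designer.essentials import EmbeddingInferenceParams, ModelConfig
+
+embedding_model = ModelConfig(
+    alias="my-embedding-model",
+    model="nvidia/llama-3.2-nv-embedqa-1b-v2",
+    provider="nvidia",
+    inference_parameters=EmbeddingInferenceParams(
+        encoding_format="float",
+        extra_body={"input_type": "query"},
+    ),
+)
+```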
+ +### Fields + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `encoding_format` | `Literal["float", "base64"]` | No | Format of the embedding encoding (default: "float") | +| `dimensions` | `int` | No | Number of dimensions for the embedding | +| `max_parallel_requests` | `int` | No | Maximum concurrent API requests (default: 4, ≥ 1) | +| `timeout` | `int` | No | API request timeout in seconds (≥ 1) | +| `extra_body` | `dict[str, Any]` | No | Additional parameters to include in the API request body | + + +## See Also + +- **[Default Model Settings](/docs/concepts/models/default-model-settings)**: Pre-configured model settings included with Data Designer +- **[Custom Model Settings](/docs/concepts/models/custom-model-settings)**: Learn how to create custom providers and model configurations +- **[Model Configurations](/docs/concepts/models/model-configs)**: Learn about configuring model settings +- **[Model Providers](/docs/concepts/models/model-providers)**: Learn about configuring model providers diff --git a/fern/v0.3.3/pages/concepts/models/model-configs.mdx b/fern/v0.3.3/pages/concepts/models/model-configs.mdx new file mode 100644 index 00000000..fc4cace5 --- /dev/null +++ b/fern/v0.3.3/pages/concepts/models/model-configs.mdx @@ -0,0 +1,125 @@ +--- +title: Model Configurations +description: Configure model settings for synthetic data generation. +--- + +Model configurations define the specific models you use for synthetic data generation and their associated inference parameters. Each `ModelConfig` represents a named model that can be referenced throughout your data generation workflows. + +## Overview + +A `ModelConfig` specifies which LLM model to use and how it should behave during generation. When you create column configurations (like `LLMText`, `LLMCode`, or `LLMStructured`), you reference a model by its alias. Data Designer uses the model configuration to determine which model to call and with what parameters. + +## ModelConfig Structure + +The `ModelConfig` class has the following fields: + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `alias` | `str` | Yes | Unique identifier for this model configuration (e.g., `"my-text-model"`, `"reasoning-model"`) | +| `model` | `str` | Yes | Model identifier as recognized by the provider (e.g., `"nvidia/nemotron-3-nano-30b-a3b"`, `"gpt-4"`) | +| `inference_parameters` | `InferenceParamsT` | No | Controls model behavior during generation. Use `ChatCompletionInferenceParams` for text/code/structured generation or `EmbeddingInferenceParams` for embeddings. Defaults to `ChatCompletionInferenceParams()` if not provided. The generation type is automatically determined by the inference parameters type. See [Inference Parameters](/docs/concepts/models/inference-parameters) for details. | +| `provider` | `str` | No | Reference to the name of the Provider to use (e.g., `"nvidia"`, `"openai"`, `"openrouter"`). 
If not specified, one set as the default provider, which may resolve to the first provider if there are more than one | + + +## Examples + +### Basic Model Configuration + +```python +from data_designer.essentials import ChatCompletionInferenceParams, ModelConfig + +# Simple model configuration with fixed parameters +model_config = ModelConfig( + alias="my-text-model", + model="nvidia/nemotron-3-nano-30b-a3b", + provider="nvidia", + inference_parameters=ChatCompletionInferenceParams( + temperature=0.85, + top_p=0.95, + max_tokens=2048, + ), +) +``` + +### Multiple Model Configurations for Different Tasks + +```python +from data_designer.essentials import ( + ChatCompletionInferenceParams, + EmbeddingInferenceParams, + GenerationType, + ModelConfig +) + +model_configs = [ + # Creative tasks + ModelConfig( + alias="creative-model", + model="nvidia/nemotron-3-nano-30b-a3b", + provider="nvidia", + inference_parameters=ChatCompletionInferenceParams( + temperature=0.9, + top_p=0.95, + max_tokens=2048, + ), + ), + # Critic tasks + ModelConfig( + alias="critic-model", + model="nvidia/nemotron-3-nano-30b-a3b", + provider="nvidia", + inference_parameters=ChatCompletionInferenceParams( + temperature=0.25, + top_p=0.95, + max_tokens=2048, + ), + ), + # Reasoning and structured tasks + ModelConfig( + alias="reasoning-model", + model="openai/gpt-oss-20b", + provider="nvidia", + inference_parameters=ChatCompletionInferenceParams( + temperature=0.3, + top_p=0.9, + max_tokens=4096, + ), + ), + # Vision tasks + ModelConfig( + alias="vision-model", + model="nvidia/nemotron-nano-12b-v2-vl", + provider="nvidia", + inference_parameters=ChatCompletionInferenceParams( + temperature=0.7, + top_p=0.95, + max_tokens=2048, + ), + ), + # Embedding tasks + ModelConfig( + alias="embedding_model", + model="nvidia/llama-3.2-nv-embedqa-1b-v2", + provider="nvidia", + inference_parameters=EmbeddingInferenceParams( + encoding_format="float", + extra_body={ + "input_type": "query" + } + ) + ) +] +``` + + +The number of tokens required to generate a single data entry can vary significantly with use case. For example, reasoning models often need more tokens to "think through" problems before generating a response. Note that `max_tokens` specifies the **maximum number of output tokens** to generate in the response, so set this value based on the expected length of the generated content. + + +## See Also + +- **[Inference Parameters](/docs/concepts/models/inference-parameters)**: Detailed guide to inference parameters and how to configure them +- **[Model Providers](/docs/concepts/models/model-providers)**: Learn about configuring model providers +- **[Default Model Settings](/docs/concepts/models/default-model-settings)**: Pre-configured model settings included with Data Designer +- **[Custom Model Settings](/docs/concepts/models/custom-model-settings)**: Learn how to create custom providers and model configurations +- **[Configure Model Settings With the CLI](/docs/concepts/models/configure-with-cli)**: Use the CLI to manage model settings +- **[Column Configurations](/api/column-configs)**: Learn how to use models in column configurations diff --git a/fern/v0.3.3/pages/concepts/models/model-providers.mdx b/fern/v0.3.3/pages/concepts/models/model-providers.mdx new file mode 100644 index 00000000..efc877f3 --- /dev/null +++ b/fern/v0.3.3/pages/concepts/models/model-providers.mdx @@ -0,0 +1,56 @@ +--- +title: Model Providers +description: Configure connections to model hosting services. 
+--- + +Model providers are external services that host and serve models. Data Designer uses the `ModelProvider` class to configure connections to these services. + +## Overview + +A `ModelProvider` defines how Data Designer connects to a provider's API endpoint. When you create a `ModelConfig`, you reference a provider by name, and Data Designer uses that provider's settings to make API calls to the appropriate endpoint. + +## ModelProvider Configuration + +The `ModelProvider` class has the following fields: + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `name` | `str` | Yes | Unique identifier for the provider (e.g., `"nvidia"`, `"openai"`, `"openrouter"`) | +| `endpoint` | `str` | Yes | API endpoint URL (e.g., `"https://integrate.api.nvidia.com/v1"`) | +| `provider_type` | `str` | No | Provider type (default: `"openai"`). Uses OpenAI-compatible API format | +| `api_key` | `str` | No | API key or environment variable name (e.g., `"NVIDIA_API_KEY"`) | +| `extra_body` | `dict[str, Any]` | No | Additional parameters to include in the request body of all API requests to the provider. | +| `extra_headers` | `dict[str, str]` | No | Additional headers to include in all API requests to the provider. | + +## API Key Configuration + +The `api_key` field can be specified in two ways: + +1. **Environment variable name** (recommended): Set `api_key` to the name of an environment variable (e.g., `"NVIDIA_API_KEY"`). Data Designer will automatically resolve it at runtime. + +2. **Plain-text value**: Set `api_key` to the actual API key string. This is less secure and not recommended for production use. + +```python +# Method 1: Environment variable (recommended) +provider = ModelProvider( + name="nvidia", + endpoint="https://integrate.api.nvidia.com/v1", + api_key="NVIDIA_API_KEY", # Will be resolved from environment +) + +# Method 2: Direct value (not recommended) +provider = ModelProvider( + name="nvidia", + endpoint="https://integrate.api.nvidia.com/v1", + api_key="nvapi-abc123...", # Direct API key +) +``` + +## See Also + +- **[Model Configurations](/docs/concepts/models/model-configs)**: Learn about configuring models +- **[Inference Parameters](/docs/concepts/models/inference-parameters)**: Detailed guide to inference parameters and how to configure them +- **[Default Model Settings](/docs/concepts/models/default-model-settings)**: Pre-configured providers and model settings included with Data Designer +- **[Custom Model Settings](/docs/concepts/models/custom-model-settings)**: Learn how to create custom providers and model configurations +- **[Configure Model Settings With the CLI](/docs/concepts/models/configure-with-cli)**: Use the CLI to manage providers and model settings +- **[Quick Start Guide](/docs/quick-start)**: Get started with a simple example diff --git a/fern/v0.3.3/pages/concepts/person-sampling.mdx b/fern/v0.3.3/pages/concepts/person-sampling.mdx new file mode 100644 index 00000000..0bb23ce3 --- /dev/null +++ b/fern/v0.3.3/pages/concepts/person-sampling.mdx @@ -0,0 +1,220 @@ +--- +title: Person Sampling in Data Designer +description: Generate synthetic person data for your datasets. +--- + +Person sampling in Data Designer allows you to generate synthetic person data for your datasets. There are two distinct approaches, each with different capabilities and use cases. + +## Overview + +Data Designer provides two ways to generate synthetic people: + +1. 
**Faker-based sampling** - Quick, basic PII generation for testing or when realistic demographic distributions are not relevant for your use case +2. **Nemotron-Personas datasets** - Demographically accurate, rich persona data + +--- + +## Approach 1: Faker-Based Sampling + +### What It Does +Uses the Faker library to generate random personal information. The data is basic and not demographically accurate, but is useful for quick testing, prototyping, or when realistic demographic distributions are not relevant for your use case. + +### Features +- Gives you access to person attributes that Faker exposes +- Quick to set up with no additional downloads +- Generates random names, emails, addresses, phone numbers, etc. +- Supports [all Faker-supported locales](https://faker.readthedocs.io/en/master/locales.html) +- **Not demographically grounded** - data patterns don't reflect real-world demographics + +### Usage Example +```python +from data_designer.essentials import ( + SamplerColumnConfig, + SamplerType, + PersonFromFakerSamplerParams, +) + +config_builder.add_column( + SamplerColumnConfig( + name="customer", + sampler_type=SamplerType.PERSON_FROM_FAKER, + params=PersonFromFakerSamplerParams( + locale="en_US", + age_range=[25, 65], + sex="Female", + ), + ) +) +``` + +For more details, see the documentation for [`SamplerColumnConfig`](/api/column-configs) and [`PersonFromFakerSamplerParams`](/api/sampler-params). + +--- + +## Approach 2: Nemotron-Personas Datasets + +### What It Does +Uses curated Nemotron-Personas datasets from NVIDIA GPU Cloud (NGC) to generate demographically accurate person data with rich personality profiles and behavioral characteristics. + +The NGC datasets are extended versions of the [open-source Nemotron-Personas datasets on HuggingFace](https://huggingface.co/collections/nvidia/nemotron-personas), with additional fields and enhanced data quality. + +Supported locales: + +- `en_US`: United States +- `ja_JP`: Japan +- `en_IN`: India +- `hi_Deva_IN`: India (Devanagari script) +- `hi_Latn_IN`: India (Latin script) + +### Features +- **Demographically accurate personal details**: Names, ages, sex, marital status, education, occupation based on census data +- **Rich persona details**: Comprehensive behavioral profiles including: + - Big Five personality traits with scores + - Cultural backgrounds and narratives + - Skills and hobbies + - Career goals and aspirations + - Context-specific personas (professional, financial, healthcare, sports, arts, travel, culinary, etc.) +- Consistent, referenceable attributes across your dataset +- Grounded in real-world demographic distributions + +### Prerequisites + +To use the extended Nemotron-Personas datasets with Data Designer, you need to download them [from NGC](https://catalog.ngc.nvidia.com/search?orderBy=scoreDESC&query=nemotron+personas) and move them to the Data Designer managed assets directory. + +See below for step-by-step instructions. + +### Nemotron-Personas Datasets Setup Instructions + +#### Step 0: Obtain an NGC API Key and install the NGC CLI + +To download the Nemotron-Personas datasets from NGC, you will need to obtain an NGC API key and install the NGC CLI. + +1. **NGC API Key**: Obtain from [NVIDIA GPU Cloud](https://ngc.nvidia.com/) +2. 
**NGC CLI**: [NGC CLI](https://org.ngc.nvidia.com/setup/installers/cli) + + +#### Step 1: Set Your NGC API Key +```bash +export NGC_API_KEY="your-ngc-api-key-here" +``` + +#### Step 2 (option 1): Download Nemotron-Personas Datasets via the Data Designer CLI + +Once you have the NGC CLI and your NGC API key set up, you can download the datasets via the Data Designer CLI. + +You can pass the locales you want to download as arguments to the CLI command: +```bash +data-designer download personas --locale en_US --locale ja_JP +``` + +Or you can use the interactive mode to select the locales you want to download: +```bash +data-designer download personas +``` + +#### Step 2 (option 2): Download Nemotron-Personas Datasets Directly + +Use the NGC CLI to download the datasets: +```bash +# For Nemotron-Personas USA +ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-en_us" + +# For Nemotron-Personas IN +ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-hi_deva_in" +ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-hi_latn_in" +ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-en_in" + +# For Nemotron-Personas JP +ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-ja_jp" +``` + +Then move the downloaded dataset to the Data Designer managed assets directory: +```bash +mkdir -p ~/.data-designer/managed-assets/datasets/ +mv nemotron-personas-dataset-*/*.parquet ~/.data-designer/managed-assets/datasets/ +``` + +#### Step 3: Use PersonSampler in Your Code +```python +from data_designer.essentials import ( + SamplerColumnConfig, + SamplerType, + PersonSamplerParams, +) + +config_builder.add_column( + SamplerColumnConfig( + name="customer", + sampler_type=SamplerType.PERSON, + params=PersonSamplerParams( + locale="en_US", + sex="Female", + age_range=[25, 45], + with_synthetic_personas=True, + ), + ) +) +``` + +For more details, see the documentation for [`SamplerColumnConfig`](/api/column-configs) and [`PersonSamplerParams`](/api/sampler-params). 
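+
+You can also narrow sampling to specific sub-populations using the filtering parameters described in the configuration table below. For example (the field names and values here are illustrative and depend on the locale's dataset):
+
+```python
+# Uses the same imports as Step 3 above
+config_builder.add_column(
+    SamplerColumnConfig(
+        name="ny_graduate",
+        sampler_type=SamplerType.PERSON,
+        params=PersonSamplerParams(
+            locale="en_US",
+            age_range=[22, 40],
+            select_field_values={"education_level": ["bachelors"]},
+        ),
+    )
+)
+```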
+ +### Available Data Fields + +**Core Fields (all locales):** + +| Field | Type | Notes | +|-------|------|-------| +| `uuid` | UUID | Unique identifier | +| `first_name` | string | | +| `middle_name` | string | | +| `last_name` | string | | +| `sex` | enum | "Male" or "Female" | +| `birth_date` | date | Derived: year, month, day | +| `street_number` | int | | +| `street_name` | string | | +| `unit` | string | Address line 2 | +| `city` | string | | +| `region` | string | Alias: state | +| `district` | string | Alias: county | +| `postcode` | string | Alias: zipcode | +| `country` | string | | +| `phone_number` | PhoneNumber | Derived: area_code, country_code, prefix, line_number | +| `marital_status` | string | Values: never_married, married_present, separated, widowed, divorced | +| `education_level` | string or None | | +| `bachelors_field` | string or None | | +| `occupation` | string or None | | +| `email_address` | string | | +| `national_id` | string | + +**Japan-Specific Fields (`ja_JP`):** + +- `area` + +**India-Specific Fields (`en_IN`, `hi_IN`, `hi_Deva_IN`, `hi_Latn_IN`):** + +- `religion` - Census-reported religion +- `education_degree` - Census-reported education degree +- `first_language` - Native language +- `second_language` - Second language (if applicable) +- `third_language` - Third language (if applicable) +- `zone` - Urban vs rural + +**With Synthetic Personas Enabled:** + +- Big Five personality traits (Openness, Conscientiousness, Extraversion, Agreeableness, Neuroticism) with t-scores and labels +- Cultural background narratives +- Skills and competencies +- Hobbies and interests +- Career goals +- Context-specific personas (professional, financial, healthcare, sports, arts & entertainment, travel, culinary, etc.) + +### Configuration Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `locale` | str | Language/region code - must be one of: "en_US", "ja_JP", "en_IN", "hi_Deva_IN", "hi_Latn_IN" | +| `sex` | str (optional) | Filter by "Male" or "Female" | +| `city` | str or list[str] (optional) | Filter by specific city or cities within locale | +| `age_range` | list[int] (optional) | Two-element list [min_age, max_age] (default: [18, 114]) | +| `with_synthetic_personas` | bool (optional) | Include rich personality profiles (default: False) | +| `select_field_values` | dict (optional) | Custom field-based filtering (e.g., `{"state": ["NY", "CA"], "education_level": ["bachelors"]}`) | diff --git a/fern/v0.3.3/pages/concepts/processors.mdx b/fern/v0.3.3/pages/concepts/processors.mdx new file mode 100644 index 00000000..2efcd1c7 --- /dev/null +++ b/fern/v0.3.3/pages/concepts/processors.mdx @@ -0,0 +1,158 @@ +--- +title: Processors +description: Transformations that modify your dataset before or after columns are generated. +--- + +Processors are transformations that modify your dataset before or after columns are generated. They run at different stages and can reshape, filter, or augment the data. + + +Processors handle transformations that don't fit the "column" model: restructuring the schema for a specific output format, dropping intermediate columns in bulk, or applying batch-wide operations. + + +## Overview + +Each processor: + +- Receives the complete batch DataFrame +- Applies its transformation +- Passes the result to the next processor (or to output) + +Currently, processors run only at the `POST_BATCH` stage, i.e., after column generation completes for each batch. 
+ +## Processor Types + +### 🗑️ Drop Columns Processor + +Removes specified columns from the output dataset. Dropped columns are saved separately in the `dropped-columns` directory for reference. + + +The Drop Columns Processor is different from others in the sense that it does not need to be explicitly added: setting `drop = True` when configuring a column will accomplish the same. + + +**Configuration:** + +```python +from data_designer.essentials import DropColumnsProcessorConfig + +processor = DropColumnsProcessorConfig( + name="remove_intermediate", + column_names=["temp_calculation", "raw_input", "debug_info"], +) +``` + +**Behavior:** + +- Columns specified in `column_names` are removed from the output +- Original values are preserved in a separate parquet file +- Missing columns produce a warning but don't fail the build +- Column configs are automatically marked with `drop=True` when this processor is added + +**Use Cases:** + +- Removing intermediate columns used only for LLM context +- Cleaning up debug or validation columns before final output +- Separating sensitive data from the main dataset + +### 🔄 Schema Transform Processor + +Creates an additional dataset with a transformed schema using Jinja2 templates. The output is written to a separate directory alongside the main dataset. + +**Configuration:** + +```python +from data_designer.essentials import SchemaTransformProcessorConfig + +processor = SchemaTransformProcessorConfig( + name="chat_format", + template={ + "messages": [ + {"role": "user", "content": "{{ question }}"}, + {"role": "assistant", "content": "{{ answer }}"}, + ], + "metadata": "{{ category | upper }}", + }, +) +``` + +**Behavior:** + +- Each key in `template` becomes a column in the transformed dataset +- Values are Jinja2 templates with access to all columns in the batch +- Complex structures (lists, nested dicts) are supported +- Output is saved to the `processors-outputs/{name}/` directory +- The original dataset passes through unchanged + +**Template Capabilities:** + +- **Variable substitution**: `{{ column_name }}` +- **Filters**: `{{ text | upper }}`, `{{ text | lower }}`, `{{ text | trim }}` +- **Nested structures**: Arbitrarily deep JSON structures +- **Lists**: `["{{ col1 }}", "{{ col2 }}"]` + +**Use Cases:** + +- Converting flat columns to chat message format +- Restructuring data for specific model training formats +- Creating derived views without modifying the source dataset + +## Using Processors + +Add processors to your configuration using the builder's `add_processor` method: + +```python +from data_designer.essentials import ( + DataDesignerConfigBuilder, + DropColumnsProcessorConfig, + SchemaTransformProcessorConfig, +) + +builder = DataDesignerConfigBuilder() + +# ... add columns ... + +# Drop intermediate columns +builder.add_processor( + DropColumnsProcessorConfig( + name="cleanup", + column_names=["scratch_work", "raw_context"], + ) +) + +# Transform to chat format +builder.add_processor( + SchemaTransformProcessorConfig( + name="chat_format", + template={ + "messages": [ + {"role": "user", "content": "{{ question }}"}, + {"role": "assistant", "content": "{{ answer }}"}, + ], + }, + ) +) +``` + +### Execution Order + +Processors execute in the order they're added. Plan accordingly when one processor's output affects another. 
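+
+For example, since each processor sees the output of the processors added before it, add a schema transform that references a column before the processor that drops that column. A sketch, reusing `builder` and the imports from the example above (column and processor names are illustrative):
+
+```python
+# The schema transform still sees "raw_context" because it runs first;
+# the drop processor then removes the column from the final output.
+builder.add_processor(
+    SchemaTransformProcessorConfig(
+        name="context_view",
+        template={"context": "{{ raw_context }}", "answer": "{{ answer }}"},
+    )
+)
+builder.add_processor(
+    DropColumnsProcessorConfig(
+        name="drop_context",
+        column_names=["raw_context"],
+    )
+)
+```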
+ +## Configuration Parameters + +### Common Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `name` | str | Identifier for the processor, used in output directory names | +| `build_stage` | BuildStage | When to run (default: `POST_BATCH`) | + +### DropColumnsProcessorConfig + +| Parameter | Type | Description | +|-----------|------|-------------| +| `column_names` | list[str] | Columns to remove from output | + +### SchemaTransformProcessorConfig + +| Parameter | Type | Description | +|-----------|------|-------------| +| `template` | dict[str, Any] | Jinja2 template defining the output schema. Must be JSON-serializable. | diff --git a/fern/v0.3.3/pages/concepts/validators.mdx b/fern/v0.3.3/pages/concepts/validators.mdx new file mode 100644 index 00000000..7b50d618 --- /dev/null +++ b/fern/v0.3.3/pages/concepts/validators.mdx @@ -0,0 +1,345 @@ +--- +title: Validators +description: Quality assurance mechanisms that check generated content against rules. +--- + +Validators are quality assurance mechanisms in Data Designer that check generated content against rules and return structured pass/fail results. They enable automated verification of data for correctness, code quality, and adherence to specifications. + + +Validators act as **quality gates** in your generation pipeline. Use them to filter invalid records, score code quality, verify format compliance, or integrate with external validation services. + + +## Overview + +Validation columns execute validation logic against target columns and produce structured results indicating: + +- **`is_valid`**: Boolean pass/fail status +- **Additional metadata**: Error messages, scores, severity levels, and custom fields + +Validators currently support three execution strategies: + +1. **Code validation**: Lint and check Python or SQL code using industry-standard tools +2. **Local callable validation**: Execute custom Python functions for flexible validation logic +3. **Remote validation**: Send data to HTTP endpoints for external validation services + +## Validator Types + +### 🐍 Python Code Validator + +The Python code validator runs generated Python code through [Ruff](https://github.com/astral-sh/ruff), a fast Python linter that checks for syntax errors, undefined variables, and code quality issues. + +**Configuration:** + +```python +from data_designer.essentials import CodeLang, CodeValidatorParams + +validator_params = CodeValidatorParams(code_lang=CodeLang.PYTHON) +``` + +**Validation Output:** + +Each validated record returns: + +- **`is_valid`**: `True` if no fatal or error-level issues found +- **`python_linter_score`**: Quality score from 0-10 (based on pylint formula) +- **`python_linter_severity`**: Highest severity level found (`"none"`, `"convention"`, `"refactor"`, `"warning"`, `"error"`, `"fatal"`) +- **`python_linter_messages`**: List of linter messages with line numbers, columns, and descriptions + +**Severity Levels:** + +- **Fatal**: Syntax errors preventing code execution +- **Error**: Undefined names, invalid syntax +- **Warning**: Code smells and potential issues +- **Refactor**: Simplification opportunities +- **Convention**: Style guide violations + +A record is marked valid if it has no messages or only messages at warning/convention/refactor levels. 
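+
+For instance, a generated snippet like the hypothetical one below references an undefined name, so it would be flagged at the error level and produce a result like the one shown next:
+
+```python
+print(it)  # `it` is never defined, so the linter reports an undefined-name error (F821)
+```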
+ +**Example Validation Result:** + +```python +{ + "is_valid": False, + "python_linter_score": 0, + "python_linter_severity": "error", + "python_linter_messages": [ + { + "type": "error", + "symbol": "F821", + "line": 1, + "column": 7, + "message": "Undefined name `it`" + } + ] +} +``` + +### 🗄️ SQL Code Validator + +The SQL code validator uses [SQLFluff](https://github.com/sqlfluff/sqlfluff), a dialect-aware SQL linter that checks query syntax and structure. + +**Configuration:** + +```python +from data_designer.essentials import CodeLang, CodeValidatorParams + +validator_params = CodeValidatorParams(code_lang=CodeLang.SQL_POSTGRES) +``` + + +The SQL code validator supports multiple dialects: `SQL_POSTGRES`, `SQL_ANSI`, `SQL_MYSQL`, `SQL_SQLITE`, `SQL_TSQL` and `SQL_BIGQUERY`. + + +**Validation Output:** + +Each validated record returns: + +- **`is_valid`**: `True` if no parsing errors found +- **`error_messages`**: Concatenated error descriptions (empty string if valid) + +The validator focuses on parsing errors (PRS codes) that indicate malformed SQL. It also checks for common pitfalls like `DECIMAL` definitions without scale parameters. + +**Example Validation Result:** + +```python +# Valid SQL +{ + "is_valid": True, + "error_messages": "" +} + +# Invalid SQL +{ + "is_valid": False, + "error_messages": "PRS: Line 1, Position 1: Found unparsable section: 'NOT SQL'" +} +``` + +### 🔧 Local Callable Validator + +The local callable validator executes custom Python functions for flexible validation logic. + +**Configuration:** + +```python +import pandas as pd + +from data_designer.essentials import LocalCallableValidatorParams + +def my_validation_function(df: pd.DataFrame) -> pd.DataFrame: + """Validate that values are positive. + + Args: + df: DataFrame with target columns + + Returns: + DataFrame with is_valid column and optional metadata + """ + result = pd.DataFrame() + result["is_valid"] = df["price"] > 0 + result["error_message"] = result["is_valid"].apply( + lambda valid: "" if valid else "Price must be positive" + ) + return result + +validator_params = LocalCallableValidatorParams( + validation_function=my_validation_function, + output_schema={ # Optional: enforce output schema + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "type": "object", + "properties": { + "is_valid": {"type": ["boolean", "null"]}, + "error_message": {"type": "string"} + }, + "required": ["is_valid"] + } + } + } + } +) +``` + +**Function Requirements:** + +- **Input**: DataFrame with target columns +- **Output**: DataFrame with `is_valid` column (boolean or null) +- **Extra fields**: Any additional columns become validation metadata + +The `output_schema` parameter is optional but recommended—it validates the function's output against a JSON schema, catching unexpected return formats. + +### 🌐 Remote Validator + +The remote validator sends data to HTTP endpoints for validation-as-a-service. This is useful for when you have validation software that needs to run on external compute and you can expose it through a service. Some examples are: + +- External linting services +- Security scanners +- Domain-specific validators +- Proprietary validation systems + + +Currently, the remote validator is only able to perform unauthenticated API calls. When implementing your own service, you can rely on network isolation for security. If you need to reach a service that requires authentication, you should implement a local proxy. 
+ + +**Configuration:** + +```python +from data_designer.essentials import RemoteValidatorParams + +validator_params = RemoteValidatorParams( + endpoint_url="https://api.example.com/validate", + timeout=30.0, # Request timeout in seconds + max_retries=3, # Retry attempts on failure + retry_backoff=2.0, # Exponential backoff factor + max_parallel_requests=4, # Concurrent request limit + output_schema={ # Optional: enforce response schema + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "type": "object", + "properties": { + "is_valid": {"type": ["boolean", "null"]}, + "confidence": {"type": "string"} + } + } + } + } + } +) +``` + +**Request Format:** + +The validator sends POST requests with this structure: + +```json +{ + "data": [ + {"column1": "value1", "column2": "value2"}, + {"column1": "value3", "column2": "value4"} + ] +} +``` + +**Expected Response Format:** + +The endpoint must return: + +```json +{ + "data": [ + { + "is_valid": true, + "custom_field": "any additional metadata" + }, + { + "is_valid": false, + "custom_field": "more metadata" + } + ] +} +``` + +**Retry Behavior:** + +The validator automatically retries on: + +- Network errors +- HTTP status codes: 429 (rate limit), 500, 502, 503, 504 + +Failed requests use exponential backoff: `delay = retry_backoff^attempt`. + +**Parallelization:** + +Set `max_parallel_requests` to control concurrency. Higher values improve throughput but increase server load. The validator batches requests according to the `batch_size` parameter in the validation column configuration. + +## Using Validators in Columns + +Add validation columns to your configuration using the builder's `add_column` method: + +```python +from data_designer.essentials import ( + CodeValidatorParams, + CodeLang, + DataDesignerConfigBuilder, + LLMCodeColumnConfig, + ValidationColumnConfig, +) + +builder = DataDesignerConfigBuilder() + +# Generate Python code +builder.add_column( + LLMCodeColumnConfig( + name="sorting_algorithm", + prompt="Write a Python function to sort a list using bubble sort.", + code_lang="python", + model_alias="my-model" + ) +) + +# Validate the generated code +builder.add_column( + ValidationColumnConfig( + name="code_validation", + target_columns=["sorting_algorithm"], + validator_type="code", + validator_params=CodeValidatorParams(code_lang=CodeLang.PYTHON), + batch_size=10, + drop=False, + ) +) +``` + +The `target_columns` parameter specifies which columns to validate. All target columns are passed to the validator together (except for code validators, which process each column separately). + +### Configuration Parameters + +See more about parameters used to instantiate `ValidationColumnConfig` in the [API reference](/api/column-configs). + +### Batch Size Considerations + +Larger batch sizes improve efficiency but consume more memory: + +- **Code validators**: 5-20 records (file I/O overhead) +- **Local callable**: 10-50 records (depends on function complexity) +- **Remote validators**: 1-10 records (network latency, server capacity) + +Adjust based on: + +- Validator computational cost +- Available memory +- Network bandwidth (for remote validators) +- Server rate limits + +If the validation logic uses information from other samples, only samples in the batch will be considered. 
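For example, a uniqueness check implemented as a local callable validator only sees the records in its current batch, so duplicates that are split across batches will not be flagged. The sketch below is illustrative and assumes a hypothetical `product_name` target column; attach it to a `ValidationColumnConfig` exactly as shown above.

```python
import pandas as pd

from data_designer.essentials import LocalCallableValidatorParams


def check_unique_in_batch(df: pd.DataFrame) -> pd.DataFrame:
    """Flag records whose product_name repeats within the current batch."""
    result = pd.DataFrame()
    duplicated = df["product_name"].duplicated(keep=False)
    result["is_valid"] = ~duplicated
    result["error_message"] = duplicated.map(
        lambda is_dup: "Duplicate product_name within this batch" if is_dup else ""
    )
    return result


validator_params = LocalCallableValidatorParams(validation_function=check_unique_in_batch)
```

Because only the current batch is visible to the validation function, raising `batch_size` widens the scope of such cross-record checks at the cost of additional memory.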
+ +### Multiple Column Validation + +Validate multiple columns simultaneously: + +```python +from data_designer.essentials import RemoteValidatorParams, ValidationColumnConfig + +builder.add_column( + ValidationColumnConfig( + name="multi_column_validation", + target_columns=["column_a", "column_b", "column_c"], + validator_type="remote", + validator_params=RemoteValidatorParams( + endpoint_url="https://api.example.com/validate" + ) + ) +) +``` + +**Note**: Code validators always process each target column separately, even when multiple columns are specified. Local callable and remote validators receive all target columns together. + +## See Also + +- [Validator Parameters Reference](/api/validator-params): Configuration object schemas diff --git a/fern/v0.3.3/pages/contributing.mdx b/fern/v0.3.3/pages/contributing.mdx new file mode 100644 index 00000000..b317a75a --- /dev/null +++ b/fern/v0.3.3/pages/contributing.mdx @@ -0,0 +1,239 @@ +--- +title: 🎨✨ Contributing to NeMo Data Designer 🎨✨ +description: How to contribute to NeMo Data Designer +--- + +Thank you for your interest in contributing to Data Designer! + +We welcome contributions from the community and sincerely appreciate your efforts to improve the project. Whether you're fixing a typo, reporting a bug, proposing a new feature, or implementing a major enhancement, your work helps make Data Designer better for everyone 🎉. + +This guide will help you get started with the contribution process. + +## Table of Contents + +- [Getting Started](#getting-started) +- [Ways to Contribute](#ways-to-contribute) +- [Feature Requests](#feature-requests) +- [Development Guide](#development-guide) +- [Submitting Changes](#submitting-changes) +- [Code of Conduct](#code-of-conduct) +- [Signing off on your work](#signing-off-on-your-work) + + +## Getting Started +👋 Welcome to the Data Designer community! We're excited to have you here. + +Whether you're new to the project or ready to dive in, the resources below will help you get oriented and productive quickly: + +1. **[README.md](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/README.md)** – best place to start to learn the basics of the project + +2. **[AGENTS.md](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/AGENTS.md)** – context and instructions to help AI coding agents work on Data Designer (it's also useful for human developers!) + +3. **[Documentation](https://nvidia-nemo.github.io/DataDesigner/)** – detailed documentation on Data Designer's capabilities and usage + +## Ways to Contribute + +There are many ways to contribute to Data Designer: + +### 🐛 Bug Fixes + +Found a bug? Before reporting, please +1. Verify you're using the latest version: `uv pip install --upgrade data-designer` +2. Search for duplicates in the [issue tracker](https://github.com/NVIDIA-NeMo/DataDesigner/issues) + +When [creating a bug report](https://github.com/NVIDIA-NeMo/DataDesigner/issues/new), please include: +- Data Designer version +- Python version and operating system +- Minimal reproducible example +- Expected vs. actual behavior +- Full error messages and stack traces + +If you are interested in fixing the bug yourself, that's AWESOME! Please follow the [development guide](#development-guide) to get started. + +### ✨ Feature Implementation +Want to add new functionality? Great! Please review [our development approach](#feature-requests) and open a feature request to discuss the idea and get feedback before investing significant time on the implementation. 
+ +### 📖 Documentation Improvements +Documentation is crucial for user adoption. Contributions that clarify usage, add examples, or fix typos are highly valued. + +### 💡 Examples and Tutorials +Share your use cases! Example notebooks and tutorials help others understand how to leverage Data Designer effectively. + +### 🧪 Test Coverage +Help us improve test coverage by adding tests for untested code paths or edge cases. + +## Feature Requests +Data Designer is designed to be as flexible and extensible as possible, and we welcome your ideas for pushing its capabilities even further! To keep the core library maintainable, while also supporting innovation, we take an incremental approach when adding new features – we explore what's already possible, extend through plugins when needed, and integrate the most broadly useful features into the core library: + +### How We Grow Data Designer +1. 🧗 **Explore what's possible**: Can your use case be achieved with current features? We've designed Data Designer to be composable – sometimes creative combinations of existing tools can accomplish what you need. Check out our examples or open an issue if you'd like help exploring this! + +2. 🔌 **Extend through plugins**: If existing features aren't quite enough, consider implementing your idea as a plugin that extends the core library. Plugins let you experiment and share functionality while keeping the core library focused. + +3. ⚙️ **Integrate into the core library**: If your feature or plugin proves broadly useful and aligns with Data Designer's goals, we'd love to integrate it into the core library! We're happy to discuss whether it's a good fit and how to move forward together. + +This approach helps us grow thoughtfully while keeping Data Designer focused and maintainable. + +### Submitting a Feature Request +Open a [new issue](https://github.com/NVIDIA-NeMo/DataDesigner/issues/new) with: + +- **Clear title**: Concise description of the feature +- **Use case**: Explain what problem this solves and why it's important +- **Proposed solution**: Describe how you envision the feature working +- **Alternatives considered**: Other approaches you've thought about +- **Examples**: Code examples or mockups of how users would interact with the feature +- **Willingness to implement**: Are you interested in implementing this yourself? + +## Development Guide +Data Designer uses [`uv`](https://github.com/astral-sh/uv) for dependency management. If you don't have uv installed, follow their [installation instructions](https://docs.astral.sh/uv/getting-started/installation/). + +### Initial Setup +0. **Create or find an issue** + + Before starting work, ensure there's an issue tracking your contribution: + + - For bug fixes: Search [existing issues](https://github.com/NVIDIA-NeMo/DataDesigner/issues) or [create a new one](https://github.com/NVIDIA-NeMo/DataDesigner/issues/new) + - For new features: Open a [feature request](#feature-requests) to discuss the approach first + - Comment on the issue to let maintainers know you're working on it + +1. **Fork and clone the repository** + + Start by [forking the Data Designer repository](https://github.com/NVIDIA-NeMo/DataDesigner/fork), then clone your fork and add the upstream remote: + + ```bash + git clone https://github.com/YOUR_GITHUB_USERNAME/DataDesigner.git + + cd DataDesigner + + git remote add upstream https://github.com/NVIDIA-NeMo/DataDesigner.git + ``` + +2. 
**Install dependencies** + + ```bash + # Install project with dev dependencies + make install-dev + + # Or, if you use Jupyter / IPython for development + make install-dev-notebooks + ``` + +3. **Verify your setup** + + ```bash + make test && make check-all + ``` + + If no errors are reported, you're ready to develop 🚀 + +### Making Changes + +1. **Create a feature branch** + + ```bash + git checkout main + git pull upstream main + git checkout -b //- + ``` + + Example types of change: + + - `feat` for new features + - `fix` for bug fixes + - `docs` for documentation updates + - `test` for testing changes + - `refactor` for code refactoring + - `chore` for chore tasks + - `style` for style changes + - `perf` for performance improvements + + Example branch name: + + - `johnnygreco/feat/123-add-xyz-generator` for a new feature by @johnnygreco, addressing issue #123 + +2. **Develop your changes** + + Please follow the patterns and conventions used throughout the codebase, as well as those outlined in [AGENTS.md](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/AGENTS.md). + +3. **Test and validate** + + ```bash + make check-all-fix # Format code and fix linting issues + make test # Run all tests + make coverage # Check test coverage (must be >90%) + ``` + + **Writing tests**: Place tests in [tests/](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/tests/) mirroring the source structure. Use fixtures from [tests/conftest.py](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/tests/conftest.py), mock external services with `unittest.mock` or `pytest-httpx`, and test both success and failure cases. See [AGENTS.md](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/AGENTS.md) for patterns and examples. + +4. **Commit your work** + + Write clear, descriptive commit messages, optionally including a brief summary (50 characters or less) and reference issue numbers when applicable (e.g., "Fixes #123"). + + ```bash + git commit -m "Add XYZ generator for synthetic data" -m "Fixes #123" + ``` + +5. **Stay up to date** + + Regularly sync your branch with upstream changes: + + ```bash + git fetch upstream + git merge upstream/main + ``` + +## Submitting Changes + +### Before Submitting + +Ensure your changes meet the following criteria: + +- All tests pass (`make test`) +- Code is formatted and linted (`make check-all-fix`) +- New functionality includes tests +- Documentation is updated (README, docstrings, examples) +- License headers are present on all new files +- Commit messages are clear and descriptive + +### Creating a Pull Request + +1. **Push your changes** to your fork: + + ```bash + git push origin //- + ``` + +2. **Open a pull request** on GitHub from your fork to the main repository + +3. **Respond to review feedback** update your PR as needed + +### Pull Request Review Process + +- Maintainers will review your PR and may request changes +- Address feedback by pushing additional commits to your branch +- Reply to the feedback comment with a link to the commit that addresses it. +- Once approved, a maintainer will merge your PR +- Your contribution will be included in the next release! + +## Code of Conduct +Data Designer follows the Contributor Covenant Code of Conduct. We are committed to providing a welcoming and inclusive environment for all contributors. + +**Please read our complete [Code of Conduct](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/CODE_OF_CONDUCT.md)** for full details on our standards and expectations. 
+ +### License File Headers +All code files that are added to this repository must include the appropriate NVIDIA copyright header: + +```python +# SPDX-FileCopyrightText: Copyright (c) {YEAR} NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +``` + +Use `make update-license-headers` to add headers automatically. + +## Signing off on your work + +When contributing to this project, you must agree that you have authored 100% of the content, that you have the necessary rights to the content and that the content you contribute may be provided under the project license. All contributors are asked to sign the Data Designer [Developer Certificate of Origin (DCO)](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/DCO) when submitting their first pull request. The process is automated by a bot that will comment on the pull request. Our DCO is the same as the Linux Foundation requires its contributors to sign. + +--- + +Thank you for contributing to NeMo Data Designer! Your efforts help make synthetic data generation more accessible and powerful for everyone. 🎨✨ diff --git a/fern/v0.3.3/pages/index.mdx b/fern/v0.3.3/pages/index.mdx new file mode 100644 index 00000000..218beadf --- /dev/null +++ b/fern/v0.3.3/pages/index.mdx @@ -0,0 +1,51 @@ +--- +title: 🎨 NeMo Data Designer Library +description: A general framework for generating high-quality synthetic data from scratch or using seed data. +--- + +[![GitHub](https://img.shields.io/badge/github-repo-952fc6?logo=github)](https://github.com/NVIDIA-NeMo/DataDesigner) [![License](https://img.shields.io/badge/License-Apache_2.0-0074df.svg)](https://opensource.org/licenses/Apache-2.0) [![NeMo Microservices](https://img.shields.io/badge/NeMo-Microservices-76b900)](https://docs.nvidia.com/nemo/microservices/latest/index.html) + +👋 Welcome to the Data Designer community! We're excited to have you here. + +Data Designer is a **general framework** for generating **high-quality** synthetic data **from scratch** or using your own **seed data** as a starting point for domain-grounded data generation. + +## Why Data Designer? + +Generating high-quality synthetic data requires much more than iteratively calling an LLM. + +Data Designer is **purpose-built** to support large-scale, high-quality data generation, including + + * **Diversity** – statistical distributions and variety that reflect real-world data patterns, not repetitive LLM outputs + * **Correlations** – meaningful relationships between fields that LLMs cannot maintain across independent calls + * **Steerability** – flexible control over data characteristics throughout the generation process + * **Validation** – automated quality checks and verification that data meets specifications + * **Reproducibility** – shareable and reproducible generation workflows + +## How does it work? + +Data Designer helps you create datasets through an intuitive, **iterative** process: + +1. **⚙️ Configure** your model settings + - Bring your own OpenAI-compatible model providers and models + - Or use the default model providers and models to get started quickly + - Learn more by reading the [model docs](/docs/concepts/models/default-model-settings) +2. **🏗️ Design** your dataset + - Iteratively design your dataset, column by column + - Leverage tools like statistical samplers and LLMs to generate a variety of data types + - Learn more by reading the [column docs](/docs/concepts/columns) + +3. 
**🔁 Preview** your results and iterate + - Generate a preview dataset stored in memory for fast iteration + - Inspect sample records and analysis results to refine your configuration + - Try for yourself by running the [tutorial notebooks](/docs/tutorials/overview) +4. **🖼️ Create** your dataset + - Generate your full dataset and save results to disk + - Access the generated dataset and associated artifacts for downstream use + - Give it a try by running the [tutorial notebooks](/docs/tutorials/overview) + +## Library and Microservice + +Data Designer is available as both an open-source library and a NeMo microservice. + + * **Open-source Library**: Purpose-built for flexibility and customization, prioritizing UX excellence, modularity, and extensibility. + * **NeMo Microservice**: An enterprise-grade solution that offers a seamless transition from the library, allowing you to leverage other NeMo microservices and generate datasets at scale. See the [microservice docs](https://docs.nvidia.com/nemo/microservices/latest/design-synthetic-data-from-scratch-or-seeds/index.html) for more details. diff --git a/fern/v0.3.3/pages/installation.mdx b/fern/v0.3.3/pages/installation.mdx new file mode 100644 index 00000000..ff7bbd6e --- /dev/null +++ b/fern/v0.3.3/pages/installation.mdx @@ -0,0 +1,36 @@ +--- +title: Installation +description: How to install Data Designer +--- + +Installing Data Designer is as simple as: + + + + ```bash + pip install data-designer + ``` + + + ```bash + uv add data-designer + ``` + + + +## Development Installation + +To install the latest development version from the GitHub repository: + + + + ```bash + pip install 'git+https://github.com/NVIDIA-NeMo/DataDesigner@main' + ``` + + + ```bash + uv add 'git+https://github.com/NVIDIA-NeMo/DataDesigner@main' + ``` + + diff --git a/fern/v0.3.3/pages/plugins/available.mdx b/fern/v0.3.3/pages/plugins/available.mdx new file mode 100644 index 00000000..594e53ac --- /dev/null +++ b/fern/v0.3.3/pages/plugins/available.mdx @@ -0,0 +1,6 @@ +--- +title: "🚧 Available Plugins: Coming Soon" +description: List of available Data Designer plugins. +--- + +This page will list available Data Designer plugins. Stay tuned! diff --git a/fern/v0.3.3/pages/plugins/example.mdx b/fern/v0.3.3/pages/plugins/example.mdx new file mode 100644 index 00000000..0be568cb --- /dev/null +++ b/fern/v0.3.3/pages/plugins/example.mdx @@ -0,0 +1,310 @@ +--- +title: "Example Plugin: Index Multiplier" +description: A complete walkthrough for creating a Data Designer plugin. +--- + + +The plugin system is currently **experimental** and under active development. The documentation, examples, and plugin interface are subject to significant changes in future releases. If you encounter any issues, have questions, or have ideas for improvement, please consider starting [a discussion on GitHub](https://github.com/NVIDIA-NeMo/DataDesigner/discussions). + + +In this guide, we will build a simple plugin that generates values by multiplying the row index by a user-specified multiplier. Admittedly, not the most useful plugin, but it demonstrates the required steps 😜. + +A Data Designer plugin is implemented as a Python package with three main components: + +1. **Configuration Class**: Defines the parameters users can configure +2. **Task Class**: Contains the core implementation of the plugin +3. **Plugin Object**: Connects the config and task classes to make the plugin discoverable + +Let's build the `data-designer-index-multiplier` plugin step by step. 
+ +## Step 1: Create a Python package + +Data Designer plugins are implemented as Python packages. We recommend using a standard structure for your plugin package. + +For example, here is the structure of a `data-designer-index-multiplier` plugin: + +``` +data-designer-index-multiplier/ +├── pyproject.toml +└── src/ + └── data_designer_index_multiplier/ + ├── __init__.py + └── plugin.py +``` + +## Step 2: Create the config class + +The configuration class defines what parameters users can set when using your plugin. For column generator plugins, it must inherit from `SingleColumnConfig` and include a [discriminator field](https://docs.pydantic.dev/latest/concepts/unions/#discriminated-unions). + +```python +from typing import Literal +from data_designer.config.column_configs import SingleColumnConfig + +class IndexMultiplierColumnConfig(SingleColumnConfig): + """Configuration for the index multiplier column generator.""" + + # Configurable parameter for this plugin + multiplier: int = 2 + + # Required: discriminator field with a unique Literal type + # This value identifies your plugin and becomes its column_type + column_type: Literal["index-multiplier"] = "index-multiplier" +``` + +**Key points:** + +- The `column_type` field must be a `Literal` type with a string default +- This value uniquely identifies your plugin (use kebab-case) +- Add any custom parameters your plugin needs (here: `multiplier`) +- `SingleColumnConfig` is a Pydantic model, so you can leverage all of Pydantic's validation features + +## Step 3: Create the implementation class + +The implementation class defines the actual business logic of the plugin. For column generator plugins, it inherits from `ColumnGenerator` and must implement a `metadata` static method and `generate` method: + + +```python +import logging +import pandas as pd + +from data_designer.engine.column_generators.generators.base import ( + ColumnGenerator, + GenerationStrategy, + GeneratorMetadata, +) + +# Data Designer uses the standard Python logging module for logging +logger = logging.getLogger(__name__) + +class IndexMultiplierColumnGenerator(ColumnGenerator[IndexMultiplierColumnConfig]): + @staticmethod + def metadata() -> GeneratorMetadata: + """Define metadata about this generator.""" + return GeneratorMetadata( + name="index-multiplier", + description="Generates values by multiplying the row index by a user-specified multiplier", + generation_strategy=GenerationStrategy.FULL_COLUMN, + ) + + def generate(self, data: pd.DataFrame) -> pd.DataFrame: + """Generate the column data. + + Args: + data: The current DataFrame being built + + Returns: + The DataFrame with the new column added + """ + logger.info( + f"Generating column {self.config.name} " + f"with multiplier {self.config.multiplier}" + ) + + # Access config via self.config + data[self.config.name] = data.index * self.config.multiplier + + return data +``` + +**Key points:** + +- Generic type `ColumnGenerator[IndexMultiplierColumnConfig]` connects the task to its config +- `metadata()` describes your generator and its requirements +- `generation_strategy` can be `FULL_COLUMN`, `CELL_BY_CELL` +- You have access to the configuration parameters via `self.config` + + +The `generation_strategy` specifies how the column generator will generate data. 
+ +- **`FULL_COLUMN`**: Generates the full column (at the batch level) in a single call to `generate` + - `generate` must take as input a `pd.DataFrame` with all previous columns and return a `pd.DataFrame` with the generated column appended + +- **`CELL_BY_CELL`**: Generates one cell at a time + - `generate` must take as input a `dict` with key/value pairs for all previous columns and return a `dict` with an additional key/value for the generated cell + - Supports concurrent workers via a `max_parallel_requests` parameter on the configuration + + +## Step 4: Create the plugin object + +Create a `Plugin` object that makes the plugin discoverable and connects the task and config classes. + +```python +from data_designer.plugins import Plugin, PluginType + +# Plugin instance - this is what gets loaded via entry point +plugin = Plugin( + impl_qualified_name="data_designer_index_multiplier.plugin.IndexMultiplierColumnGenerator", + config_qualified_name="data_designer_index_multiplier.plugin.IndexMultiplierColumnConfig", + plugin_type=PluginType.COLUMN_GENERATOR, + emoji="🔌", +) +``` + +### Complete plugin code + +Pulling it all together, here is the complete plugin code for `src/data_designer_index_multiplier/plugin.py`: + +```python +import logging +from typing import Literal + +import pandas as pd + +from data_designer.config.column_configs import SingleColumnConfig +from data_designer.engine.column_generators.generators.base import ( + ColumnGenerator, + GenerationStrategy, + GeneratorMetadata, +) +from data_designer.plugins import Plugin, PluginType + +# Data Designer uses the standard Python logging module for logging +logger = logging.getLogger(__name__) + + +class IndexMultiplierColumnConfig(SingleColumnConfig): + """Configuration for the index multiplier column generator.""" + + # Configurable parameter for this plugin + multiplier: int = 2 + + # Required: discriminator field with a unique Literal type + # This value identifies your plugin and becomes its column_type + column_type: Literal["index-multiplier"] = "index-multiplier" + + +class IndexMultiplierColumnGenerator(ColumnGenerator[IndexMultiplierColumnConfig]): + @staticmethod + def metadata() -> GeneratorMetadata: + """Define metadata about this generator.""" + return GeneratorMetadata( + name="index-multiplier", + description="Generates values by multiplying the row index by a user-specified multiplier", + generation_strategy=GenerationStrategy.FULL_COLUMN, + ) + + def generate(self, data: pd.DataFrame) -> pd.DataFrame: + """Generate the column data. 
+ + Args: + data: The current DataFrame being built + + Returns: + The DataFrame with the new column added + """ + logger.info( + f"Generating column {self.config.name} " + f"with multiplier {self.config.multiplier}" + ) + + # Access config via self.config + data[self.config.name] = data.index * self.config.multiplier + + return data + + +# Plugin instance - this is what gets loaded via entry point +plugin = Plugin( + impl_qualified_name="data_designer_index_multiplier.plugin.IndexMultiplierColumnGenerator", + config_qualified_name="data_designer_index_multiplier.plugin.IndexMultiplierColumnConfig", + plugin_type=PluginType.COLUMN_GENERATOR, + emoji="🔌", +) +``` + +## Step 5: Package your plugin + +Create a `pyproject.toml` file to define your package and register the entry point: + +```toml +[project] +name = "data-designer-index-multiplier" +version = "1.0.0" +description = "Data Designer index multiplier plugin" +requires-python = ">=3.10" +dependencies = [ + "data-designer", +] + +# Register this plugin via entry points +[project.entry-points."data_designer.plugins"] +index-multiplier = "data_designer_index_multiplier.plugin:plugin" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/data_designer_index_multiplier"] +``` + + +Plugins are discovered automatically using [Python entry points](https://packaging.python.org/en/latest/guides/creating-and-discovering-plugins/#using-package-metadata). It is important to register your plugin as an entry point under the `data_designer.plugins` group. + +The entry point format is: + +```toml +[project.entry-points."data_designer.plugins"] + = ":" +``` + + +## Step 6: Use your plugin + +Install your plugin in editable mode for testing: + +```bash +# From the plugin directory +uv pip install -e . +``` + +Once installed, your plugin works just like built-in column types: + +```python +from data_designer_index_multiplier.plugin import IndexMultiplierColumnConfig + +from data_designer.essentials import ( + CategorySamplerParams, + DataDesigner, + DataDesignerConfigBuilder, + SamplerColumnConfig, +) + +data_designer = DataDesigner() +builder = DataDesignerConfigBuilder() + +# Add a regular column +builder.add_column( + SamplerColumnConfig( + name="category", + sampler_type="category", + params=CategorySamplerParams(values=["A", "B", "C"]), + ) +) + +# Add your custom plugin column +builder.add_column( + IndexMultiplierColumnConfig( + name="v", + multiplier=5, + ) +) + +# Generate data +results = data_designer.create(builder, num_records=10) +print(results.load_dataset()) +``` + +Output: + +``` + category multiplied-index +0 B 0 +1 A 5 +2 C 10 +3 A 15 +4 B 20 +... +``` + +That's it! You have now created and used your first Data Designer plugin. The last step is to package your plugin and share it with the community 🚀 diff --git a/fern/v0.3.3/pages/plugins/overview.mdx b/fern/v0.3.3/pages/plugins/overview.mdx new file mode 100644 index 00000000..613f3b3f --- /dev/null +++ b/fern/v0.3.3/pages/plugins/overview.mdx @@ -0,0 +1,49 @@ +--- +title: Data Designer Plugins +description: Extend Data Designer's capabilities with custom plugins. +--- + + +The plugin system is currently **experimental** and under active development. The documentation, examples, and plugin interface are subject to significant changes in future releases. 
If you encounter any issues, have questions, or have ideas for improvement, please consider starting [a discussion on GitHub](https://github.com/NVIDIA-NeMo/DataDesigner/discussions). + + +## What are plugins? + +Plugins are Python packages that extend Data Designer's capabilities without modifying the core library. Similar to [VS Code extensions](https://marketplace.visualstudio.com/vscode) and [Pytest plugins](https://docs.pytest.org/en/stable/reference/plugin_list.html), the plugin system empowers you to build specialized extensions for your specific use cases and share them with the community. + +**Current capabilities**: Data Designer currently supports plugins for column generators (the column types you pass to the config builder's `add_column` method). + +**Coming soon**: Plugin support for processors, validators, and more! + +## How do you use plugins? + +A Data Designer plugin is just a Python package configured with an [entry point](https://packaging.python.org/en/latest/guides/creating-and-discovering-plugins/#using-package-metadata) that points to a Data Designer `Plugin` object. Using a plugin is as simple as installing the package: + +```bash +pip install data-designer-{plugin-name} +``` + +Once installed, plugins are automatically discovered and ready to use. See the [example plugin](/docs/plugins/example) for a complete walkthrough. + +## How do you create plugins? + +Creating a plugin involves three main steps: + +### 1. Implement the Plugin Components + +- Create a task class inheriting from `ColumnGenerator` +- Create a config class inheriting from `SingleColumnConfig` +- Instantiate a `Plugin` object connecting them + +### 2. Package Your Plugin + +- Set up a Python package with `pyproject.toml` +- Register your plugin using entry points +- Define dependencies (including `data-designer`) + +### 3. Share Your Plugin + +- Publish to PyPI or another package index +- Share with the community! + +**Ready to get started?** See the [Example Plugin](/docs/plugins/example) for a complete walkthrough! diff --git a/fern/v0.3.3/pages/quick-start.mdx b/fern/v0.3.3/pages/quick-start.mdx new file mode 100644 index 00000000..e52b3c02 --- /dev/null +++ b/fern/v0.3.3/pages/quick-start.mdx @@ -0,0 +1,91 @@ +--- +title: Quick Start +description: Get started with Data Designer using default model providers and configurations. +--- + +Get started with Data Designer using the default model providers and configurations. Data Designer ships with built-in model providers and configurations that make it easy to start generating synthetic data immediately. + +## Prerequisites + +Before you begin, you'll need an API key from one of the default providers: + +- **NVIDIA API Key**: Get yours from [build.nvidia.com](https://build.nvidia.com) +- **OpenAI API Key** (optional): Get yours from [platform.openai.com](https://platform.openai.com/api-keys) +- **OpenRouter API Key** (optional): Get yours from [openrouter.ai](https://openrouter.ai) + +Set your API key as an environment variable: + +```bash +export NVIDIA_API_KEY="your-api-key-here" +# Or for OpenAI +export OPENAI_API_KEY="your-openai-api-key-here" +# Or for OpenRouter +export OPENROUTER_API_KEY="your-openrouter-api-key-here" +``` + +## Example + +Below we'll construct a simple Data Designer workflow that generates multilingual greetings. 
+

```python
import os

from data_designer.essentials import (
    CategorySamplerParams,
    DataDesigner,
    DataDesignerConfigBuilder,
    InfoType,
    LLMTextColumnConfig,
    SamplerColumnConfig,
    SamplerType,
)

# Set your API key from build.nvidia.com
# Skip this step if you've already exported your key to the environment variable
os.environ["NVIDIA_API_KEY"] = "your-api-key-here"

# Create a DataDesigner instance
# This automatically configures the default model providers
data_designer = DataDesigner()

# Print out all the model providers available
data_designer.info.display(InfoType.MODEL_PROVIDERS)

# Create a config builder
# This automatically loads the default model configurations
config_builder = DataDesignerConfigBuilder()

# Print out all the model configurations available
config_builder.info.display(InfoType.MODEL_CONFIGS)

# Add a sampler column to randomly select a language
config_builder.add_column(
    SamplerColumnConfig(
        name="language",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=["English", "Spanish", "French", "German", "Italian"],
        ),
    )
)

# Add an LLM text generation column
# We'll use the built-in 'nvidia-text' model alias
config_builder.add_column(
    LLMTextColumnConfig(
        name="greetings",
        model_alias="nvidia-text",
        prompt="""Write a casual and formal greeting in '{{language}}' language.""",
    )
)

# Run a preview to generate sample records
preview_results = data_designer.preview(config_builder=config_builder)

# Display a sample record
preview_results.display_sample_record()
```

🎉 Congratulations, you've successfully run one iteration of designing your synthetic data. Follow along to learn more.

To learn more about the default providers and model configurations available, see the [Default Model Settings](/docs/concepts/models/default-model-settings) guide. diff --git a/fern/v0.3.3/pages/recipes/code-generation/text-to-python.mdx b/fern/v0.3.3/pages/recipes/code-generation/text-to-python.mdx new file mode 100644 index 00000000..79e597a7 --- /dev/null +++ b/fern/v0.3.3/pages/recipes/code-generation/text-to-python.mdx @@ -0,0 +1,292 @@ +--- +title: Text to Python +description: Generate Python code from natural language descriptions.
+--- + + +[Download the complete recipe script](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/docs/assets/recipes/code_generation/text_to_python.py) + + +```python +from pathlib import Path + +from data_designer.essentials import ( + CategorySamplerParams, + CodeLang, + CodeValidatorParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMCodeColumnConfig, + LLMJudgeColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, + Score, + SubcategorySamplerParams, + ValidationColumnConfig, + ValidatorType, +) +from data_designer.interface.results import DatasetCreationResults + + +def build_config(model_alias: str) -> DataDesignerConfigBuilder: + config_builder = DataDesignerConfigBuilder() + + config_builder.add_column( + SamplerColumnConfig( + name="industry_sector", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Healthcare", + "Finance", + "Technology", + ], + ), + ), + ) + + config_builder.add_column( + SamplerColumnConfig( + name="topic", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="industry_sector", + values={ + "Healthcare": [ + "Electronic Health Records (EHR) Systems", + "Telemedicine Platforms", + "AI-Powered Diagnostic Tools", + ], + "Finance": [ + "Fraud Detection Software", + "Automated Trading Systems", + "Personal Finance Apps", + ], + "Technology": [ + "Cloud Computing Platforms", + "Artificial Intelligence and Machine Learning Platforms", + "DevOps and CI/CD Tools", + ], + }, + ), + ), + ) + + config_builder.add_column( + SamplerColumnConfig( + name="code_complexity", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Beginner", + "Intermediate", + "Advanced", + ], + ), + ), + ) + + config_builder.add_column( + SamplerColumnConfig( + name="code_concept", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="code_complexity", + values={ + "Beginner": [ + "Variables", + "Data Types", + "Functions", + "Loops", + "Classes", + ], + "Intermediate": [ + "List Comprehensions", + "Object-oriented programming", + "Lambda Functions", + "Web frameworks", + "Pandas", + ], + "Advanced": [ + "Multithreading", + "Context Managers", + "Generators", + ], + }, + ), + ), + ) + + config_builder.add_column( + SamplerColumnConfig( + name="instruction_phrase", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Write a function that", + "Create a class that", + "Implement a script", + "Can you create a function", + "Develop a module that", + ], + ), + ), + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="instruction", + model_alias=model_alias, + system_prompt=("You are an expert at generating clear and specific programming tasks."), + prompt=( + "Generate an instruction to create Python code that solves a specific problem.\n" + "Each instruction should begin with one of the following phrases: {{ instruction_phrase }}.\n\n" + "Important Guidelines:\n" + "* Industry Relevance: Ensure the instruction pertains to the {{ industry_sector }} sector and {{ topic }} topic.\n" + "* Code Complexity: Tailor the instruction to the {{ code_complexity }} level. Utilize relevant {{ code_concept }} where appropriate to match the complexity level.\n" + "* Clarity and Specificity: Make the problem statement clear and unambiguous. 
Provide sufficient context to understand the requirements without being overly verbose.\n" + "* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n" + ), + ) + ) + + config_builder.add_column( + LLMCodeColumnConfig( + name="code_implementation", + model_alias=model_alias, + code_lang=CodeLang.PYTHON, + system_prompt=( + "You are an expert Python programmer who writes clean, efficient, and well-documented code." + ), + prompt=( + "Write Python code for the following instruction:\n" + "Instruction: {{ instruction }}\n\n" + "Important Guidelines:\n" + "* Code Quality: Your code should be clean, complete, self-contained, and accurate.\n" + "* Code Validity: Please ensure that your Python code is executable and does not contain any errors.\n" + "* Packages: Remember to import any necessary libraries, and to use all libraries you import.\n" + "* Complexity & Concepts: The code should be written at a {{ code_complexity }} level, making use of concepts such as {{ code_concept }}.\n" + ), + ) + ) + + config_builder.add_column( + LLMJudgeColumnConfig( + name="code_judge_result", + model_alias=model_alias, + prompt=TEXT_TO_PYTHON_JUDGE_TEMPLATE, + scores=python_scoring, + ) + ) + + config_builder.add_column( + ValidationColumnConfig( + name="code_validity_result", + validator_type=ValidatorType.CODE, + target_columns=["code_implementation"], + validator_params=CodeValidatorParams( + code_lang=CodeLang.PYTHON, + ), + batch_size=100, + ) + ) + + return config_builder + + +def create_dataset( + config_builder: DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +TEXT_TO_PYTHON_JUDGE_TEMPLATE = """\ +You are an expert in Python programming, with specialized knowledge in software engineering, data science, and algorithmic problem-solving. + +You think about potential flaws and errors in the code. You are a tough critic, but a fair one. + +Take a deep breath and use the Python Code Quality Rubric below to score the **Generated Python Code** based on the INSTRUCTIONS. 
+ +#### INSTRUCTIONS +The Generated Python Code should be a valid response to the Natural Language Prompt below + +Natural Language Prompt: +{{ instruction }} + +Generated Python Code +{{ code_implementation }} +""" + + +python_scoring = [ + Score( + name="Relevance", + description="Adherence to INSTRUCTIONS and CONTEXT", + options={ + 4: "Perfectly meets all specified requirements.", + 3: "Meets most requirements with minor deviations.", + 2: "Moderate deviation from the instructions.", + 1: "Significant deviations from the instructions.", + 0: "Does not adhere to the instructions.", + }, + ), + Score( + name="Pythonic", + description="Pythonic Code and Best Practices (Does the code follow Python conventions and best practices?)", + options={ + 4: "The code exemplifies Pythonic principles, making excellent use of Python-specific constructs, standard library modules and programming idioms; follows all relevant PEPs.", + 3: "The code closely follows Python conventions and adheres to many best practices; good use of Python-specific constructs, standard library modules and programming idioms.", + 2: "The code generally follows Python conventions but has room for better alignment with Pythonic practices.", + 1: "The code loosely follows Python conventions, with several deviations from best practices.", + 0: "The code does not follow Python conventions or best practices, using non-Pythonic approaches.", + }, + ), + Score( + name="Readability", + description="Readability and Maintainability (Is the Python code easy to understand and maintain?)", + options={ + 4: ( + "The code is excellently formatted, follows PEP 8 guidelines, is elegantly concise and clear, uses meaningful variable names, " + "ensuring high readability and ease of maintenance; organizes complex logic well. Docstrings are given in a Google Docstring format." 
+ ), + 3: "The code is well-formatted in the sense of code-as-documentation, making it relatively easy to understand and maintain; uses descriptive names and organizes logic clearly.", + 2: "The code is somewhat readable with basic formatting and some comments, but improvements are needed; needs better use of descriptive names and organization.", + 1: "The code has minimal formatting, making it hard to understand; lacks meaningful names and organization.", + 0: "The code is unreadable, with no attempt at formatting or description.", + }, + ), + Score( + name="Efficiency", + description="Efficiency and Performance (Is the code optimized for performance?)", + options={ + 4: "The solution is highly efficient, using appropriate data structures and algorithms; avoids unnecessary computations and optimizes for both time and space complexity.", + 3: "The solution is efficient, with good use of Python's built-in functions and libraries; minor areas for optimization.", + 2: "The solution is moderately efficient, but misses some opportunities for optimization; uses some inefficient patterns.", + 1: "The solution shows poor efficiency, with notable performance issues; lacks effective optimization techniques.", + 0: "The solution is highly inefficient; overlooks fundamental optimization practices, resulting in significant performance issues.", + }, + ), +] + + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser() + parser.add_argument("--model-alias", type=str, default="openai-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder = build_config(model_alias=args.model_alias) + results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + + results.load_analysis().to_report() +``` diff --git a/fern/v0.3.3/pages/recipes/code-generation/text-to-sql.mdx b/fern/v0.3.3/pages/recipes/code-generation/text-to-sql.mdx new file mode 100644 index 00000000..26ddd1ab --- /dev/null +++ b/fern/v0.3.3/pages/recipes/code-generation/text-to-sql.mdx @@ -0,0 +1,334 @@ +--- +title: Text to SQL +description: Generate SQL queries from natural language descriptions. 
+--- + + +[Download the complete recipe script](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/docs/assets/recipes/code_generation/text_to_sql.py) + + +```python +from pathlib import Path + +from data_designer.essentials import ( + CategorySamplerParams, + CodeLang, + CodeValidatorParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMCodeColumnConfig, + LLMJudgeColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, + Score, + SubcategorySamplerParams, + ValidationColumnConfig, + ValidatorType, +) +from data_designer.interface.results import DatasetCreationResults + + +def build_config(model_alias: str) -> DataDesignerConfigBuilder: + config_builder = DataDesignerConfigBuilder() + + config_builder.add_column( + SamplerColumnConfig( + name="industry_sector", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=["Healthcare", "Finance", "Technology"], + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="topic", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="industry_sector", + values={ + "Healthcare": [ + "Electronic Health Records (EHR) Systems", + "Telemedicine Platforms", + "AI-Powered Diagnostic Tools", + ], + "Finance": [ + "Fraud Detection Software", + "Automated Trading Systems", + "Personal Finance Apps", + ], + "Technology": [ + "Cloud Computing Platforms", + "Artificial Intelligence and Machine Learning Platforms", + "DevOps and CI/CD Tools", + ], + }, + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="sql_complexity", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=["Beginner", "Intermediate", "Advanced"], + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="sql_concept", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="sql_complexity", + values={ + "Beginner": [ + "Basic SELECT Statements", + "WHERE Clauses", + "Basic JOINs", + "INSERT, UPDATE, DELETE", + ], + "Intermediate": [ + "Aggregation Functions", + "Multiple JOINs", + "Subqueries", + "Views", + ], + "Advanced": [ + "Window Functions", + "Common Table Expressions (CTEs)", + "Stored Procedures", + "Query Optimization", + ], + }, + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="sql_task_type", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Data Retrieval", + "Data Manipulation", + "Analytics and Reporting", + "Data Transformation", + ], + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="instruction_phrase", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Write an SQL query that", + "Create an SQL statement to", + "Develop an SQL query to", + "Can you write SQL that", + "Formulate an SQL query that", + ], + ), + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="sql_prompt", + model_alias=model_alias, + system_prompt="You are an expert at generating clear and specific SQL tasks.", + prompt=SQL_PROMPT_TEXT, + ) + ) + + config_builder.add_column( + LLMCodeColumnConfig( + name="sql_context", + model_alias=model_alias, + code_lang=CodeLang.SQL_ANSI, + system_prompt=( + "You are an expert SQL database designer who creates clean, efficient, and " + "well-structured database schemas." 
+ ), + prompt=SQL_CONTEXT_TEXT, + ) + ) + + config_builder.add_column( + LLMCodeColumnConfig( + name="sql", + model_alias=model_alias, + code_lang=CodeLang.SQL_ANSI, + system_prompt="You are an expert SQL programmer who writes clean, efficient, and well-structured queries.", + prompt=SQL_CODE_TEXT, + ) + ) + + config_builder.add_column( + ValidationColumnConfig( + name="code_validity_result", + validator_type=ValidatorType.CODE, + target_columns=["sql"], + validator_params=CodeValidatorParams( + code_lang=CodeLang.SQL_ANSI, + ), + batch_size=100, + ) + ) + + config_builder.add_column( + LLMJudgeColumnConfig( + name="code_judge_result", + model_alias=model_alias, + prompt=TEXT_TO_SQL_JUDGE_TEMPLATE, + scores=sql_scoring, + ) + ) + + return config_builder + + +def create_dataset( + config_builder: DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +SQL_PROMPT_TEXT = ( + "Generate an instruction to create SQL code that solves a specific problem.\n" + "Each instruction should begin with one of the following phrases: {{instruction_phrase}}.\n\n" + "Important Guidelines:\n" + "* Industry Relevance: Ensure the instruction pertains to the {{industry_sector}} sector and {{topic}} topic.\n" + "* SQL Complexity: Tailor the instruction to the {{sql_complexity}} level. Utilize relevant {{sql_concept}} " + "where appropriate to match the complexity level.\n" + "* Task Type: The instruction should involve a {{sql_task_type}} task.\n" + "* Clarity and Specificity: Make the problem statement clear and unambiguous. Provide sufficient context to " + "understand the requirements without being overly verbose.\n" + "* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n" +) + +SQL_CONTEXT_TEXT = ( + "Generate the SQL for creating database tables that would be relevant for the following instruction:\n" + "Instruction: {{sql_prompt}}\n\n" + "Important Guidelines:\n" + "* Relevance: Ensure all tables are directly related to the {{industry_sector}} sector and {{topic}} topic.\n" + "* Completeness: Include all essential columns with appropriate data types, primary/foreign keys, and necessary constraints.\n" + "* Realism: Use realistic table structures typical for the specified industry.\n" + "* Executable SQL: Provide complete CREATE TABLE statements that can be run without modification.\n" + "* Consistency: Use consistent naming conventions (e.g., snake_case for table and column names).\n" + "* Sample Data: Include INSERT statements with sample data that makes sense for the tables (at least 5-10 rows per table)." +) + +SQL_CODE_TEXT = ( + "Write SQL code for the following instruction based on the provided database context:\n" + "Instruction: {{sql_prompt}}\n\n" + "Database Context:\n" + "{{sql_context}}\n\n" + "Important Guidelines:\n" + "* Code Quality: Your SQL should be clean, complete, self-contained and accurate.\n" + "* Code Validity: Please ensure that your SQL code is executable and does not contain any errors.\n" + "* Context: Base your query on the provided database context. 
Only reference tables and columns that " + "exist in the context.\n" + "* Complexity & Concepts: The SQL should be written at a {{sql_complexity}} level, making use of " + "concepts such as {{sql_concept}}.\n" + "* Task Type: Ensure your solution implements the appropriate {{sql_task_type}} operation.\n" + "* Comments: Include brief comments explaining the key parts of your query.\n" +) + + +TEXT_TO_SQL_JUDGE_TEMPLATE = """\ +You are an expert in SQL with deep knowledge of relational modeling, query semantics, +and performance tuning across common dialects (e.g., PostgreSQL, MySQL, SQLite, SQL Server). +You think critically about correctness, readability, and efficiency. + +Use the SQL Query Quality Rubric below to score the **Generated SQL Query** based on the INSTRUCTIONS. + +#### INSTRUCTIONS +The Generated SQL Query should be a valid response to the Natural Language Prompt below + +Natural Language Prompt: +{{ sql_prompt }} + +Database Context: +{{ sql_context }} + +Generated SQL Query +{{ sql }} +""" + + +sql_scoring = [ + Score( + name="Relevance", + description="Adherence to INSTRUCTIONS and CONTEXT", + options={ + 4: "Perfectly meets all specified requirements.", + 3: "Meets most requirements with minor deviations.", + 2: "Moderate deviation from the instructions.", + 1: "Significant deviations from the instructions.", + 0: "Does not adhere to the instructions.", + }, + ), + Score( + name="SQL Correctness", + description="Syntax and semantic correctness; returns the intended result", + options={ + 4: "Valid SQL with correct joins, filters, grouping/aggregation, and NULL handling; produces the intended result set under the stated/implicit dialect.", + 3: "Generally correct with minor issues (e.g., edge-case NULLs, minor grouping detail) but still likely yields the intended result.", + 2: "Partially correct; noticeable semantic mistakes (joins, grouping, filters) that may change results or fail in edge cases.", + 1: "Largely incorrect; major semantic or syntactic errors likely causing failure or wrong results.", + 0: "Invalid SQL or unrelated to the task; will not run or cannot produce a meaningful result.", + }, + ), + Score( + name="Readability", + description="Formatting, clarity, and maintainability", + options={ + 4: "Cleanly formatted (keywords/clauses consistently styled), clear structure (CTEs/subqueries where helpful), meaningful table/column aliases, and concise.", + 3: "Generally readable with consistent formatting and understandable aliases; could be organized slightly better.", + 2: "Somewhat readable but inconsistent formatting or confusing aliasing; structure is harder to follow.", + 1: "Poorly formatted and hard to read; unclear structure and aliasing.", + 0: "Unreadable or chaotic; no meaningful structure or styling.", + }, + ), + Score( + name="Efficiency", + description="Query performance best practices", + options={ + 4: "Uses sargable predicates, appropriate joins, selective filters early, avoids SELECT *, unnecessary DISTINCT, and wasteful subqueries; likely to use indexes effectively.", + 3: "Mostly efficient; minor opportunities for improvement (e.g., simplifying expressions, reducing data early).", + 2: "Moderate inefficiencies (e.g., non-sargable filters, unnecessary nested subqueries, broad SELECT *).", + 1: "Notably inefficient patterns likely causing large scans or poor plans.", + 0: "Highly inefficient; ignores basic best practices and likely to perform very poorly.", + }, + ), +] + +if __name__ == "__main__": + from argparse import ArgumentParser + + 
parser = ArgumentParser() + parser.add_argument("--model-alias", type=str, default="openai-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder = build_config(model_alias=args.model_alias) + results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + + results.load_analysis().to_report() +``` diff --git a/fern/v0.3.3/pages/recipes/index.mdx b/fern/v0.3.3/pages/recipes/index.mdx new file mode 100644 index 00000000..34d15a59 --- /dev/null +++ b/fern/v0.3.3/pages/recipes/index.mdx @@ -0,0 +1,70 @@ +--- +title: Use Case Recipes +description: Ready-to-use code examples for common Data Designer use cases. +--- + +Recipes are a collection of code examples that demonstrate how to leverage Data Designer in specific use cases. +Each recipe is a self-contained example that can be run independently. + + +Recipes provide working code for specific use cases without detailed explanations. If you're learning Data Designer for the first time, we recommend starting with our [tutorial notebooks](/docs/tutorials/overview), which offer step-by-step guidance and explain core concepts. Once you're familiar with the basics, return here for practical, ready-to-use implementations. + + + + + Generate a dataset of natural language instructions paired with Python code implementations, with varying complexity levels and industry focuses. + + **Demonstrates:** + - Python code generation + - Python code validation + - LLM-as-judge + + [Download Code](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/docs/assets/recipes/code_generation/text_to_python.py) + + + Generate a dataset of natural language instructions paired with SQL code implementations, with varying complexity levels and industry focuses. + + **Demonstrates:** + - SQL code generation + - SQL code validation + - LLM-as-judge + + [Download Code](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/docs/assets/recipes/code_generation/text_to_sql.py) + + + Generate a dataset that contains information about products and associated question/answer pairs. + + **Demonstrates:** + - Structured outputs + - Expression columns + - LLM-as-judge + + [Download Code](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/docs/assets/recipes/qa_and_chat/product_info_qa.py) + + + Generate a dataset of multi-turn chat conversations between a user and an AI assistant. + + **Demonstrates:** + - Structured outputs + - Expression columns + - LLM-as-judge + + [Download Code](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/docs/assets/recipes/qa_and_chat/multi_turn_chat.py) + + diff --git a/fern/v0.3.3/pages/recipes/qa-and-chat/multi-turn-chat.mdx b/fern/v0.3.3/pages/recipes/qa-and-chat/multi-turn-chat.mdx new file mode 100644 index 00000000..1b273592 --- /dev/null +++ b/fern/v0.3.3/pages/recipes/qa-and-chat/multi-turn-chat.mdx @@ -0,0 +1,215 @@ +--- +title: Multi-Turn Chat +description: Generate multi-turn conversational dialogues. 
+--- + + +[Download the complete recipe script](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/docs/assets/recipes/qa_and_chat/multi_turn_chat.py) + + +```python +from pathlib import Path +from typing import Literal + +from pydantic import BaseModel, Field + +from data_designer.essentials import ( + CategorySamplerParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMJudgeColumnConfig, + LLMStructuredColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, + Score, + SubcategorySamplerParams, +) +from data_designer.interface.results import DatasetCreationResults + + +def build_config(model_alias: str) -> DataDesignerConfigBuilder: + config_builder = DataDesignerConfigBuilder() + + config_builder.add_column( + SamplerColumnConfig( + name="domain", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=["Tech Support", "Personal Finances", "Educational Guidance"]), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="topic", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="domain", + values={ + "Tech Support": [ + "Troubleshooting a Laptop", + "Setting Up a Home Wi-Fi Network", + "Installing Software Updates", + ], + "Personal Finances": [ + "Budgeting Advice", + "Understanding Taxes", + "Investment Strategies", + ], + "Educational Guidance": [ + "Choosing a College Major", + "Effective Studying Techniques", + "Learning a New Language", + ], + }, + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="complexity", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=["Basic", "Intermediate", "Advanced"]), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="conversation_length", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=[2, 4, 6, 8]), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="user_mood", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=["happy", "silly", "sarcastic", "combative", "disappointed", "toxic"]), + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="assistant_system_prompt", + prompt=( + "Write a reasonable system prompt for a helpful AI assistant with expertise in " + "{{domain}} and {{topic}}. The AI assistant must not engage in harmful behaviors." + ), + model_alias=model_alias, + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="user_task", + prompt="Define a simple task related to {{topic}} of {{complexity}} complexity for the user.", + model_alias=model_alias, + ) + ) + + config_builder.add_column( + LLMStructuredColumnConfig( + name="conversation", + prompt=( + "\n{{user_task}}\n\n\n" + "{{assistant_system_prompt}}\n\n" + "Generate a conversation between a user and an AI assistant with about .\n" + "User is asking the assistant for advice and is in a {{user_mood}} mood.\n" + "The conversation must be {{conversation_length}} messages in length.\n" + "The conversation must come to a natural end in {{conversation_length}} messages and if the assistant is unable " + "to solve the user's needs by then, they should offer to continue the conversation later or redirect them to additional resources.\n" + "The conversation must be realistic and natural, incorporating details from both the user's and assistant's profiles.\n" + "The AI assistant role has the system prompt defined in . 
All of its responses must conform to :\n" + ), + output_format=ChatConversation, + model_alias=model_alias, + ) + ) + + config_builder.add_column( + LLMJudgeColumnConfig( + name="toxicity_evaluation", + prompt=TOXICITY_PROMPT_TEMPLATE, + scores=[toxicity_score], + model_alias=model_alias, + ) + ) + + return config_builder + + +def create_dataset( + config_builder: DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +toxicity_score = Score( + name="Toxicity", + description="Evaluation of harmful, harassing, or hateful language in conversations", + options={ + "4": "No toxicity detected; conversation is respectful and appropriate.", + "3": "Mild toxicity; slightly rude or sarcastic but not hateful or harmful.", + "2": "Moderate toxicity; contains disrespectful or harassing language.", + "1": "Severe toxicity; contains overt hate, harassment, or harmful content.", + "0": "Extreme toxicity; contains dangerous, threatening, or abusive content.", + }, +) + +TOXICITY_PROMPT_TEMPLATE = ( + "You are an expert in content moderation and digital communication ethics. " + "You are evaluating the toxicity level in user messages.\n" + "Take a deep breath and carefully analyze the conversation provided. \n" + "Use the rubric to score the level of toxicity in the USER'S messages only.\n" + "Consider:\n" + "- Focus primarily on harassment, hate, or harmful language\n" + "- Evaluate the full context of the conversation\n" + "- Be objective and consistent in your assessment\n" + "- If any single message exhibits toxicity, it should influence the final score\n" + "## CONVERSATION\n" + "{{ conversation }}" +) + + +class Message(BaseModel): + """A single message turn in the conversation.""" + + role: Literal["user", "assistant"] = Field(..., description="Which role is writing the message.") + content: str = Field(..., description="Message contents.") + + +class ChatConversation(BaseModel): + """A chat conversation between a specific user and an AI assistant. + * All conversations are initiated by the user role. + * The assistant role always responds to the user message. + * Turns alternate between user and assistant roles. + * The last message is always from the assistant role. + * Message content can be long or short. + * All assistant messages are faithful responses and must be answered fully. 
+ """ + + conversation: list[Message] = Field(..., description="List of all messages in the conversation.") + + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser() + parser.add_argument("--model-alias", type=str, default="openai-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder = build_config(model_alias=args.model_alias) + results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + + results.load_analysis().to_report() +``` diff --git a/fern/v0.3.3/pages/recipes/qa-and-chat/product-info-qa.mdx b/fern/v0.3.3/pages/recipes/qa-and-chat/product-info-qa.mdx new file mode 100644 index 00000000..f3cdf7c4 --- /dev/null +++ b/fern/v0.3.3/pages/recipes/qa-and-chat/product-info-qa.mdx @@ -0,0 +1,235 @@ +--- +title: Product Info QA +description: Generate question-answer pairs for product information. +--- + + +[Download the complete recipe script](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/docs/assets/recipes/qa_and_chat/product_info_qa.py) + + +```python +import string +from pathlib import Path + +from pydantic import BaseModel, Field + +from data_designer.essentials import ( + BernoulliSamplerParams, + CategorySamplerParams, + DataDesigner, + DataDesignerConfigBuilder, + ExpressionColumnConfig, + LLMJudgeColumnConfig, + LLMStructuredColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, + Score, + UniformSamplerParams, +) +from data_designer.interface.results import DatasetCreationResults + + +def build_config(model_alias: str) -> DataDesignerConfigBuilder: + config_builder = DataDesignerConfigBuilder() + config_builder.add_column( + SamplerColumnConfig( + name="category", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Electronics", + "Clothing", + "Home Appliances", + "Groceries", + "Toiletries", + "Sports Equipment", + "Toys", + "Books", + "Pet Supplies", + "Tools & Home Improvement", + "Beauty", + "Health & Wellness", + "Outdoor Gear", + "Automotive", + "Jewelry", + "Watches", + "Office Supplies", + "Gifts", + "Arts & Crafts", + "Baby & Kids", + "Music", + "Video Games", + "Movies", + "Software", + "Tech Devices", + ] + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="price_tens_of_dollars", + sampler_type=SamplerType.UNIFORM, + params=UniformSamplerParams(low=1, high=200), + ) + ) + + config_builder.add_column( + ExpressionColumnConfig( + name="product_price", + expr="{{ (price_tens_of_dollars * 10) - 0.01 | round(2) }}", + dtype="float", + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="first_letter", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=list(string.ascii_uppercase)), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="is_hallucination", + sampler_type=SamplerType.BERNOULLI, + params=BernoulliSamplerParams(p=0.5), + ) + ) + + config_builder.add_column( + LLMStructuredColumnConfig( + name="product_info", + model_alias=model_alias, + prompt=( + "Generate a realistic product description for a product in the {{ category }} " + "category that costs {{ product_price }}.\n" + "The name of the product MUST start with the letter {{ first_letter }}.\n" + ), + output_format=ProductInfo, + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( 
+ name="question", + model_alias=model_alias, + prompt=("Ask a question about the following product:\n\n {{ product_info }}"), + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="answer", + model_alias=model_alias, + prompt=( + "{%- if is_hallucination == 0 -%}\n" + "\n" + "{{ product_info }}\n" + "\n" + "{%- endif -%}\n" + "User Question: {{ question }}\n" + "Directly and succinctly answer the user's question.\n" + "{%- if is_hallucination == 1 -%}\n" + "Make up whatever information you need to in order to answer the user's request.\n" + "{%- endif -%}" + ), + ) + ) + + # Evaluate answer quality + config_builder.add_column( + LLMJudgeColumnConfig( + name="llm_answer_metrics", + model_alias=model_alias, + prompt=( + "\n" + "{{ product_info }}\n" + "\n" + "User Question: {{question }}\n" + "AI Assistant Answer: {{ answer }}\n" + "Judge the AI assistant's response to the user's question about the product described in ." + ), + scores=answer_quality_scores, + ) + ) + + config_builder.add_column( + ExpressionColumnConfig( + name="completeness_result", + expr="{{ llm_answer_metrics.Completeness.score }}", + ) + ) + + config_builder.add_column( + ExpressionColumnConfig( + name="accuracy_result", + expr="{{ llm_answer_metrics.Accuracy.score }}", + ) + ) + + return config_builder + + +def create_dataset( + config_builder: DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +class ProductInfo(BaseModel): + product_name: str = Field(..., description="A realistic product name for the market.") + key_features: list[str] = Field(..., min_length=1, max_length=3, description="Key product features.") + description: str = Field( + ..., + description="A short, engaging description of what the product does, highlighting a unique but believable feature.", + ) + price_usd: float = Field(..., description="The price of the product", ge=10, le=1000, decimal_places=2) + + +completeness_score = Score( + name="Completeness", + description="Evaluation of AI assistant's thoroughness in addressing all aspects of the user's query.", + options={ + "Complete": "The response thoroughly covers all key points requested in the question, providing sufficient detail to satisfy the user's information needs.", + "PartiallyComplete": "The response addresses the core question but omits certain important details or fails to elaborate on relevant aspects that were requested.", + "Incomplete": "The response significantly lacks necessary information, missing major components of what was asked and leaving the query largely unanswered.", + }, +) + +accuracy_score = Score( + name="Accuracy", + description="Evaluation of how factually correct the AI assistant's response is relative to the product information.", + options={ + "Accurate": "The information provided aligns perfectly with the product specifications without introducing any misleading or incorrect details.", + "PartiallyAccurate": "While some information is correctly stated, the response contains minor factual errors or potentially misleading statements about the product.", + "Inaccurate": "The response presents significantly wrong information about the product, with claims that contradict the actual product details.", + }, +) + +answer_quality_scores = [completeness_score, accuracy_score] + + +if __name__ == "__main__": + from argparse import 
ArgumentParser + + parser = ArgumentParser() + parser.add_argument("--model-alias", type=str, default="openai-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder = build_config(model_alias=args.model_alias) + results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + + results.load_analysis().to_report() +``` diff --git a/fern/v0.3.3/pages/tutorials/images-as-context.mdx b/fern/v0.3.3/pages/tutorials/images-as-context.mdx new file mode 100644 index 00000000..5897ace2 --- /dev/null +++ b/fern/v0.3.3/pages/tutorials/images-as-context.mdx @@ -0,0 +1,280 @@ +--- +title: "🎨 Data Designer Tutorial: Images as Context for Vision-Based Generation" +--- + + +Run this tutorial interactively in [Google Colab](https://colab.research.google.com/github/NVIDIA-NeMo/DataDesigner/blob/main/docs/colab_notebooks/4-providing-images-as-context.ipynb). + + +#### 📚 What you'll learn + +This notebook demonstrates how to provide images as context to generate text descriptions using vision-language models. + +- ✨ **Visual Document Processing**: Converting images to chat-ready format for model consumption +- 🔍 **Vision-Language Generation**: Using vision models to generate detailed summaries from images + +If this is your first time using Data Designer, we recommend starting with the [first tutorial](/docs/tutorials/the-basics) in this series. + +### 📦 Import the essentials + +- The `essentials` module provides quick access to the most commonly used objects. + +```python +# Standard library imports +import base64 +import io +import uuid + +# Third-party imports +import pandas as pd +import rich +from datasets import load_dataset +from IPython.display import display +from rich.panel import Panel + +# Data Designer imports +from data_designer.essentials import ( + ChatCompletionInferenceParams, + DataDesigner, + DataDesignerConfigBuilder, + DataFrameSeedSource, + ImageContext, + ImageFormat, + LLMTextColumnConfig, + ModalityDataType, + ModelConfig, +) +``` + +### ⚙️ Initialize the Data Designer interface + +- `DataDesigner` is the main object is responsible for managing the data generation process. +- When initialized without arguments, the [default model providers](/docs/concepts/models/default-model-settings) are used. + +```python +data_designer = DataDesigner() +``` + +### 🎛️ Define model configurations + +- Each `ModelConfig` defines a model that can be used during the generation process. +- The "model alias" is used to reference the model in the Data Designer config (as we will see below). +- The "model provider" is the external service that hosts the model (see the [model config](/docs/concepts/models/default-model-settings) docs for more details). +- By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider. + +```python +# This name is set in the model provider configuration. +MODEL_PROVIDER = "nvidia" + +model_configs = [ + ModelConfig( + alias="vision", + model="meta/llama-4-scout-17b-16e-instruct", + provider=MODEL_PROVIDER, + inference_parameters=ChatCompletionInferenceParams( + temperature=0.60, + top_p=0.95, + max_tokens=2048, + ), + ), +] +``` + +### 🏗️ Initialize the Data Designer Config Builder + +- The Data Designer config defines the dataset schema and generation process. 
+- The config builder provides an intuitive interface for building this configuration. +- The list of model configs is provided to the builder at initialization. + +```python +config_builder = DataDesignerConfigBuilder(model_configs=model_configs) +``` + +### 🌱 Seed Dataset Creation + +In this section, we'll prepare our visual documents as a seed dataset for summarization: + +- **Loading Visual Documents**: We use the ColPali dataset containing document images +- **Image Processing**: Convert images to base64 format for vision model consumption +- **Metadata Extraction**: Preserve relevant document information (filename, page number, source, etc.) + +The seed dataset will be used to generate detailed text summaries of each document image. + +```python +# Dataset processing configuration +IMG_COUNT = 512 # Number of images to process +BASE64_IMAGE_HEIGHT = 512 # Standardized height for model input + +# Load ColPali dataset for visual documents +img_dataset_cfg = {"path": "vidore/colpali_train_set", "split": "train", "streaming": True} +``` + +```python +def resize_image(image, height: int): + """ + Resize image while maintaining aspect ratio. + + Args: + image: PIL Image object + height: Target height in pixels + + Returns: + Resized PIL Image object + """ + original_width, original_height = image.size + width = int(original_width * (height / original_height)) + return image.resize((width, height)) + + +def convert_image_to_chat_format(record, height: int) -> dict: + """ + Convert PIL image to base64 format for chat template usage. + + Args: + record: Dataset record containing image and metadata + height: Target height for image resizing + + Returns: + Updated record with base64_image and uuid fields + """ + # Resize image for consistent processing + image = resize_image(record["image"], height) + + # Convert to base64 string + img_buffer = io.BytesIO() + image.save(img_buffer, format="PNG") + byte_data = img_buffer.getvalue() + base64_encoded_data = base64.b64encode(byte_data) + base64_string = base64_encoded_data.decode("utf-8") + + # Return updated record + return record | {"base64_image": base64_string, "uuid": str(uuid.uuid4())} +``` + +```python +# Load and process the visual document dataset +print("📥 Loading and processing document images...") + +img_dataset_iter = iter( + load_dataset(**img_dataset_cfg).map(convert_image_to_chat_format, fn_kwargs={"height": BASE64_IMAGE_HEIGHT}) +) +img_dataset = pd.DataFrame([next(img_dataset_iter) for _ in range(IMG_COUNT)]) + +print(f"✅ Loaded {len(img_dataset)} images with columns: {list(img_dataset.columns)}") +``` + +```python +img_dataset.head() +``` + +```python +# Add the seed dataset containing our processed images +df_seed = pd.DataFrame(img_dataset)[["uuid", "image_filename", "base64_image", "page", "options", "source"]] +config_builder.with_seed_dataset(DataFrameSeedSource(df=df_seed)) +``` + +```python +# Add a column to generate detailed document summaries +config_builder.add_column( + LLMTextColumnConfig( + name="summary", + model_alias="vision", + prompt=( + "Provide a detailed summary of the content in this image in Markdown format. " + "Start from the top of the image and then describe it from top to bottom. " + "Place a summary at the bottom." + ), + multi_modal_context=[ + ImageContext( + column_name="base64_image", + data_type=ModalityDataType.BASE64, + image_format=ImageFormat.PNG, + ) + ], + ) +) +``` + +### 🔁 Iteration is key – preview the dataset! + +1. Use the `preview` method to generate a sample of records quickly. +2. 
Inspect the results for quality and format issues. +3. Adjust column configurations, prompts, or parameters as needed. +4. Re-run the preview until satisfied. + +```python +preview = data_designer.preview(config_builder, num_records=2) +``` + +```python +# Run this cell multiple times to cycle through the 2 preview records. +preview.display_sample_record() +``` + +```python +# The preview dataset is available as a pandas DataFrame. +preview.dataset +``` + +### 📊 Analyze the generated data + +- Data Designer automatically generates a basic statistical analysis of the generated data. +- This analysis is available via the `analysis` property of generation result objects. + +```python +# Print the analysis as a table. +preview.analysis.to_report() +``` + +### 🔎 Visual Inspection + +Let's compare the original document image with the generated summary to validate quality: + +```python +# Compare original document with generated summary +index = 0 # Change this to view different examples + +# Merge preview data with original images for comparison +comparison_dataset = preview.dataset.merge(pd.DataFrame(img_dataset)[["uuid", "image"]], how="left", on="uuid") + +# Extract the record for display +record = comparison_dataset.iloc[index] + +print("📄 Original Document Image:") +display(resize_image(record.image, BASE64_IMAGE_HEIGHT)) + +print("\n📝 Generated Summary:") +rich.print(Panel(record.summary, title="Document Summary", title_align="left")) +``` + +### 🆙 Scale up! + +- Happy with your preview data? +- Use the `create` method to submit larger Data Designer generation jobs. + +```python +results = data_designer.create(config_builder, num_records=10, dataset_name="tutorial-4") +``` + +```python +# Load the generated dataset as a pandas DataFrame. +dataset = results.load_dataset() + +dataset.head() +``` + +```python +# Load the analysis results into memory. +analysis = results.load_analysis() + +analysis.to_report() +``` + +## ⏭️ Next Steps + +Now that you've learned how to use visual context for image summarization in Data Designer, explore more: + +- Experiment with different vision models for specific document types +- Try different prompt variations to generate specialized descriptions (e.g., technical details, key findings) +- Combine vision-based summaries with other column types for multi-modal workflows +- Apply this pattern to other vision tasks like image captioning, OCR validation, or visual question answering diff --git a/fern/v0.3.3/pages/tutorials/overview.mdx b/fern/v0.3.3/pages/tutorials/overview.mdx new file mode 100644 index 00000000..38cc6859 --- /dev/null +++ b/fern/v0.3.3/pages/tutorials/overview.mdx @@ -0,0 +1,85 @@ +--- +title: 📓 Tutorials +description: Step-by-step tutorials for learning Data Designer. +--- + +Welcome to the Data Designer tutorials! These interactive notebooks guide you through the core concepts and features of Data Designer. + +## Getting Started + +Each tutorial builds on the previous one, so we recommend following them in order: + + + + Learn the fundamentals of Data Designer by generating a simple product review dataset. + + **Topics covered:** + - Sampler columns for categorical and numerical data + - LLM-generated text columns + - Previewing and iterating on your dataset + + + Learn advanced data generation using structured outputs and Jinja expressions. 
+ + **Topics covered:** + - Pydantic models for structured output schemas + - Expression columns with Jinja2 templates + - Conditional logic in prompts + + + Bootstrap generation from existing data to create domain-grounded synthetic datasets. + + **Topics covered:** + - Loading seed datasets from local files + - Referencing seed data in prompts + - Combining real and synthetic data + + + Use vision-language models to generate text descriptions from images. + + **Topics covered:** + - Processing images for model input + - Vision model configuration + - Document summarization workflows + + + +## Running the Tutorials + +Each tutorial is available as an interactive Jupyter notebook that you can run in Google Colab. Click the "Open in Colab" badge at the top of each tutorial to launch it directly in your browser. + +### Prerequisites + +Before running the tutorials, make sure you have: + +1. **An API key** from one of the supported providers: + - [NVIDIA API Key](https://build.nvidia.com) (recommended) + - [OpenAI API Key](https://platform.openai.com/api-keys) + - [OpenRouter API Key](https://openrouter.ai) + +2. **Set your API key** as an environment variable or in the notebook: + ```bash + export NVIDIA_API_KEY="your-api-key-here" + ``` + +## Additional Resources + +- **[Quick Start Guide](/docs/quick-start)**: A condensed introduction to Data Designer +- **[Use Case Recipes](/docs/recipes)**: Complete working examples for specific use cases +- **[API Reference](/api/models)**: Detailed documentation for all configuration options diff --git a/fern/v0.3.3/pages/tutorials/seeding-with-dataset.mdx b/fern/v0.3.3/pages/tutorials/seeding-with-dataset.mdx new file mode 100644 index 00000000..b6e9c351 --- /dev/null +++ b/fern/v0.3.3/pages/tutorials/seeding-with-dataset.mdx @@ -0,0 +1,255 @@ +--- +title: "🎨 Data Designer Tutorial: Seeding with an External Dataset" +--- + + +Run this tutorial interactively in [Google Colab](https://colab.research.google.com/github/NVIDIA-NeMo/DataDesigner/blob/main/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb). + + +#### 📚 What you'll learn + +In this notebook, we will demonstrate how to seed synthetic data generation in Data Designer with an external dataset. + +If this is your first time using Data Designer, we recommend starting with the [first tutorial](/docs/tutorials/the-basics) in this series. + +### 📦 Import the essentials + +- The `essentials` module provides quick access to the most commonly used objects. + +```python +from data_designer.essentials import ( + ChatCompletionInferenceParams, + DataDesigner, + DataDesignerConfigBuilder, + LocalFileSeedSource, + ModelConfig, +) +``` + +### ⚙️ Initialize the Data Designer interface + +- `DataDesigner` is the main object is responsible for managing the data generation process. +- When initialized without arguments, the [default model providers](/docs/concepts/models/default-model-settings) are used. + +```python +data_designer = DataDesigner() +``` + +### 🎛️ Define model configurations + +- Each `ModelConfig` defines a model that can be used during the generation process. +- The "model alias" is used to reference the model in the Data Designer config (as we will see below). +- The "model provider" is the external service that hosts the model (see the [model config](/docs/concepts/models/default-model-settings) docs for more details). +- By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider. + +```python +# This name is set in the model provider configuration. 
+MODEL_PROVIDER = "nvidia" + +# The model ID is from build.nvidia.com. +MODEL_ID = "nvidia/nemotron-3-nano-30b-a3b" + +# We choose this alias to be descriptive for our use case. +MODEL_ALIAS = "nemotron-nano-v3" + +model_configs = [ + ModelConfig( + alias=MODEL_ALIAS, + model=MODEL_ID, + provider=MODEL_PROVIDER, + inference_parameters=ChatCompletionInferenceParams( + temperature=1.0, + top_p=1.0, + max_tokens=2048, + extra_body={"chat_template_kwargs": {"enable_thinking": False}}, + ), + ) +] +``` + +### 🏗️ Initialize the Data Designer Config Builder + +- The Data Designer config defines the dataset schema and generation process. +- The config builder provides an intuitive interface for building this configuration. +- The list of model configs is provided to the builder at initialization. + +```python +config_builder = DataDesignerConfigBuilder(model_configs=model_configs) +``` + +## 🏥 Prepare a seed dataset + +- For this notebook, we'll create a synthetic dataset of patient notes. +- We will _seed_ the generation process with a [symptom-to-diagnosis dataset](https://huggingface.co/datasets/gretelai/symptom_to_diagnosis). + + +- Seed datasets let you steer the generation process by providing context that is specific to your use case. +- Seed datasets are also an excellent way to inject real-world diversity into your synthetic data. +- During generation, prompt templates can reference any of the seed dataset fields. + + +```python +# Download sample dataset from Github +import urllib.request + +url = "https://raw.githubusercontent.com/NVIDIA/GenerativeAIExamples/refs/heads/main/nemo/NeMo-Data-Designer/data/gretelai_symptom_to_diagnosis.csv" +local_filename, _ = urllib.request.urlretrieve(url, "gretelai_symptom_to_diagnosis.csv") + +# Seed datasets are passed as reference objects to the config builder. +seed_source = LocalFileSeedSource(path=local_filename) + +config_builder.with_seed_dataset(seed_source) +``` + +## 🎨 Designing our synthetic patient notes dataset + +- Here we use `add_column` with keyword arguments (rather than imported config objects). +- Generally, we recommend using concrete objects, but this is a convenient shorthand. 
+- **Note**: The prompt template can reference fields from our seed dataset: + - `{{ diagnosis }}` - the medical diagnosis from the seed data + - `{{ patient_summary }}` - the symptom description from the seed data + +```python +config_builder.add_column( + name="patient_sampler", + column_type="sampler", + sampler_type="person_from_faker", +) + +config_builder.add_column( + name="doctor_sampler", + column_type="sampler", + sampler_type="person_from_faker", +) + +config_builder.add_column( + name="patient_id", + column_type="sampler", + sampler_type="uuid", + params={ + "prefix": "PT-", + "short_form": True, + "uppercase": True, + }, +) + +config_builder.add_column( + name="first_name", + column_type="expression", + expr="{{ patient_sampler.first_name}}", +) + +config_builder.add_column( + name="last_name", + column_type="expression", + expr="{{ patient_sampler.last_name }}", +) + + +config_builder.add_column( + name="dob", + column_type="expression", + expr="{{ patient_sampler.birth_date }}", +) + +config_builder.add_column( + name="symptom_onset_date", + column_type="sampler", + sampler_type="datetime", + params={"start": "2024-01-01", "end": "2024-12-31"}, +) + +config_builder.add_column( + name="date_of_visit", + column_type="sampler", + sampler_type="timedelta", + params={"dt_min": 1, "dt_max": 30, "reference_column_name": "symptom_onset_date"}, +) + +config_builder.add_column( + name="physician", + column_type="expression", + expr="Dr. {{ doctor_sampler.last_name }}", +) + +config_builder.add_column( + name="physician_notes", + column_type="llm-text", + prompt="""\ +You are a primary-care physician who just had an appointment with {{ first_name }} {{ last_name }}, +who has been struggling with symptoms from {{ diagnosis }} since {{ symptom_onset_date }}. +The date of today's visit is {{ date_of_visit }}. + +{{ patient_summary }} + +Write careful notes about your visit with {{ first_name }}, +as Dr. {{ doctor_sampler.first_name }} {{ doctor_sampler.last_name }}. + +Format the notes as a busy doctor might. +Respond with only the notes, no other text. +""", + model_alias=MODEL_ALIAS, +) + +data_designer.validate(config_builder) +``` + +### 🔁 Iteration is key – preview the dataset! + +1. Use the `preview` method to generate a sample of records quickly. +2. Inspect the results for quality and format issues. +3. Adjust column configurations, prompts, or parameters as needed. +4. Re-run the preview until satisfied. + +```python +preview = data_designer.preview(config_builder, num_records=2) +``` + +```python +# Run this cell multiple times to cycle through the 2 preview records. +preview.display_sample_record() +``` + +```python +# The preview dataset is available as a pandas DataFrame. +preview.dataset +``` + +### 📊 Analyze the generated data + +- Data Designer automatically generates a basic statistical analysis of the generated data. +- This analysis is available via the `analysis` property of generation result objects. + +```python +# Print the analysis as a table. +preview.analysis.to_report() +``` + +### 🆙 Scale up! + +- Happy with your preview data? +- Use the `create` method to submit larger Data Designer generation jobs. + +```python +results = data_designer.create(config_builder, num_records=10, dataset_name="tutorial-3") +``` + +```python +# Load the generated dataset as a pandas DataFrame. +dataset = results.load_dataset() + +dataset.head() +``` + +```python +# Load the analysis results into memory. 
+analysis = results.load_analysis() + +analysis.to_report() +``` + +## ⏭️ Next Steps + +Check out the following tutorial to learn more about: + +- [Providing images as context](/docs/tutorials/images-as-context) diff --git a/fern/v0.3.3/pages/tutorials/structured-outputs.mdx b/fern/v0.3.3/pages/tutorials/structured-outputs.mdx new file mode 100644 index 00000000..458bde2e --- /dev/null +++ b/fern/v0.3.3/pages/tutorials/structured-outputs.mdx @@ -0,0 +1,314 @@ +--- +title: "🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions" +--- + + +Run this tutorial interactively in [Google Colab](https://colab.research.google.com/github/NVIDIA-NeMo/DataDesigner/blob/main/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb). + + +#### 📚 What you'll learn + +In this notebook, we will continue our exploration of Data Designer, demonstrating more advanced data generation using structured outputs and Jinja expressions. + +If this is your first time using Data Designer, we recommend starting with the [first tutorial](/docs/tutorials/the-basics) in this series. + +### 📦 Import the essentials + +- The `essentials` module provides quick access to the most commonly used objects. + +```python +from data_designer.essentials import ( + CategorySamplerParams, + ChatCompletionInferenceParams, + DataDesigner, + DataDesignerConfigBuilder, + ExpressionColumnConfig, + LLMStructuredColumnConfig, + ModelConfig, + PersonFromFakerSamplerParams, + SamplerColumnConfig, + SamplerType, + SubcategorySamplerParams, +) +``` + +### ⚙️ Initialize the Data Designer interface + +- `DataDesigner` is the main object that is used to interface with the library. +- When initialized without arguments, the [default model providers](/docs/concepts/models/default-model-settings) are used. + +```python +data_designer = DataDesigner() +``` + +### 🎛️ Define model configurations + +- Each `ModelConfig` defines a model that can be used during the generation process. +- The "model alias" is used to reference the model in the Data Designer config (as we will see below). +- The "model provider" is the external service that hosts the model (see the [model config](/docs/concepts/models/default-model-settings) docs for more details). +- By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider. + +```python +# This name is set in the model provider configuration. +MODEL_PROVIDER = "nvidia" + +# The model ID is from build.nvidia.com. +MODEL_ID = "nvidia/nemotron-3-nano-30b-a3b" + +# We choose this alias to be descriptive for our use case. +MODEL_ALIAS = "nemotron-nano-v3" + +model_configs = [ + ModelConfig( + alias=MODEL_ALIAS, + model=MODEL_ID, + provider=MODEL_PROVIDER, + inference_parameters=ChatCompletionInferenceParams( + temperature=1.0, + top_p=1.0, + max_tokens=2048, + extra_body={"chat_template_kwargs": {"enable_thinking": False}}, + ), + ) +] +``` + +### 🏗️ Initialize the Data Designer Config Builder + +- The Data Designer config defines the dataset schema and generation process. +- The config builder provides an intuitive interface for building this configuration. +- The list of model configs is provided to the builder at initialization. + +```python +config_builder = DataDesignerConfigBuilder(model_configs=model_configs) +``` + +### 🧑‍🎨 Designing our data + +- We will again create a product review dataset, but this time we will use structured outputs and Jinja expressions. +- Structured outputs let you specify the exact schema of the data you want to generate. 
+- Data Designer supports schemas specified using either JSON schema or Pydantic data models (recommended). + +We'll define our structured outputs using [Pydantic](https://docs.pydantic.dev/latest/) data models. + + +- Pydantic models provide better IDE support and type validation. +- They are more Pythonic than raw JSON schemas. +- They integrate seamlessly with Data Designer's structured output system. + + +```python +from decimal import Decimal +from typing import Literal + +from pydantic import BaseModel, Field + + +# We define a Product schema so that the name, description, and price are generated +# in one go, with the types and constraints specified. +class Product(BaseModel): + name: str = Field(description="The name of the product") + description: str = Field(description="A description of the product") + price: Decimal = Field(description="The price of the product", ge=10, le=1000, decimal_places=2) + + +class ProductReview(BaseModel): + rating: int = Field(description="The rating of the product", ge=1, le=5) + customer_mood: Literal["irritated", "mad", "happy", "neutral", "excited"] = Field( + description="The mood of the customer" + ) + review: str = Field(description="A review of the product") +``` + +Next, let's design our product review dataset using a few more tricks compared to the previous notebook. + +```python +# Since we often only want a few attributes from Person objects, we can +# set drop=True in the column config to drop the column from the final dataset. +config_builder.add_column( + SamplerColumnConfig( + name="customer", + sampler_type=SamplerType.PERSON_FROM_FAKER, + params=PersonFromFakerSamplerParams(), + drop=True, + ) +) + +config_builder.add_column( + SamplerColumnConfig( + name="product_category", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Electronics", + "Clothing", + "Home & Kitchen", + "Books", + "Home Office", + ], + ), + ) +) + +config_builder.add_column( + SamplerColumnConfig( + name="product_subcategory", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="product_category", + values={ + "Electronics": ["Smartphones", "Laptops", "Headphones", "Cameras", "Accessories"], + "Clothing": ["Men's Clothing", "Women's Clothing", "Winter Coats", "Activewear", "Accessories"], + "Home & Kitchen": ["Appliances", "Cookware", "Furniture", "Decor", "Organization"], + "Books": ["Fiction", "Non-Fiction", "Self-Help", "Textbooks", "Classics"], + "Home Office": ["Desks", "Chairs", "Storage", "Office Supplies", "Lighting"], + }, + ), + ) +) + +config_builder.add_column( + SamplerColumnConfig( + name="target_age_range", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=["18-25", "25-35", "35-50", "50-65", "65+"]), + ) +) + +# Sampler columns support conditional params, which are used if the condition is met. +# In this example, we set the review style to rambling if the target age range is 18-25. +# Note conditional parameters are only supported for Sampler column types. +config_builder.add_column( + SamplerColumnConfig( + name="review_style", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=["rambling", "brief", "detailed", "structured with bullet points"], + weights=[1, 2, 2, 1], + ), + conditional_params={ + "target_age_range == '18-25'": CategorySamplerParams(values=["rambling"]), + }, + ) +) + +# Optionally validate that the columns are configured correctly. 
+data_designer.validate(config_builder) +``` + +Next, we will use more advanced Jinja expressions to create new columns. + +Jinja expressions let you: + +- Access nested attributes: `{{ customer.first_name }}` +- Combine values: `{{ customer.first_name }} {{ customer.last_name }}` +- Use conditional logic: `{% if condition %}...{% endif %}` + +```python +# We can create new columns using Jinja expressions that reference +# existing columns, including attributes of nested objects. +config_builder.add_column( + ExpressionColumnConfig(name="customer_name", expr="{{ customer.first_name }} {{ customer.last_name }}") +) + +config_builder.add_column(ExpressionColumnConfig(name="customer_age", expr="{{ customer.age }}")) + +config_builder.add_column( + LLMStructuredColumnConfig( + name="product", + prompt=( + "Create a product in the '{{ product_category }}' category, focusing on products " + "related to '{{ product_subcategory }}'. The target age range of the ideal customer is " + "{{ target_age_range }} years old. The product should be priced between $10 and $1000." + ), + output_format=Product, + model_alias=MODEL_ALIAS, + ) +) + +# We can even use if/else logic in our Jinja expressions to create more complex prompt patterns. +config_builder.add_column( + LLMStructuredColumnConfig( + name="customer_review", + prompt=( + "Your task is to write a review for the following product:\n\n" + "Product Name: {{ product.name }}\n" + "Product Description: {{ product.description }}\n" + "Price: {{ product.price }}\n\n" + "Imagine your name is {{ customer_name }} and you are from {{ customer.city }}, {{ customer.state }}. " + "Write the review in a style that is '{{ review_style }}'." + "{% if target_age_range == '18-25' %}" + "Make sure the review is more informal and conversational.\n" + "{% else %}" + "Make sure the review is more formal and structured.\n" + "{% endif %}" + "The review field should contain only the review, no other text." + ), + output_format=ProductReview, + model_alias=MODEL_ALIAS, + ) +) + +data_designer.validate(config_builder) +``` + +### 🔁 Iteration is key – preview the dataset! + +1. Use the `preview` method to generate a sample of records quickly. +2. Inspect the results for quality and format issues. +3. Adjust column configurations, prompts, or parameters as needed. +4. Re-run the preview until satisfied. + +```python +preview = data_designer.preview(config_builder, num_records=2) +``` + +```python +# Run this cell multiple times to cycle through the 2 preview records. +preview.display_sample_record() +``` + +```python +# The preview dataset is available as a pandas DataFrame. +preview.dataset +``` + +### 📊 Analyze the generated data + +- Data Designer automatically generates a basic statistical analysis of the generated data. +- This analysis is available via the `analysis` property of generation result objects. + +```python +# Print the analysis as a table. +preview.analysis.to_report() +``` + +### 🆙 Scale up! + +- Happy with your preview data? +- Use the `create` method to submit larger Data Designer generation jobs. + +```python +results = data_designer.create(config_builder, num_records=10, dataset_name="tutorial-2") +``` + +```python +# Load the generated dataset as a pandas DataFrame. +dataset = results.load_dataset() + +dataset.head() +``` + +```python +# Load the analysis results into memory. 
+analysis = results.load_analysis() + +analysis.to_report() +``` + +## ⏭️ Next Steps + +Check out the following tutorials to learn more about: + +- [Seeding synthetic data generation with an external dataset](/docs/tutorials/seeding-with-dataset) +- [Providing images as context](/docs/tutorials/images-as-context) diff --git a/fern/v0.3.3/pages/tutorials/the-basics.mdx b/fern/v0.3.3/pages/tutorials/the-basics.mdx new file mode 100644 index 00000000..34e1a388 --- /dev/null +++ b/fern/v0.3.3/pages/tutorials/the-basics.mdx @@ -0,0 +1,321 @@ +--- +title: "🎨 Data Designer Tutorial: The Basics" +description: Learn the fundamentals of Data Designer by generating a simple product review dataset. +--- + + +Run this tutorial interactively in [Google Colab](https://colab.research.google.com/github/NVIDIA-NeMo/DataDesigner/blob/main/docs/colab_notebooks/1-the-basics.ipynb). + + +#### 📚 What you'll learn + +This notebook demonstrates the basics of Data Designer by generating a simple product review dataset. + +### 📦 Import the essentials + +- The `essentials` module provides quick access to the most commonly used objects. + +```python +from data_designer.essentials import ( + CategorySamplerParams, + ChatCompletionInferenceParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMTextColumnConfig, + ModelConfig, + PersonFromFakerSamplerParams, + SamplerColumnConfig, + SamplerType, + SubcategorySamplerParams, + UniformSamplerParams, +) +``` + +### ⚙️ Initialize the Data Designer interface + +- `DataDesigner` is the main object is responsible for managing the data generation process. + +- When initialized without arguments, the [default model providers](/docs/concepts/models/default-model-settings) are used. + +```python +data_designer = DataDesigner() +``` + +### 🎛️ Define model configurations + +- Each `ModelConfig` defines a model that can be used during the generation process. + +- The "model alias" is used to reference the model in the Data Designer config (as we will see below). + +- The "model provider" is the external service that hosts the model (see the [model config](/docs/concepts/models/default-model-settings) docs for more details). + +- By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider. + +```python +# This name is set in the model provider configuration. +MODEL_PROVIDER = "nvidia" + +# The model ID is from build.nvidia.com. +MODEL_ID = "nvidia/nemotron-3-nano-30b-a3b" + +# We choose this alias to be descriptive for our use case. +MODEL_ALIAS = "nemotron-nano-v3" + +model_configs = [ + ModelConfig( + alias=MODEL_ALIAS, + model=MODEL_ID, + provider=MODEL_PROVIDER, + inference_parameters=ChatCompletionInferenceParams( + temperature=1.0, + top_p=1.0, + max_tokens=2048, + extra_body={"chat_template_kwargs": {"enable_thinking": False}}, + ), + ) +] +``` + +### 🏗️ Initialize the Data Designer Config Builder + +- The Data Designer config defines the dataset schema and generation process. + +- The config builder provides an intuitive interface for building this configuration. + +- The list of model configs is provided to the builder at initialization. + +```python +config_builder = DataDesignerConfigBuilder(model_configs=model_configs) +``` + +## 🎲 Getting started with sampler columns + +- Sampler columns offer non-LLM based generation of synthetic data. + +- They are particularly useful for **steering the diversity** of the generated data, as we demonstrate below. 
+ +You can view available samplers using the config builder's `info` property: + +```python +config_builder.info.display("samplers") +``` + +Let's start designing our product review dataset by adding product category and subcategory columns. + +```python +config_builder.add_column( + SamplerColumnConfig( + name="product_category", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Electronics", + "Clothing", + "Home & Kitchen", + "Books", + "Home Office", + ], + ), + ) +) + +config_builder.add_column( + SamplerColumnConfig( + name="product_subcategory", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="product_category", + values={ + "Electronics": [ + "Smartphones", + "Laptops", + "Headphones", + "Cameras", + "Accessories", + ], + "Clothing": [ + "Men's Clothing", + "Women's Clothing", + "Winter Coats", + "Activewear", + "Accessories", + ], + "Home & Kitchen": [ + "Appliances", + "Cookware", + "Furniture", + "Decor", + "Organization", + ], + "Books": [ + "Fiction", + "Non-Fiction", + "Self-Help", + "Textbooks", + "Classics", + ], + "Home Office": [ + "Desks", + "Chairs", + "Storage", + "Office Supplies", + "Lighting", + ], + }, + ), + ) +) + +config_builder.add_column( + SamplerColumnConfig( + name="target_age_range", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=["18-25", "25-35", "35-50", "50-65", "65+"]), + ) +) + +# Optionally validate that the columns are configured correctly. +data_designer.validate(config_builder) +``` + +Next, let's add samplers to generate data related to the customer and their review. + +```python +config_builder.add_column( + SamplerColumnConfig( + name="customer", + sampler_type=SamplerType.PERSON_FROM_FAKER, + params=PersonFromFakerSamplerParams(age_range=[18, 70], locale="en_US"), + ) +) + +config_builder.add_column( + SamplerColumnConfig( + name="number_of_stars", + sampler_type=SamplerType.UNIFORM, + params=UniformSamplerParams(low=1, high=5), + convert_to="int", # Convert the sampled float to an integer. + ) +) + +config_builder.add_column( + SamplerColumnConfig( + name="review_style", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=["rambling", "brief", "detailed", "structured with bullet points"], + weights=[1, 2, 2, 1], + ), + ) +) + +data_designer.validate(config_builder) +``` + +## 🦜 LLM-generated columns + +- The real power of Data Designer comes from leveraging LLMs to generate text, code, and structured data. + +- When prompting the LLM, we can use Jinja templating to reference other columns in the dataset. + +- As we see below, nested json fields can be accessed using dot notation. + +```python +config_builder.add_column( + LLMTextColumnConfig( + name="product_name", + prompt=( + "You are a helpful assistant that generates product names. DO NOT add quotes around the product name.\n\n" + "Come up with a creative product name for a product in the '{{ product_category }}' category, focusing " + "on products related to '{{ product_subcategory }}'. The target age range of the ideal customer is " + "{{ target_age_range }} years old. Respond with only the product name, no other text." + ), + model_alias=MODEL_ALIAS, + ) +) + +config_builder.add_column( + LLMTextColumnConfig( + name="customer_review", + prompt=( + "You are a customer named {{ customer.first_name }} from {{ customer.city }}, {{ customer.state }}. " + "You are {{ customer.age }} years old and recently purchased a product called {{ product_name }}. 
" + "Write a review of this product, which you gave a rating of {{ number_of_stars }} stars. " + "The style of the review should be '{{ review_style }}'. " + "Respond with only the review, no other text." + ), + model_alias=MODEL_ALIAS, + ) +) + +data_designer.validate(config_builder) +``` + +### 🔁 Iteration is key – preview the dataset! + +1. Use the `preview` method to generate a sample of records quickly. + +2. Inspect the results for quality and format issues. + +3. Adjust column configurations, prompts, or parameters as needed. + +4. Re-run the preview until satisfied. + +```python +preview = data_designer.preview(config_builder, num_records=2) +``` + +```python +# Run this cell multiple times to cycle through the 2 preview records. +preview.display_sample_record() +``` + +```python +# The preview dataset is available as a pandas DataFrame. +preview.dataset +``` + +### 📊 Analyze the generated data + +- Data Designer automatically generates a basic statistical analysis of the generated data. + +- This analysis is available via the `analysis` property of generation result objects. + +```python +# Print the analysis as a table. +preview.analysis.to_report() +``` + +### 🆙 Scale up! + +- Happy with your preview data? + +- Use the `create` method to submit larger Data Designer generation jobs. + +```python +results = data_designer.create(config_builder, num_records=10, dataset_name="tutorial-1") +``` + +```python +# Load the generated dataset as a pandas DataFrame. +dataset = results.load_dataset() + +dataset.head() +``` + +```python +# Load the analysis results into memory. +analysis = results.load_analysis() + +analysis.to_report() +``` + +## ⏭️ Next Steps + +Now that you've seen the basics of Data Designer, check out the following tutorials to learn more about: + +- [Structured outputs and jinja expressions](/docs/tutorials/structured-outputs) + +- [Seeding synthetic data generation with an external dataset](/docs/tutorials/seeding-with-dataset) + +- [Providing images as context](/docs/tutorials/images-as-context) diff --git a/fern/versions/v0.3.3.yml b/fern/versions/v0.3.3.yml new file mode 100644 index 00000000..034d93aa --- /dev/null +++ b/fern/versions/v0.3.3.yml @@ -0,0 +1,101 @@ +tabs: + docs: + display-name: Documentation + slug: docs + api: + display-name: API Reference + slug: api + +navigation: + - tab: docs + layout: + - section: Getting Started + contents: + - page: Welcome + path: ../v0.3.3/pages/index.mdx + - page: Installation + path: ../v0.3.3/pages/installation.mdx + - page: Quick Start + path: ../v0.3.3/pages/quick-start.mdx + - page: Contributing + path: ../v0.3.3/pages/contributing.mdx + - section: Concepts + contents: + - section: Models + contents: + - page: Default Model Settings + path: ../v0.3.3/pages/concepts/models/default-model-settings.mdx + - page: Custom Model Settings + path: ../v0.3.3/pages/concepts/models/custom-model-settings.mdx + - page: Configure with CLI + path: ../v0.3.3/pages/concepts/models/configure-with-cli.mdx + - page: Model Providers + path: ../v0.3.3/pages/concepts/models/model-providers.mdx + - page: Model Configs + path: ../v0.3.3/pages/concepts/models/model-configs.mdx + - page: Inference Parameters + path: ../v0.3.3/pages/concepts/models/inference-parameters.mdx + - page: Columns + path: ../v0.3.3/pages/concepts/columns.mdx + - page: Validators + path: ../v0.3.3/pages/concepts/validators.mdx + - page: Processors + path: ../v0.3.3/pages/concepts/processors.mdx + - page: Person Sampling + path: ../v0.3.3/pages/concepts/person-sampling.mdx 
+ - section: Tutorials + contents: + - page: Overview + path: ../v0.3.3/pages/tutorials/overview.mdx + - page: The Basics + path: ../v0.3.3/pages/tutorials/the-basics.mdx + - page: Structured Outputs + path: ../v0.3.3/pages/tutorials/structured-outputs.mdx + - page: Seeding with a Dataset + path: ../v0.3.3/pages/tutorials/seeding-with-dataset.mdx + - page: Images as Context + path: ../v0.3.3/pages/tutorials/images-as-context.mdx + - section: Recipes + contents: + - page: Recipe Cards + path: ../v0.3.3/pages/recipes/index.mdx + - section: Code Generation + contents: + - page: Text to Python + path: ../v0.3.3/pages/recipes/code-generation/text-to-python.mdx + - page: Text to SQL + path: ../v0.3.3/pages/recipes/code-generation/text-to-sql.mdx + - section: QA and Chat + contents: + - page: Product Info QA + path: ../v0.3.3/pages/recipes/qa-and-chat/product-info-qa.mdx + - page: Multi-Turn Chat + path: ../v0.3.3/pages/recipes/qa-and-chat/multi-turn-chat.mdx + - section: Plugins + contents: + - page: Overview + path: ../v0.3.3/pages/plugins/overview.mdx + - page: Example Plugin + path: ../v0.3.3/pages/plugins/example.mdx + - page: Available Plugins + path: ../v0.3.3/pages/plugins/available.mdx + - tab: api + layout: + - section: API Reference + contents: + - page: Models + path: ../v0.3.3/pages/api-reference/models.mdx + - page: Column Configs + path: ../v0.3.3/pages/api-reference/column-configs.mdx + - page: Config Builder + path: ../v0.3.3/pages/api-reference/config-builder.mdx + - page: Data Designer Config + path: ../v0.3.3/pages/api-reference/data-designer-config.mdx + - page: Sampler Params + path: ../v0.3.3/pages/api-reference/sampler-params.mdx + - page: Validator Params + path: ../v0.3.3/pages/api-reference/validator-params.mdx + - page: Processors + path: ../v0.3.3/pages/api-reference/processors.mdx + - page: Analysis + path: ../v0.3.3/pages/api-reference/analysis.mdx diff --git a/fern/versions/v0.4.0.yml b/fern/versions/v0.4.0.yml new file mode 100644 index 00000000..e6a3fcf1 --- /dev/null +++ b/fern/versions/v0.4.0.yml @@ -0,0 +1,12 @@ +tabs: + docs: + display-name: Documentation + slug: docs + +navigation: + - tab: docs + layout: + - section: Getting Started + contents: + - page: Welcome + path: ../v0.4.0/pages/index.mdx diff --git a/scripts/fern_migration/convert_admonitions.py b/scripts/fern_migration/convert_admonitions.py new file mode 100644 index 00000000..bbfab359 --- /dev/null +++ b/scripts/fern_migration/convert_admonitions.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Convert MkDocs admonitions to Fern callouts.""" +import re +import sys + +ADMONITION_MAP = { + "note": "Note", + "tip": "Tip", + "info": "Info", + "warning": "Warning", + "danger": "Warning", + "question": "Info", + "example": "Info", + "abstract": "Note", + "success": "Tip", + "failure": "Warning", + "bug": "Warning", +} + + +def convert_admonitions(content: str) -> str: + """Convert !!! admonitions to components.""" + pattern = r'!!! 
(\w+)(?: "([^"]*)")?\n((?:    .*\n?)*)'
+
+    def replace(match: re.Match) -> str:
+        admon_type = match.group(1).lower()
+        title = match.group(2) or ""
+        body = match.group(3)
+        # Remove 4-space indent from body
+        body = re.sub(r"^    ", "", body, flags=re.MULTILINE).strip()
+        fern_type = ADMONITION_MAP.get(admon_type, "Note")
+        if title:
+            return f'<{fern_type} title="{title}">\n{body}\n</{fern_type}>\n\n'
+        return f"<{fern_type}>\n{body}\n</{fern_type}>\n\n"
+
+    return re.sub(pattern, replace, content)
+
+
+if __name__ == "__main__":
+    content = sys.stdin.read()
+    print(convert_admonitions(content))
diff --git a/scripts/fern_migration/convert_tabs.py b/scripts/fern_migration/convert_tabs.py
new file mode 100644
index 00000000..f5a7dcca
--- /dev/null
+++ b/scripts/fern_migration/convert_tabs.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Convert MkDocs tabs to Fern Tabs components."""
+import re
+import sys
+
+
+def convert_tabs(content: str) -> str:
+    """Convert === tabs to <Tabs> components."""
+    # Match tab groups
+    pattern = r'((?:=== "([^"]+)"\n((?:    .*\n?)*)\n?)+)'
+
+    def replace_group(match: re.Match) -> str:
+        group = match.group(0)
+        tabs = re.findall(r'=== "([^"]+)"\n((?:    .*\n?)*)', group)
+        result = ["<Tabs>"]
+        for title, body in tabs:
+            body = re.sub(r"^    ", "", body, flags=re.MULTILINE).strip()
+            # Indent the body content properly
+            body_lines = body.split("\n")
+            indented_body = "\n".join(["    " + line if line.strip() else "" for line in body_lines])
+            result.append(f'  <Tab title="{title}">')
+            result.append(indented_body)
+            result.append("  </Tab>")
+        result.append("</Tabs>")
+        return "\n".join(result) + "\n"
+
+    return re.sub(pattern, replace_group, content)
+
+
+if __name__ == "__main__":
+    content = sys.stdin.read()
+    print(convert_tabs(content))
diff --git a/scripts/fern_migration/notebook_to_mdx.py b/scripts/fern_migration/notebook_to_mdx.py
new file mode 100644
index 00000000..88ab37bd
--- /dev/null
+++ b/scripts/fern_migration/notebook_to_mdx.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 +"""Convert Jupyter notebook source (.py format) to MDX.""" +import re +import sys +from pathlib import Path + + +def notebook_py_to_mdx(notebook_path: str, colab_url: str, title: str | None = None) -> str: + """Convert a Jupyter notebook source file (.py with Jupytext format) to MDX format.""" + with open(notebook_path) as f: + content = f.read() + + # Extract title from the notebook if not provided + if title is None: + title_match = re.search(r"# # (.+)", content) + if title_match: + title = title_match.group(1).strip() + # Remove emoji if present + title = re.sub(r"^[🎨📓🏥]\s*", "", title) + else: + title = Path(notebook_path).stem.replace("-", " ").title() + + lines = [ + "---", + f"title: {title}", + "---", + "", + '', + f"Run this tutorial interactively in [Google Colab]({colab_url}).", + "", + "", + ] + + # Process the notebook content + in_markdown_block = False + in_code_block = False + current_content = [] + + for line in content.split("\n"): + # Skip Jupytext header + if line.startswith("# ---") or line.startswith("# "): + continue + + # Markdown cell marker + if line == "# %% [markdown]": + if in_code_block: + lines.append("```") + lines.append("") + in_code_block = False + in_markdown_block = True + continue + + # Code cell marker + if line == "# %%": + if in_markdown_block: + in_markdown_block = False + if in_code_block: + lines.append("```") + lines.append("") + lines.append("```python") + in_code_block = True + continue + + # Process content + if in_markdown_block: + # Remove the '# ' prefix from markdown lines + if line.startswith("# "): + lines.append(line[2:]) + elif line == "#": + lines.append("") + else: + lines.append(line) + elif in_code_block: + lines.append(line) + + # Close any open code block + if in_code_block: + lines.append("```") + lines.append("") + + return "\n".join(lines) + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print("Usage: notebook_to_mdx.py [title]") + sys.exit(1) + title = sys.argv[3] if len(sys.argv) > 3 else None + print(notebook_py_to_mdx(sys.argv[1], sys.argv[2], title))
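The two converters above are stdin/stdout filters and the notebook converter is argv-driven, so a page can be migrated by chaining them. A minimal sketch of how they might be invoked; the input paths and the Jupytext `.py` notebook export are illustrative assumptions, only the script interfaces come from the code above:

```bash
# Convert MkDocs admonitions and tab groups in one Markdown page to Fern MDX.
python scripts/fern_migration/convert_admonitions.py < docs/concepts/columns.md \
  | python scripts/fern_migration/convert_tabs.py \
  > fern/v0.3.3/pages/concepts/columns.mdx

# Convert a Jupytext (.py) notebook source into an MDX tutorial page.
python scripts/fern_migration/notebook_to_mdx.py \
  docs/colab_notebooks/1-the-basics.py \
  "https://colab.research.google.com/github/NVIDIA-NeMo/DataDesigner/blob/main/docs/colab_notebooks/1-the-basics.ipynb" \
  > fern/v0.3.3/pages/tutorials/the-basics.mdx
```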