diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 4e9cd336..466ddb34 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,23 +1,24 @@ // For format details, see https://aka.ms/devcontainer.json. For config options, see the // README at: https://github.com/devcontainers/templates/tree/main/src/python { - "name": "Python 3", + "name": "Data Formulator Dev", // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile - "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye", + "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye", // Features to add to the dev container. More info: https://containers.dev/features. - "features": { - "ghcr.io/devcontainers/features/node:1": { - "version": "18" - }, - "ghcr.io/devcontainers/features/azure-cli:1": {} - }, + "features": { + "ghcr.io/devcontainers/features/node:1": { + "version": "18" + }, + "ghcr.io/devcontainers/features/azure-cli:1": {}, + "ghcr.io/astral-sh/uv:1": {} + }, // Use 'forwardPorts' to make a list of ports inside the container available locally. - // "forwardPorts": [], + "forwardPorts": [5000, 5173], // Use 'postCreateCommand' to run commands after the container is created. - "postCreateCommand": "cd /workspaces/data-formulator && npm install && npm run build && python3 -m venv /workspaces/data-formulator/venv && . /workspaces/data-formulator/venv/bin/activate && pip install -e /workspaces/data-formulator --verbose && data_formulator" + "postCreateCommand": "cd /workspaces/data-formulator && npm install && npm run build && uv sync && uv run data_formulator" // Configure tool-specific properties. // "customizations": {}, diff --git a/.env.template b/.env.template index 2405af77..0d9d9a46 100644 --- a/.env.template +++ b/.env.template @@ -3,6 +3,4 @@ # python -m data_formulator -p 5000 --exec-python-in-subprocess true --disable-display-keys true DISABLE_DISPLAY_KEYS=false # if true, the display keys will not be shown in the frontend -EXEC_PYTHON_IN_SUBPROCESS=false # if true, the python code will be executed in a subprocess to avoid crashing the main app, but it will increase the time of response - -LOCAL_DB_DIR= # the directory to store the local database, if not provided, the app will use the temp directory \ No newline at end of file +EXEC_PYTHON_IN_SUBPROCESS=false # if true, the python code will be executed in a subprocess to avoid crashing the main app, but it will increase the time of response \ No newline at end of file diff --git a/.gitignore b/.gitignore index f3420acd..6b8ca6c3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *env +.venv/ *api-keys.env **/*.ipynb_checkpoints/ .DS_Store diff --git a/.python-version b/.python-version new file mode 100644 index 00000000..2c073331 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 6467a874..3a483e25 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -2,16 +2,34 @@ How to set up your local machine. ## Prerequisites -* Python > 3.11 +* Python >= 3.11 * Node.js * Yarn +* [uv](https://docs.astral.sh/uv/) (recommended) or pip ## Backend (Python) +### Option 1: With uv (recommended) + +uv is faster and provides reproducible builds via lockfile. 
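If uv is not installed yet, the commands below are a quick way to get it (they follow uv's published install instructions at the time of writing and may change; see the link above for the current ones):

```bash
# Install uv first if needed (see https://docs.astral.sh/uv/ for current instructions)
curl -LsSf https://astral.sh/uv/install.sh | sh   # macOS / Linux standalone installer
# or, if you already have a working Python/pip:
pip install uv
```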
+ +```bash +uv sync # Creates .venv and installs all dependencies +uv run data_formulator # Run app (opens browser automatically) +uv run data_formulator --dev # Run backend only (for frontend development) +``` + +**Which command to use:** +- **End users / testing the full app**: `uv run data_formulator` - starts server and opens browser to http://localhost:5000 +- **Frontend development**: `uv run data_formulator --dev` - starts backend server only, then run `yarn start` separately for the Vite dev server on http://localhost:5173 + +### Option 2: With pip (fallback) + - **Create a Virtual Environment** ```bash python -m venv venv - .\venv\Scripts\activate + source venv/bin/activate # Unix + # or .\venv\Scripts\activate # Windows ``` - **Install Dependencies** @@ -29,7 +47,6 @@ How to set up your local machine. - configure settings as needed: - DISABLE_DISPLAY_KEYS: if true, API keys will not be shown in the frontend - EXEC_PYTHON_IN_SUBPROCESS: if true, Python code runs in a subprocess (safer but slower), you may consider setting it true when you are hosting Data Formulator for others - - LOCAL_DB_DIR: directory to store the local database (uses temp directory if not set) - External database settings (when USE_EXTERNAL_DB=true): - DB_NAME: name to refer to this database connection - DB_TYPE: mysql or postgresql (currently only these two are supported) @@ -41,14 +58,16 @@ How to set up your local machine. - **Run the app** - - **Windows** - ```bash - .\local_server.bat - ``` - - - **Unix-based** ```bash + # Unix ./local_server.sh + + # Windows + .\local_server.bat + + # Or directly + data_formulator # Opens browser automatically + data_formulator --dev # Backend only (for frontend development) ``` ## Frontend (TypeScript) @@ -61,7 +80,12 @@ How to set up your local machine. - **Development mode** - Run the front-end in development mode using, allowing real-time edits and previews: + First, start the backend server (in a separate terminal): + ```bash + uv run data_formulator --dev # or ./local_server.sh + ``` + + Then, run the frontend in development mode with hot reloading: ```bash yarn start ``` @@ -81,6 +105,10 @@ How to set up your local machine. Then, build python package: ```bash + # With uv + uv build + + # Or with pip pip install build python -m build ``` @@ -112,23 +140,23 @@ How to set up your local machine. When deploying Data Formulator to production, please be aware of the following security considerations: -### Database Storage Security +### Database and Data Storage Security -1. **Local DuckDB Files**: When database functionality is enabled (default), Data Formulator stores DuckDB database files locally on the server. These files contain user data and are stored in the system's temporary directory or a configured `LOCAL_DB_DIR`. +1. **Workspace and table data**: Table data is stored in per-identity workspaces (e.g. parquet files). DuckDB is used only in-memory per request when needed (e.g. for SQL mode); no persistent DuckDB database files are created by the app. -2. **Session Management**: - - When database is **enabled**: Session IDs are stored in Flask sessions (cookies) and linked to local DuckDB files - - When database is **disabled**: No persistent storage is used, and no cookies are set. Session IDs are generated per request for API consistency +2. 
**Identity Management**: + - Each user's data is isolated by a namespaced identity key (e.g., `user:alice@example.com` or `browser:550e8400-...`) + - Anonymous users get a browser-based UUID stored in localStorage + - Authenticated users get their verified user ID from the auth provider -3. **Data Persistence**: User data processed through Data Formulator may be temporarily stored in these local DuckDB files, which could be a security risk in multi-tenant environments. +3. **Data persistence**: User data may be written to workspace storage (e.g. parquet) on the server. In multi-tenant deployments, ensure workspace directories are isolated and access-controlled. ### Recommended Security Measures For production deployment, consider: -1. **Use `--disable-database` flag** for stateless deployments where no data persistence is needed +1. **Use `--disable-database` flag** to disable table-connector routes when you do not need external or uploaded table support 2. **Implement proper authentication, authorization, and other security measures** as needed for your specific use case, for example: - - Store DuckDB file in a database - User authentication (OAuth, JWT tokens, etc.) - Role-based access control - API rate limiting @@ -142,5 +170,90 @@ For production deployment, consider: python -m data_formulator.app --disable-database ``` +## Authentication Architecture + +Data Formulator supports a **hybrid identity system** that supports both anonymous and authenticated users. + +### Identity Flow Overview + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ Frontend Request │ +├─────────────────────────────────────────────────────────────────────┤ +│ Headers: │ +│ X-Identity-Id: "browser:550e8400-..." (namespace sent by client) │ +│ Authorization: Bearer (if custom auth implemented) │ +│ (Azure also adds X-MS-CLIENT-PRINCIPAL-ID automatically) │ +└─────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ Backend Identity Resolution │ +│ (auth.py: get_identity_id) │ +├─────────────────────────────────────────────────────────────────────┤ +│ Priority 1: Azure X-MS-CLIENT-PRINCIPAL-ID → "user:" │ +│ Priority 2: JWT Bearer token (if implemented) → "user:" │ +│ Priority 3: X-Identity-Id header → ALWAYS "browser:" │ +│ (client-provided namespace is IGNORED for security) │ +└─────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ Storage Isolation │ +├─────────────────────────────────────────────────────────────────────┤ +│ "user:alice@example.com" → alice's DuckDB file (ONLY via auth) │ +│ "browser:550e8400-..." → anonymous user's DuckDB file │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### Security Model + +**Critical Security Rule:** The backend NEVER trusts the namespace prefix from the client-provided `X-Identity-Id` header. Even if a client sends `X-Identity-Id: "user:alice@..."`, the backend strips the prefix and forces `browser:alice@...`. Only verified authentication (Azure headers or JWT) can result in a `user:` prefixed identity. 
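For reference, a minimal sketch of this resolution order (illustrative only: the helper name and the anonymous fallback are assumptions, and the actual implementation is `get_identity_id()` in `auth.py`):

```python
from flask import request

def resolve_identity_id() -> str:
    """Sketch of the priority order described above (hypothetical helper)."""
    # Priority 1: Azure EasyAuth injects a verified principal header.
    azure_principal = request.headers.get("X-MS-CLIENT-PRINCIPAL-ID")
    if azure_principal:
        return f"user:{azure_principal}"

    # Priority 2: a verified JWT bearer token (only if custom auth is implemented).
    # user_id = verify_jwt(request.headers.get("Authorization"))  # placeholder
    # if user_id:
    #     return f"user:{user_id}"

    # Priority 3: client-supplied ID -- the namespace prefix is never trusted,
    # so whatever the client sent is re-prefixed with "browser:".
    raw = request.headers.get("X-Identity-Id", "")
    unprefixed = raw.split(":", 1)[-1] if raw else "anonymous"
    return f"browser:{unprefixed}"
```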
+ +The key security principle is **namespaced isolation with forced prefixing**: + +| Scenario | X-Identity-Id Sent | Backend Resolution | Storage Key | +|----------|-------------------|-------------------|-------------| +| Anonymous user | `browser:550e8400-...` | Strips prefix, forces `browser:` | `browser:550e8400-...` | +| Azure logged-in user | `browser:550e8400-...` | Uses Azure header (priority 1) | `user:alice@...` | +| Attacker spoofing | `user:alice@...` (forged) | No valid auth, strips & forces `browser:` | `browser:alice@...` | + +**Why this is secure:** An attacker sending `X-Identity-Id: user:alice@...` gets `browser:alice@...` as their storage key, which is completely separate from the real `user:alice@...` that only authenticated Alice can access. + +### Implementing Custom Authentication + +To add JWT-based authentication: + +1. **Backend** (`tables_routes.py`): Uncomment and configure the JWT verification code in `get_identity_id()` +2. **Frontend** (`utils.tsx`): Implement `getAuthToken()` to retrieve the JWT from your auth context +3. **Add JWT secret** to Flask config: `current_app.config['JWT_SECRET']` + +### Azure App Service Authentication + +When deployed to Azure with EasyAuth enabled: +- Azure automatically adds `X-MS-CLIENT-PRINCIPAL-ID` header to authenticated requests +- The backend reads this header first (highest priority) +- No frontend changes needed - Azure handles the auth flow + +### Frontend Identity Management + +The frontend (`src/app/identity.ts`) manages identity as follows: + +```typescript +// Identity is always initialized with browser ID +identity: { type: 'browser', id: getBrowserId() } + +// If user logs in (e.g., via Azure), it's updated to: +identity: { type: 'user', id: userInfo.userId } + +// All API requests send namespaced identity: +// X-Identity-Id: "browser:550e8400-..." or "user:alice@..." +``` + +This ensures: +1. **Anonymous users**: Work immediately with localStorage-based browser ID +2. **Logged-in users**: Get their verified user ID from the auth provider +3. **Cross-tab consistency**: Browser ID is shared via localStorage across all tabs + ## Usage See the [Usage section on the README.md page](README.md#usage). diff --git a/README.md b/README.md index 5b4f9e84..db7ff79a 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@

Try Online Demo   - Install Locally + Install Locally

@@ -32,6 +32,9 @@ https://github.com/user-attachments/assets/8ca57b68-4d7a-42cb-bcce-43f8b1681ce2 ## News 🔥🔥🔥 +[01-31-2025] **uv support** — Faster installation with uv +- 🚀 **Install with uv**: Data Formulator now supports installation via [uv](https://docs.astral.sh/uv/), the ultra-fast Python package manager. Get started in seconds with `uvx data_formulator` or `uv pip install data_formulator`. + [01-25-2025] **Data Formulator 0.6** — Real-time insights from live data - ⚡ **Connect to live data**: Connect to URLs and databases with automatic refresh intervals. Visualizations update automatically as your data changes to provide you live insights. [Demo: track international space station position speed live](https://github.com/microsoft/data-formulator/releases/tag/0.6) - 🎨 **UI Updates**: Unified UI for data loading; direct drag-and-drop fields from the data table to update visualization designs. @@ -127,9 +130,30 @@ Data Formulator enables analysts to iteratively explore and visualize data. Star Play with Data Formulator with one of the following options: -- **Option 1: Install via Python PIP** +- **Option 1: Install via uv (recommended)** + + [uv](https://docs.astral.sh/uv/) is an extremely fast Python package manager. If you have uv installed, you can run Data Formulator directly without any setup: + + ```bash + # Run data formulator directly (no install needed) + uvx data_formulator + ``` + + Or install it in a project/virtual environment: + + ```bash + # Install data_formulator + uv pip install data_formulator + + # Run data formulator + python -m data_formulator + ``` + + Data Formulator will be automatically opened in the browser at [http://localhost:5000](http://localhost:5000). + +- **Option 2: Install via pip** - Use Python PIP for an easy setup experience, running locally (recommend: install it in a virtual environment). + Use pip for installation (recommend: install it in a virtual environment). ```bash # install data_formulator @@ -143,13 +167,13 @@ Play with Data Formulator with one of the following options: *you can specify the port number (e.g., 8080) by `python -m data_formulator --port 8080` if the default port is occupied.* -- **Option 2: Codespaces (5 minutes)** +- **Option 3: Codespaces (5 minutes)** You can also run Data Formulator in Codespaces; we have everything pre-configured. For more details, see [CODESPACES.md](CODESPACES.md). [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/microsoft/data-formulator?quickstart=1) -- **Option 3: Working in the developer mode** +- **Option 4: Working in the developer mode** You can build Data Formulator locally if you prefer full control over your development environment and develop your own version on top. For detailed instructions, refer to [DEVELOPMENT.md](DEVELOPMENT.md). 
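As a quick reference, the new install paths condense to the commands below (assuming uv is installed; `--port` is the same flag documented above for `python -m data_formulator`):

```bash
# Run straight from PyPI without installing into a project
uvx data_formulator                # opens the app at http://localhost:5000
uvx data_formulator --port 8080    # use another port if 5000 is occupied
```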
diff --git a/local_server.bat b/local_server.bat index b585d712..36026cf9 100644 --- a/local_server.bat +++ b/local_server.bat @@ -7,4 +7,11 @@ :: set https_proxy=http://127.0.0.1:7890 set FLASK_RUN_PORT=5000 -python -m py-src.data_formulator.app --port %FLASK_RUN_PORT% --dev + +:: Use uv if available, otherwise fall back to python +where uv >nul 2>nul +if %ERRORLEVEL% EQU 0 ( + uv run data_formulator --port %FLASK_RUN_PORT% --dev +) else ( + python -m data_formulator.app --port %FLASK_RUN_PORT% --dev +) diff --git a/local_server.sh b/local_server.sh index 0df7db89..fbba1e3b 100644 --- a/local_server.sh +++ b/local_server.sh @@ -5,6 +5,11 @@ # export http_proxy=http://127.0.0.1:7890 # export https_proxy=http://127.0.0.1:7890 -#env FLASK_APP=py-src/data_formulator/app.py FLASK_RUN_PORT=5000 FLASK_RUN_HOST=0.0.0.0 flask run export FLASK_RUN_PORT=5000 -python -m py-src.data_formulator.app --port ${FLASK_RUN_PORT} --dev \ No newline at end of file + +# Use uv if available, otherwise fall back to python +if command -v uv &> /dev/null; then + uv run data_formulator --port ${FLASK_RUN_PORT} --dev +else + python -m data_formulator.app --port ${FLASK_RUN_PORT} --dev +fi \ No newline at end of file diff --git a/public/screenshot-stock-price-live.webp b/public/screenshot-stock-price-live.webp new file mode 100644 index 00000000..b0ebe71b Binary files /dev/null and b/public/screenshot-stock-price-live.webp differ diff --git a/py-src/data_formulator/__init__.py b/py-src/data_formulator/__init__.py index 2f2fd61f..ee0d133d 100644 --- a/py-src/data_formulator/__init__.py +++ b/py-src/data_formulator/__init__.py @@ -3,7 +3,7 @@ def run_app(): """Launch the Data Formulator Flask application.""" - # Import app only when actually running to avoid side effects + # Import app only when actually running to avoid heavy imports at package load from data_formulator.app import run_app as _run_app return _run_app() diff --git a/py-src/data_formulator/agent_routes.py b/py-src/data_formulator/agent_routes.py index 3de374bb..271d80e8 100644 --- a/py-src/data_formulator/agent_routes.py +++ b/py-src/data_formulator/agent_routes.py @@ -12,22 +12,19 @@ mimetypes.add_type('application/javascript', '.mjs') import flask -from flask import request, session, jsonify, Blueprint, current_app, Response, stream_with_context +from flask import request, jsonify, Blueprint, current_app, Response, stream_with_context import logging import json import html import pandas as pd -from data_formulator.agents.agent_concept_derive import ConceptDeriveAgent -from data_formulator.agents.agent_py_concept_derive import PyConceptDeriveAgent - -from data_formulator.agents.agent_py_data_transform import PythonDataTransformationAgent -from data_formulator.agents.agent_sql_data_transform import SQLDataTransformationAgent -from data_formulator.agents.agent_py_data_rec import PythonDataRecAgent -from data_formulator.agents.agent_sql_data_rec import SQLDataRecAgent +from data_formulator.agents.agent_data_transform import DataTransformationAgent +from data_formulator.agents.agent_data_rec import DataRecAgent from data_formulator.agents.agent_sort_data import SortDataAgent +from data_formulator.auth import get_identity_id +from data_formulator.datalake.workspace import Workspace, WorkspaceWithTempData from data_formulator.agents.agent_data_load import DataLoadAgent from data_formulator.agents.agent_data_clean import DataCleanAgent from data_formulator.agents.agent_data_clean_stream import DataCleanAgentStream @@ -36,12 +33,26 @@ from 
data_formulator.agents.agent_report_gen import ReportGenAgent from data_formulator.agents.client_utils import Client -from data_formulator.db_manager import db_manager from data_formulator.workflows.exploration_flow import run_exploration_flow_streaming # Get logger for this module (logging config done in app.py) logger = logging.getLogger(__name__) + +def get_temp_tables(workspace, input_tables: list[dict]) -> list[dict]: + """ + Determine which input tables are temp tables (not persisted in the workspace datalake). + + Args: + workspace: The user's workspace instance + input_tables: List of table dicts with 'name' and 'rows' keys + + Returns: + List of table dicts that don't exist in the workspace (temp tables) + """ + existing_tables = set(workspace.list_tables()) + return [table for table in input_tables if table.get('name') not in existing_tables] + agent_bp = Blueprint('agent', __name__, url_prefix='/api/agent') def get_client(model_config): @@ -123,7 +134,7 @@ def sanitize_model_error(error_message: str) -> str: # Keep only the essential error info if len(message) > 500: # Truncate very long messages message = message[:500] + "..." - + return message @agent_bp.route('/test-model', methods=['GET', 'POST']) @@ -175,49 +186,30 @@ def process_data_on_load_request(): logger.info("# process data query: ") content = request.get_json() token = content["token"] + input_data = content["input_data"] client = get_client(content['model']) logger.info(f" model: {content['model']}") try: - conn = db_manager.get_connection(session['session_id']) - except Exception as e: - conn = None - - agent = DataLoadAgent(client=client, conn=conn) - - candidates = agent.run(content["input_data"]) - - candidates = [c['content'] for c in candidates if c['status'] == 'ok'] + # Get workspace (needed for both virtual and in-memory tables) + identity_id = get_identity_id() + workspace = Workspace(identity_id) - response = flask.jsonify({ "status": "ok", "token": token, "result": candidates }) - else: - response = flask.jsonify({ "token": -1, "status": "error", "result": [] }) - - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@agent_bp.route('/derive-concept-request', methods=['GET', 'POST']) -def derive_concept_request(): - - if request.is_json: - logger.info("# code query: ") - content = request.get_json() - token = content["token"] - - client = get_client(content['model']) - - logger.info(f" model: {content['model']}") - agent = ConceptDeriveAgent(client=client) - - candidates = agent.run(content["input_data"], [f['name'] for f in content["input_fields"]], - content["output_name"], content["description"]) - - candidates = [c['code'] for c in candidates if c['status'] == 'ok'] + # Check if input table is in workspace, if not add as temp data + input_tables = [{"name": input_data.get("name"), "rows": input_data.get("rows", [])}] + temp_data = get_temp_tables(workspace, input_tables) + + with WorkspaceWithTempData(workspace, temp_data) as workspace: + agent = DataLoadAgent(client=client, workspace=workspace) + candidates = agent.run(content["input_data"]) + candidates = [c['content'] for c in candidates if c['status'] == 'ok'] - response = flask.jsonify({ "status": "ok", "token": token, "result": candidates }) + response = flask.jsonify({ "status": "ok", "token": token, "result": candidates }) + except Exception as e: + logger.exception(e) + response = flask.jsonify({ "token": token, "status": "error", "result": [] }) else: response = flask.jsonify({ "token": -1, "status": "error", 
"result": [] }) @@ -225,29 +217,6 @@ def derive_concept_request(): return response -@agent_bp.route('/derive-py-concept', methods=['GET', 'POST']) -def derive_py_concept(): - - if request.is_json: - logger.info("# code query: ") - content = request.get_json() - token = content["token"] - - client = get_client(content['model']) - - logger.info(f" model: {content['model']}") - agent = PyConceptDeriveAgent(client=client) - - results = agent.run(content["input_data"], [f['name'] for f in content["input_fields"]], - content["output_name"], content["description"]) - - response = flask.jsonify({ "status": "ok", "token": token, "results": results }) - else: - response = flask.jsonify({ "token": -1, "status": "error", "results": [] }) - - response.headers.add('Access-Control-Allow-Origin', '*') - return response - @agent_bp.route('/clean-data', methods=['GET', 'POST']) def clean_data_request(): @@ -371,7 +340,6 @@ def derive_data(): chart_encodings = content.get("chart_encodings", {}) instruction = content["extra_prompt"] - language = content.get("language", "python") # whether to use sql or python, default to python max_repair_attempts = content["max_repair_attempts"] if "max_repair_attempts" in content else 1 agent_coding_rules = content.get("agent_coding_rules", "") @@ -395,33 +363,35 @@ def derive_data(): if chart_encodings == {}: mode = "recommendation" - conn = db_manager.get_connection(session['session_id']) if language == "sql" else None + identity_id = get_identity_id() + workspace = Workspace(identity_id) + temp_data = get_temp_tables(workspace, input_tables) + max_display_rows = current_app.config['CLI_ARGS']['max_display_rows'] - if mode == "recommendation": - # now it's in recommendation mode - agent = SQLDataRecAgent(client=client, conn=conn, agent_coding_rules=agent_coding_rules) if language == "sql" else PythonDataRecAgent(client=client, exec_python_in_subprocess=current_app.config['CLI_ARGS']['exec_python_in_subprocess'], agent_coding_rules=agent_coding_rules) - results = agent.run(input_tables, instruction, n=1, prev_messages=prev_messages) - else: - agent = SQLDataTransformationAgent(client=client, conn=conn, agent_coding_rules=agent_coding_rules) if language == "sql" else PythonDataTransformationAgent(client=client, exec_python_in_subprocess=current_app.config['CLI_ARGS']['exec_python_in_subprocess'], agent_coding_rules=agent_coding_rules) - results = agent.run(input_tables, instruction, chart_type, chart_encodings, prev_messages) + with WorkspaceWithTempData(workspace, temp_data) as workspace: + if mode == "recommendation": + # Use unified Python agent for recommendations + agent = DataRecAgent(client=client, workspace=workspace, agent_coding_rules=agent_coding_rules, max_display_rows=max_display_rows) + results = agent.run(input_tables, instruction, n=1, prev_messages=prev_messages) + else: + # Use unified Python agent that generates Python scripts with DuckDB + pandas + agent = DataTransformationAgent(client=client, workspace=workspace, agent_coding_rules=agent_coding_rules, max_display_rows=max_display_rows) + results = agent.run(input_tables, instruction, chart_type, chart_encodings, prev_messages) - repair_attempts = 0 - while results[0]['status'] == 'error' and repair_attempts < max_repair_attempts: # try up to n times - error_message = results[0]['content'] - new_instruction = f"We run into the following problem executing the code, please fix it:\n\n{error_message}\n\nPlease think step by step, reflect why the error happens and fix the code so that no more errors would 
occur." + repair_attempts = 0 + while results[0]['status'] == 'error' and repair_attempts < max_repair_attempts: + error_message = results[0]['content'] + new_instruction = f"We run into the following problem executing the code, please fix it:\n\n{error_message}\n\nPlease think step by step, reflect why the error happens and fix the code so that no more errors would occur." - prev_dialog = results[0]['dialog'] + prev_dialog = results[0]['dialog'] - if mode == "transform": - results = agent.followup(input_tables, prev_dialog, [], chart_type, chart_encodings, new_instruction, n=1) - if mode == "recommendation": - results = agent.followup(input_tables, prev_dialog, [], new_instruction, n=1) + if mode == "transform": + results = agent.followup(input_tables, prev_dialog, [], chart_type, chart_encodings, new_instruction, n=1) + if mode == "recommendation": + results = agent.followup(input_tables, prev_dialog, [], new_instruction, n=1) + + repair_attempts += 1 - repair_attempts += 1 - - if conn: - conn.close() - response = flask.jsonify({ "token": token, "status": "ok", "results": results }) else: response = flask.jsonify({ "token": "", "status": "error", "results": [] }) @@ -442,7 +412,6 @@ def generate(): # each table is a dict with {"name": xxx, "rows": [...]} input_tables = content["input_tables"] initial_plan = content["initial_plan"] # The exploration question - language = content.get("language", "python") # whether to use sql or python, default to python max_iterations = content.get("max_iterations", 3) # Number of exploration iterations max_repair_attempts = content.get("max_repair_attempts", 1) agent_exploration_rules = content.get("agent_exploration_rules", "") @@ -465,7 +434,8 @@ def generate(): "api_version": content['model'].get('api_version', '') } - session_id = session.get('session_id') if language == "sql" else None + # Get identity for workspace (used for both SQL and Python with WorkspaceWithTempData) + identity_id = get_identity_id() exec_python_in_subprocess = current_app.config['CLI_ARGS']['exec_python_in_subprocess'] try: @@ -473,8 +443,7 @@ def generate(): model_config=model_config, input_tables=input_tables, initial_plan=initial_plan, - language=language, - session_id=session_id, + session_id=identity_id, exec_python_in_subprocess=exec_python_in_subprocess, max_iterations=max_iterations, max_repair_attempts=max_repair_attempts, @@ -550,8 +519,6 @@ def refine_data(): latest_data_sample = content["latest_data_sample"] max_repair_attempts = content.get("max_repair_attempts", 1) agent_coding_rules = content.get("agent_coding_rules", "") - - language = content.get("language", "python") # whether to use sql or python, default to python logger.info("== input tables ===>") for table in input_tables: @@ -563,23 +530,24 @@ def refine_data(): logger.info(chart_encodings) logger.info(new_instruction) - conn = db_manager.get_connection(session['session_id']) if language == "sql" else None - - # always resort to the data transform agent - agent = SQLDataTransformationAgent(client=client, conn=conn, agent_coding_rules=agent_coding_rules) if language == "sql" else PythonDataTransformationAgent(client=client, exec_python_in_subprocess=current_app.config['CLI_ARGS']['exec_python_in_subprocess'], agent_coding_rules=agent_coding_rules) - results = agent.followup(input_tables, dialog, latest_data_sample, chart_type, chart_encodings, new_instruction, n=1) + identity_id = get_identity_id() + workspace = Workspace(identity_id) + temp_data = get_temp_tables(workspace, input_tables) + 
max_display_rows = current_app.config['CLI_ARGS']['max_display_rows'] - repair_attempts = 0 - while results[0]['status'] == 'error' and repair_attempts < max_repair_attempts: # only try once - error_message = results[0]['content'] - new_instruction = f"We run into the following problem executing the code, please fix it:\n\n{error_message}\n\nPlease think step by step, reflect why the error happens and fix the code so that no more errors would occur." - prev_dialog = results[0]['dialog'] + with WorkspaceWithTempData(workspace, temp_data) as workspace: + # Use unified Python agent for followup transformations + agent = DataTransformationAgent(client=client, workspace=workspace, agent_coding_rules=agent_coding_rules, max_display_rows=max_display_rows) + results = agent.followup(input_tables, dialog, latest_data_sample, chart_type, chart_encodings, new_instruction, n=1) - results = agent.followup(input_tables, prev_dialog, [], chart_type, chart_encodings, new_instruction, n=1) - repair_attempts += 1 + repair_attempts = 0 + while results[0]['status'] == 'error' and repair_attempts < max_repair_attempts: + error_message = results[0]['content'] + new_instruction = f"We run into the following problem executing the code, please fix it:\n\n{error_message}\n\nPlease think step by step, reflect why the error happens and fix the code so that no more errors would occur." + prev_dialog = results[0]['dialog'] - if conn: - conn.close() + results = agent.followup(input_tables, prev_dialog, [], chart_type, chart_encodings, new_instruction, n=1) + repair_attempts += 1 response = flask.jsonify({ "token": token, "status": "ok", "results": results}) else: @@ -592,25 +560,31 @@ def refine_data(): def request_code_expl(): if request.is_json: logger.info("# request data: ") - content = request.get_json() + content = request.get_json() client = get_client(content['model']) # each table is a dict with {"name": xxx, "rows": [...]} input_tables = content["input_tables"] code = content["code"] - - code_expl_agent = CodeExplanationAgent(client=client) - candidates = code_expl_agent.run(input_tables, code) - - # Return the first candidate's content as JSON - if candidates and len(candidates) > 0: - result = candidates[0] - if result['status'] == 'ok': - return jsonify(result) + + # Get workspace and mount temp data + identity_id = get_identity_id() + workspace = Workspace(identity_id) + temp_data = get_temp_tables(workspace, input_tables) + + with WorkspaceWithTempData(workspace, temp_data) as workspace: + code_expl_agent = CodeExplanationAgent(client=client, workspace=workspace) + candidates = code_expl_agent.run(input_tables, code) + + # Return the first candidate's content as JSON + if candidates and len(candidates) > 0: + result = candidates[0] + if result['status'] == 'ok': + return jsonify(result) + else: + return jsonify(result), 400 else: - return jsonify(result), 400 - else: - return jsonify({'error': 'No explanation generated'}), 400 + return jsonify({'error': 'No explanation generated'}), 400 else: return jsonify({'error': 'Invalid request format'}), 400 @@ -624,34 +598,35 @@ def generate(): client = get_client(content['model']) - language = content.get("language", "python") - if language == "sql": - db_conn = db_manager.get_connection(session['session_id']) - else: - db_conn = None + input_tables = content.get("input_tables", []) + identity_id = get_identity_id() + workspace = Workspace(identity_id) agent_exploration_rules = content.get("agent_exploration_rules", "") - agent = 
InteractiveExploreAgent(client=client, agent_exploration_rules=agent_exploration_rules, db_conn=db_conn) - - # Get input tables from the request - input_tables = content.get("input_tables", []) - - # Get exploration thread if provided (for context from previous explorations) mode = content.get("mode", "interactive") start_question = content.get("start_question", None) exploration_thread = content.get("exploration_thread", None) current_chart = content.get("current_chart", None) current_data_sample = content.get("current_data_sample", None) - try: - for chunk in agent.run(input_tables, start_question, exploration_thread, current_data_sample, current_chart, mode): - yield chunk - except Exception as e: - logger.error(e) - error_data = { - "content": "unable to process recommendation questions request" - } - yield 'error: ' + json.dumps(error_data) + '\n' + # Collect all tables that need to be in workspace: + # both the input tables and any tables from the exploration thread + all_tables = list(input_tables) + if exploration_thread: + all_tables.extend(exploration_thread) + temp_data = get_temp_tables(workspace, all_tables) if all_tables else None + + with WorkspaceWithTempData(workspace, temp_data) as workspace: + agent = InteractiveExploreAgent(client=client, workspace=workspace, agent_exploration_rules=agent_exploration_rules) + try: + for chunk in agent.run(input_tables, start_question, exploration_thread, current_data_sample, current_chart, mode): + yield chunk + except Exception as e: + logger.error(e) + error_data = { + "content": "unable to process recommendation questions request" + } + yield 'error: ' + json.dumps(error_data) + '\n' else: error_data = { "content": "Invalid request format" @@ -675,28 +650,24 @@ def generate(): client = get_client(content['model']) - language = content.get("language", "python") - if language == "sql": - db_conn = db_manager.get_connection(session['session_id']) - else: - db_conn = None - - agent = ReportGenAgent(client=client, conn=db_conn) - - # Get input tables and charts from the request input_tables = content.get("input_tables", []) charts = content.get("charts", []) style = content.get("style", "blog post") - - try: - for chunk in agent.stream(input_tables, charts, style): - yield chunk - except Exception as e: - logger.error(e) - error_data = { - "content": "unable to process report generation request" - } - yield 'error: ' + json.dumps(error_data) + '\n' + identity_id = get_identity_id() + workspace = Workspace(identity_id) + temp_data = get_temp_tables(workspace, input_tables) if input_tables else None + + with WorkspaceWithTempData(workspace, temp_data) as workspace: + agent = ReportGenAgent(client=client, workspace=workspace) + try: + for chunk in agent.stream(input_tables, charts, style): + yield chunk + except Exception as e: + logger.error(e) + error_data = { + "content": "unable to process report generation request" + } + yield 'error: ' + json.dumps(error_data) + '\n' else: error_data = { "content": "Invalid request format" @@ -714,24 +685,36 @@ def generate(): @agent_bp.route('/refresh-derived-data', methods=['POST']) def refresh_derived_data(): """ - Re-run Python transformation code with new input data to refresh a derived table. + Re-run Python transformation code with updated input data to refresh a derived table. + + This endpoint: + 1. Gets input tables from workspace (extending with temp data if needed) + 2. Re-runs the transformation code in workspace context + 3. 
Updates the derived table in workspace if virtual flag is true - This endpoint takes: + Request body: - input_tables: list of {name: string, rows: list} objects representing the parent tables - code: the Python transformation code to execute + - output_variable: the variable name containing the result DataFrame (required) + - output_table_name: the workspace table name to update with results (required if virtual=true) + - virtual: boolean flag indicating whether to save result to workspace Returns: - status: 'ok' or 'error' - - rows: the resulting rows if successful + - rows: the resulting rows if successful (limited to max_display_rows) + - virtual: {table_name: string, row_count: number} if output was saved to workspace - message: error message if failed """ try: - from data_formulator.py_sandbox import run_transform_in_sandbox2020 + from data_formulator.sandbox.py_sandbox import run_unified_transform_in_sandbox from flask import current_app data = request.get_json() input_tables = data.get('input_tables', []) code = data.get('code', '') + output_variable = data.get('output_variable') + output_table_name = data.get('output_table_name') + virtual = data.get('virtual', False) if not input_tables: return jsonify({ @@ -744,44 +727,74 @@ def refresh_derived_data(): "status": "error", "message": "No transformation code provided" }), 400 - - # Convert input tables to pandas DataFrames - df_list = [] - for table in input_tables: - table_name = table.get('name', '') - table_rows = table.get('rows', []) - - if not table_rows: - return jsonify({ - "status": "error", - "message": f"Table '{table_name}' has no rows" - }), 400 - - df = pd.DataFrame.from_records(table_rows) - df_list.append(df) - - # Get exec_python_in_subprocess setting from app config - exec_python_in_subprocess = current_app.config.get('CLI_ARGS', {}).get('exec_python_in_subprocess', False) - - # Run the transformation code - result = run_transform_in_sandbox2020(code, df_list, exec_python_in_subprocess) - - if result['status'] == 'ok': - result_df = result['content'] - - # Convert result DataFrame to list of records - rows = json.loads(result_df.to_json(orient='records', date_format='iso')) + if not output_variable: return jsonify({ - "status": "ok", - "rows": rows, - "message": "Successfully refreshed derived data" - }) - else: + "status": "error", + "message": "No output_variable provided" + }), 400 + + if virtual and not output_table_name: return jsonify({ "status": "error", - "message": result.get('content', 'Unknown error during transformation') + "message": "output_table_name is required when virtual=true" }), 400 + + # Get workspace and mount temp data for tables not in workspace + identity_id = get_identity_id() + workspace = Workspace(identity_id) + temp_data = get_temp_tables(workspace, input_tables) + + # Get settings from app config + exec_python_in_subprocess = current_app.config.get('CLI_ARGS', {}).get('exec_python_in_subprocess', False) + max_display_rows = current_app.config.get('CLI_ARGS', {}).get('max_display_rows', 5000) + + with WorkspaceWithTempData(workspace, temp_data) as workspace: + # Run the transformation code in workspace context + result = run_unified_transform_in_sandbox( + code=code, + workspace_path=workspace._path, + output_variable=output_variable, + exec_python_in_subprocess=exec_python_in_subprocess + ) + + if result['status'] == 'ok': + result_df = result['content'] + row_count = len(result_df) + + response_data = { + "status": "ok", + "message": "Successfully refreshed derived data", + 
"row_count": row_count + } + + if virtual: + # Virtual table: update workspace and return limited rows for display + workspace.write_parquet(result_df, output_table_name) + response_data["virtual"] = { + "table_name": output_table_name, + "row_count": row_count + } + # Limit rows for response payload since full data is in workspace + if row_count > max_display_rows: + display_df = result_df.head(max_display_rows) + else: + display_df = result_df + # Remove duplicate columns to avoid orient='records' error + display_df = display_df.loc[:, ~display_df.columns.duplicated()] + response_data["rows"] = json.loads(display_df.to_json(orient='records', date_format='iso')) + else: + # Temp table: return full data since there's no workspace storage + # Remove duplicate columns to avoid orient='records' error + result_df = result_df.loc[:, ~result_df.columns.duplicated()] + response_data["rows"] = json.loads(result_df.to_json(orient='records', date_format='iso')) + + return jsonify(response_data) + else: + return jsonify({ + "status": "error", + "message": result.get('content', 'Unknown error during transformation') + }), 400 except Exception as e: logger.error(f"Error refreshing derived data: {str(e)}") diff --git a/py-src/data_formulator/agents/__init__.py b/py-src/data_formulator/agents/__init__.py index d4186839..6ced4d6a 100644 --- a/py-src/data_formulator/agents/__init__.py +++ b/py-src/data_formulator/agents/__init__.py @@ -1,26 +1,19 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from data_formulator.agents.agent_concept_derive import ConceptDeriveAgent -from data_formulator.agents.agent_py_concept_derive import PyConceptDeriveAgent -from data_formulator.agents.agent_py_data_transform import PythonDataTransformationAgent -from data_formulator.agents.agent_sql_data_transform import SQLDataTransformationAgent +from data_formulator.agents.agent_data_transform import DataTransformationAgent +from data_formulator.agents.agent_data_rec import DataRecAgent + from data_formulator.agents.agent_data_load import DataLoadAgent from data_formulator.agents.agent_sort_data import SortDataAgent from data_formulator.agents.agent_data_clean import DataCleanAgent -from data_formulator.agents.agent_py_data_rec import PythonDataRecAgent -from data_formulator.agents.agent_sql_data_rec import SQLDataRecAgent from data_formulator.agents.agent_interactive_explore import InteractiveExploreAgent __all__ = [ - "ConceptDeriveAgent", - "PyConceptDeriveAgent", - "PythonDataTransformationAgent", - "SQLDataTransformationAgent", - "PythonDataRecAgent", - "SQLDataRecAgent", + "DataTransformationAgent", + "DataRecAgent", "DataLoadAgent", "SortDataAgent", "DataCleanAgent", "InteractiveExploreAgent", -] \ No newline at end of file +] diff --git a/py-src/data_formulator/agents/agent_code_explanation.py b/py-src/data_formulator/agents/agent_code_explanation.py index 73b90e5c..f67972f2 100644 --- a/py-src/data_formulator/agents/agent_code_explanation.py +++ b/py-src/data_formulator/agents/agent_code_explanation.py @@ -174,12 +174,13 @@ def extract_decade(date_str): class CodeExplanationAgent(object): - def __init__(self, client): + def __init__(self, client, workspace): self.client = client + self.workspace = workspace def run(self, input_tables, code, n=1): - data_summary = generate_data_summary(input_tables, include_data_samples=True) + data_summary = generate_data_summary(input_tables, workspace=self.workspace, include_data_samples=True) user_query = f"[CONTEXT]\n\n{data_summary}\n\n[CODE]\n\nhere is 
the transformation code: {code}\n\n[EXPLANATION]\n" diff --git a/py-src/data_formulator/agents/agent_concept_derive.py b/py-src/data_formulator/agents/agent_concept_derive.py deleted file mode 100644 index d7de8aab..00000000 --- a/py-src/data_formulator/agents/agent_concept_derive.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import os -import sys -import pandas as pd - -APP_ROOT = os.path.abspath('..') -sys.path.append(os.path.abspath(APP_ROOT)) - -from data_formulator.agents.agent_utils import generate_data_summary, field_name_to_ts_variable_name, extract_code_from_gpt_response, infer_ts_datatype - -import logging - -logger = logging.getLogger(__name__) - - -SYSTEM_PROMPT = '''You are a data scientist to help user to derive new column based on existing columns in a dataset. -Your job is to write a typescript function based on input data summary, instruction and output column name. -Complete a typescript function based off the [CONTEXT], [TEMPLATE] and [GOAL] provided, the function's input arguments are values from input columns, and the output is a value for the output column. -The function only operates on primitive types and it will be used by a map() function later to generate the new column. -The function should be as simple as possible. - -For example: - -[CONTEXT] - -Here are our datasets, here are their field summaries and samples: - -table_0 (us_covid_cases) fields: - Date -- type: object, values: 1/1/2021, 1/1/2022, 1/1/2023, ..., 9/8/2022, 9/9/2020, 9/9/2021, 9/9/2022 - Cases -- type: int64, values: -23999, -14195, -6940, ..., 1018935, 1032159, 1178403, 1433977 - -table_0 (us_covid_cases) sample: -``` -|Date|Cases -0|1/21/2020|1 -1|1/22/2020|0 -2|1/23/2020|0 -3|1/24/2020|1 -4|1/25/2020|1 -...... -``` - -[GOAL] - -extract month from Date - -[TEMPLATE] - -```typescript -(date : string) => { - // complete code here - return month -} -``` - -[OUTPUT] - -```typescript -(date: string) => { - const month = new Date(date).getMonth() + 1; - return month; -} -``` - -[CONTEXT] - -Here are our datasets, here are their field summaries and samples: - -table_0 (us_covid_cases) fields: - Date -- type: object, values: 1/1/2021, 1/1/2022, 1/1/2023, ..., 9/8/2022, 9/9/2020, 9/9/2021, 9/9/2022 - Cases -- type: int64, values: -23999, -14195, -6940, ..., 1018935, 1032159, 1178403, 1433977 - -table_0 (us_covid_cases) sample: -``` -|Date|Cases -0|1/21/2020|1 -1|1/22/2020|0 -2|1/23/2020|0 -3|1/24/2020|1 -4|1/25/2020|1 -...... -``` - -[GOAL] - -extract month from Date - -[TEMPLATE] - -```typescript -//extract month from Date -(date : string) => { - // complete code here - return month -} -``` - -[OUTPUT] - -```typescript -//extract month from Date -(date: string) => { - const month = new Date(date).getMonth() + 1; - return month; -} -``` - -[CONTEXT] - -Here are our datasets, here are their field summaries and samples: - -table_0 (student_exam) fields: - student -- type: int64, values: 1, 2, 3, ..., 997, 998, 999, 1000 - major -- type: object, values: liberal arts, science - math -- type: int64, values: 0, 8, 18, ..., 97, 98, 99, 100 - reading -- type: int64, values: 17, 23, 24, ..., 96, 97, 99, 100 - writing -- type: int64, values: 10, 15, 19, ..., 97, 98, 99, 100 - -table_0 (student_exam) sample: - -``` -|student|major|math|reading|writing -0|1|liberal arts|72|72|74 -1|2|liberal arts|69|90|88 -2|3|liberal arts|90|95|93 -3|4|science|47|57|44 -4|5|science|76|78|75 -...... 
-``` - -[GOAL] - -Derive average grade from writing, reading, math, grade should be A, B, C, D, F - -[TEMPLATE] - -```typescript -//Derive average grade from writing -(writing: number, reading: number, math: number) => { - // complete code here - return averageGrade -} -``` - -[OUTPUT] - -```typescript -//Derive average grade from writing, reading, math, grade should be A, B, C, D, F -(writing: number, reading: number, math: number): string => { - const average = (writing + reading + math) / 3; - if (average >= 90) { - return "A"; - } else if (average >= 80) { - return "B"; - } else if (average >= 70) { - return "C"; - } else if (average >= 60) { - return "D"; - } else { - return "F"; - } -} -``` -''' - -class ConceptDeriveAgent(object): - - def __init__(self, client): - self.client = client - - def run(self, input_table, input_fields, output_field, description, n=1): - """derive a new concept based on input table, input fields, and output field name, (and description) - """ - - data_summary = generate_data_summary([input_table], include_data_samples=True) - - input_fields_info = [{"name": name, "type": infer_ts_datatype(pd.DataFrame(input_table['rows']), name)} for name in input_fields] - - arg_string = ", ".join([f"{field_name_to_ts_variable_name(field['name'])} : {field['type']}" for field in input_fields_info]) - code_template = f"```typescript\n//{description}\n({arg_string}) => {{\n // complete code here\n return {field_name_to_ts_variable_name(output_field)}\n}}\n```" - - user_query = f"[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{description}\n\n[TEMPLATE]\n\n{code_template}\n\n[OUTPUT]\n" - - logger.info(user_query) - - messages = [{"role":"system", "content": SYSTEM_PROMPT}, - {"role":"user","content": user_query}] - - ###### the part that calls open_ai - response = self.client.get_completion(messages = messages) - - #log = {'messages': messages, 'response': response.model_dump(mode='json')} - - candidates = [] - for choice in response.choices: - - logger.info("\n=== cocept derive result ===>\n") - logger.info(choice.message.content + "\n") - - code_blocks = extract_code_from_gpt_response(choice.message.content + "\n", "typescript") - - if len(code_blocks) > 0: - result = {'status': 'ok', 'code': code_blocks[-1]} - else: - result = {'status': 'other error', 'content': 'unable to extract code from response'} - - result['dialog'] = [*messages, {"role": choice.message.role, "content": choice.message.content}] - result['agent'] = 'ConceptDeriveAgent' - - candidates.append(result) - - return candidates \ No newline at end of file diff --git a/py-src/data_formulator/agents/agent_data_load.py b/py-src/data_formulator/agents/agent_data_load.py index 250323b3..79d3e570 100644 --- a/py-src/data_formulator/agents/agent_data_load.py +++ b/py-src/data_formulator/agents/agent_data_load.py @@ -4,7 +4,10 @@ import json from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary -from data_formulator.agents.agent_sql_data_transform import sanitize_table_name, get_sql_table_statistics_str +from data_formulator.agents.semantic_types import ( + SEMANTIC_TYPE_CATEGORIES, + generate_semantic_types_prompt, +) import logging @@ -22,8 +25,8 @@ 3. provide a very short summary of the dataset. 
Types to consider include: string, number, date -Semantic types to consider include: Location, Decade, Year, Month, YearMonth, Day, Date, Time, DateTime, TimeRange, Range, Duration, Name, Percentage, String, Number +''' + generate_semantic_types_prompt() + ''' Sort order: @@ -32,9 +35,6 @@ - when the natural sort order is alphabetical or there is not natural sort order, there is no need to generate sort_order, examples: - Name, State, City, etc. -Special cases: -* sometimes, column name is year like "2020", "2021" but its content is not actually year (e.g., sales), in these cases, the semantic type of the column would not be Year! - Create a json object function based off the [DATA] provided. output should be in the format of: @@ -81,16 +81,16 @@ ```json { + "suggested_table_name": "income", "fields": { - "suggested_table_name": "income_json", - "name": {"type": "string", "semantic_type": "Location", "sort_order": null}, - "region": {"type": "string", "semantic_type": "String", "sort_order": ["northeast", "midwest", "south", "west", "other"]}, - "state_id": {"type": "number", "semantic_type": "Number", "sort_order": null}, + "name": {"type": "string", "semantic_type": "State", "sort_order": null}, + "region": {"type": "string", "semantic_type": "Region", "sort_order": ["northeast", "midwest", "south", "west", "other"]}, + "state_id": {"type": "number", "semantic_type": "ID", "sort_order": null}, "pct": {"type": "number", "semantic_type": "Percentage", "sort_order": null}, - "total": {"type": "number", "semantic_type": "Number", "sort_order": null}, + "total": {"type": "number", "semantic_type": "Count", "sort_order": null}, "group": {"type": "string", "semantic_type": "Range", "sort_order": ["<10000", "10000 to 14999", "15000 to 24999", "25000 to 34999", "35000 to 49999", "50000 to 74999", "75000 to 99999", "100000 to 149999", "150000 to 199999", "200000+"]} }, - "data summary": "The dataset contains information about income distribution across different states in the USA. It includes fields for state names, regions, state IDs, percentage of total income, total income, and income groups.", + "data summary": "Income distribution across US states, with percentage and count by income bracket." } ``` @@ -116,9 +116,9 @@ [OUTPUT] -``` +```json { - "suggested_table_name": "weather_seattle_atlanta", + "suggested_table_name": "weather", "fields": { "Date": { "type": "string", @@ -127,32 +127,35 @@ }, "City": { "type": "string", - "semantic_type": "Location", + "semantic_type": "City", "sort_order": null }, "Temperature": { "type": "number", - "semantic_type": "Number", + "semantic_type": "Temperature", "sort_order": null } }, - "data_summary": "This dataset contains weather information for the cities of Seattle and Atlanta. The fields include the date, city name, and temperature readings. The 'Date' field represents dates in a string format, the 'City' field represents city names, and the 'Temperature' field represents temperature values in integer format.", -}```''' + "data_summary": "Daily temperature readings for Seattle and Atlanta in 2020." 
+} +```''' class DataLoadAgent(object): - def __init__(self, client, conn): + def __init__(self, client, workspace): self.client = client - self.conn = conn + self.workspace = workspace def run(self, input_data, n=1): - if input_data['virtual']: - table_name = sanitize_table_name(input_data['name']) - table_summary_str = get_sql_table_statistics_str(self.conn, table_name, row_sample_size=5, field_sample_size=30) - data_summary = f"[TABLE {table_name}]\n\n{table_summary_str}" - else: - data_summary = generate_data_summary([input_data], include_data_samples=True, field_sample_size=30) + # Always use the unified generate_data_summary approach + # For virtual tables, workspace will find them; for in-memory tables, it uses rows + data_summary = generate_data_summary( + [input_data], + workspace=self.workspace, + include_data_samples=True, + field_sample_size=30 + ) user_query = f"[DATA]\n\n{data_summary}\n\n[OUTPUT]" diff --git a/py-src/data_formulator/agents/agent_py_data_rec.py b/py-src/data_formulator/agents/agent_data_rec.py similarity index 55% rename from py-src/data_formulator/agents/agent_py_data_rec.py rename to py-src/data_formulator/agents/agent_data_rec.py index c61cde63..266e6500 100644 --- a/py-src/data_formulator/agents/agent_py_data_rec.py +++ b/py-src/data_formulator/agents/agent_data_rec.py @@ -2,39 +2,61 @@ # Licensed under the MIT License. import json -import pandas as pd -from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary, extract_code_from_gpt_response -import data_formulator.py_sandbox as py_sandbox +from data_formulator.agents.agent_utils import extract_json_objects, extract_code_from_gpt_response, generate_data_summary import traceback +import pandas as pd + import logging logger = logging.getLogger(__name__) SYSTEM_PROMPT = '''You are a data scientist to help user to recommend data that will be used for visualization. -The user will provide you information about what visualization they would like to create, and your job is to recommend a transformed data that can be used to create the visualization and write a python function to transform the data. -The recommendation and transformation function should be based on the [CONTEXT] and [GOAL] provided by the user. +The user will provide you information about what visualization they would like to create, and your job is to recommend a transformed data that can be used to create the visualization and write a Python script to transform the data. +The recommendation and transformation function should be based on the [CONTEXT] and [GOAL] provided by the user. The [CONTEXT] shows what the current dataset is, and the [GOAL] describes what the user wants the data for. **Important:** - NEVER make assumptions or judgments about a person's gender, biological sex, sexuality, religion, race, nationality, ethnicity, political stance, socioeconomic status, mental health, invisible disabilities, medical conditions, personality type, social impressions, emotional state, and cognitive state. - NEVER create formulas that could be used to discriminate based on age. Ageism of any form (explicit and implicit) is strictly prohibited. -- If above issue occurs, generate columns with np.nan. - -Concretely, you should infer the appropriate data and create in the output section a python function based off the [CONTEXT] and [GOAL] in two steps: +- If above issue occurs, generate columns with NULL or np.nan. 
+ +**About the execution environment:** +- You can use BOTH DuckDB SQL and pandas operations in the same script +- The script will run in the workspace data directory where all files are located +- You can reference files directly by their filename (e.g., 'sales_data.parquet') +- **Allowed libraries:** pandas, numpy, duckdb, math, datetime, json, statistics, collections, re, sklearn, scipy, random, itertools, functools, operator, time +- **Not allowed:** matplotlib, plotly, seaborn, requests, subprocess, os, sys, io, or any other library not listed above. Do NOT import them — the sandbox will reject the import. +- File system access (open, write) and network access are also forbidden. + +**When to use DuckDB vs pandas:** +- For large datasets (parquet files with many rows): prefer DuckDB SQL for aggregations, filtering, joins, window functions, and groupby — DuckDB can process parquet files efficiently without loading all data into memory. +- For small datasets (even if stored as parquet): prefer pandas for readability and simplicity. +- Use pandas for: complex transformations, time series operations, ML features, reshaping. +- You can combine both: use DuckDB for initial data loading/filtering on large files, then pandas for complex operations. + +**Code structure:** +- The script should be standalone (no function wrapper) +- Import statements at the top +- Data loading using DuckDB or pandas +- Transformations combining SQL and pandas as needed +- Assign the final result to a variable (you will specify the variable name in JSON) + +Concretely, you should infer the appropriate data and create a Python script based off the [CONTEXT] and [GOAL] in two steps: 1. First, based on users' [GOAL]. Create a json object that represents the inferred user intent. The json object should have the following format: +```json { "mode": "" // string, one of "infer", "overview", "distribution", "summary", "forecast" "recap": "..." // string, a short summary of the user's goal. "display_instruction": "..." // string, the even shorter verb phrase describing the users' goal. "recommendation": "..." // string, explain why this recommendation is made "input_tables": [...] // string[], describe names of the input tables that will be used in the transformation. - "output_fields": [...] // string[], describe the desired output fields that the output data should have (i.e., the goal of transformed data), it's a good idea to preseve intermediate fields here - "chart_type": "" // string, one of "point", "bar", "line", "area", "heatmap", "group_bar", 'boxplot'. "chart_type" should either be inferred from user instruction, or recommend if the user didn't specify any. + "output_fields": [...] // string[], describe the desired output fields that the output data should have (i.e., the goal of transformed data), it's a good idea to preserve intermediate fields here + "chart_type": "" // string, one of "point", "bar", "line", "area", "heatmap", "group_bar", "boxplot", "worldmap", "usmap". "chart_type" should either be inferred from user instruction, or recommend if the user didn't specify any. "chart_encodings": { "x": "", "y": "", @@ -42,8 +64,13 @@ "size": "", "opacity": "", "facet": "", - } // object: map visualization channels (x, y, color, size, opacity, facet, etc.) to a subset of output fields, appropriate visual channels for different chart types are defined below. + "longitude": "", + "latitude": "" + } // object: map visualization channels (x, y, color, size, opacity, facet, longitude, latitude, etc.) 
to a subset of output fields, appropriate visual channels for different chart types are defined below. + "projection": "" // string (optional, only for worldmap/usmap): one of "mercator", "equalEarth", "naturalEarth1", "orthographic", "stereographic", "albersUsa", "conicEqualArea", "gnomonic", "azimuthalEquidistant". Default is "equalEarth" for worldmap, "albersUsa" for usmap. + "projection_center": [0, 0] // [longitude, latitude] (optional, only for worldmap): the center point of the map projection. Use to focus on specific regions, e.g., [105, 35] for China, [-98, 39] for USA, [10, 50] for Europe, [139, 36] for Japan. } +``` Concretely: - recap what the user's goal is in a short summary in "recap". @@ -52,12 +79,12 @@ - choose one of "distribution", "overview", "summary", "forecast" in "mode": * if it is "overview" and the data is in wide format, reshape it into long format. * if it is "distribution", select a few fields that would be interesting to visualize together. - * if it is "summary", calculate some aggregated statistics to show intresting facts of the data. + * if it is "summary", calculate some aggregated statistics to show interesting facts of the data. * if it is "forecast", concretize the x,y fields that will be used for forecasting and decide if it is about regression or forecasting. - describe the recommendation reason in "recommendation" - based on the recommendation, determine what is an ideal output data. Note, the output data must be in tidy format. - then suggest recommendations of chart encoding that should be used to create the visualization. - - "display_instruction" should be a short verb phrase describing the users' goal, it should be even shorter than "recap". + - "display_instruction" should be a short verb phrase describing the users' goal, it should be even shorter than "recap". - it would be a short verbal description of user intent as a verb phrase (<12 words). - generate based on "recap" and the suggested visualization, but don't need to mention the visualization details. - should capture key computation ideas: by reading the display, the user can understand the purpose and what's derived from the data. @@ -69,7 +96,7 @@ - determine "input_tables", the names of a subset of input tables from [CONTEXT] section that will be used to achieve the user's goal. - **IMPORTANT** Note that the Table 1 in [CONTEXT] section is the table the user is currently viewing, it should take precedence if the user refers to insights about the "current table". - At the same time, leverage table information to determine which tables are relevant to the user's goal and should be used. - - "chart_type" must be one of "point", "bar", "line", "area", "heatmap", "group_bar", "boxplot" + - "chart_type" must be one of "point", "bar", "line", "area", "heatmap", "group_bar", "boxplot", "worldmap", "usmap" - "chart_encodings" should specify which fields should be used to create the visualization - decide which visual channels should be used to create the visualization appropriate for the chart type. - point: x, y, color, size, facet @@ -80,48 +107,66 @@ - heatmap: x, y, color, facet - group_bar: x, y, color, facet - boxplot: x, y, color, facet + - worldmap: longitude, latitude, color, size + - usmap: longitude, latitude, color, size - note that all fields used in "chart_encodings" should be included in "output_fields". - all fields you need for visualizations should be transformed into the output fields! 
- "output_fields" should include important intermediate fields that are not used in visualization but are used for data transformation. - - typically only 2-3 fields should be used to create the visualization (x, y, color/size), facet use be added if it's a faceted visualization (totally 4 fields used). + - typically only 2-3 fields should be used to create the visualization (x, y, color/size), facet can be added if it's a faceted visualization. - Guidelines for choosing chart type and visualization fields: - Consider chart types as follows: - - (point) Scatter Plots: x,y: Quantitative/Categorical, color: Categorical (optional), size: Quantitative (optional for creating bubble chart), + - (point) Scatter Plots: x,y: Quantitative/Categorical, color: Categorical (optional), size: Quantitative (optional for creating bubble chart), - best for: Relationships, correlations, distributions, forecasting, regression analysis - scatter plots are good default way to visualize data when other chart types are not applicable. - use color to visualize points from different categories. - use size to visualize data points with an additional quantitative dimension of the data points. - - (histogram) Histograms: x: Quantitative/Categorical, color: Categorical (optional for creating grouped histogram), + - (histogram) Histograms: x: Quantitative/Categorical, color: Categorical (optional for creating grouped histogram), - best for: Distribution of a quantitative field - use x values directly if x values are categorical, and transform the data into bins if the field values are quantitative. - when color is specified, the histogram will be grouped automatically (items with the same x values will be grouped). - - (bar) Bar Charts: x: Categorical (nominal/ordinal), y: Quantitative, color: Categorical/Quantitative (for stacked bar chart / showing additional quantitative dimension), + - (bar) Bar Charts: x: Categorical (nominal/ordinal), y: Quantitative, color: Categorical/Quantitative (for stacked bar chart / showing additional quantitative dimension), - best for: Comparisons across categories - - use (bar) for simple bar chart or stacked bar chart (when it makes sense to add up Y values for each category with the same X value), + - use (bar) for simple bar chart or stacked bar chart (when it makes sense to add up Y values for each category with the same X value), - when color is specified, the bar will be stacked automatically (items with the same x values will be stacked). - note that when there are multiple rows in the data with same x values, the bar will be stacked automatically. - 1. consider to use an aggregated field for y values if the value is not suitable for stacking. - 2. consider to introduce facets so that each group is visualized in a separate bar. - (group_bar) for grouped bar chart, x: Categorical (nominal/ordinal), y: Quantitative, color: Categorical - - when color is specifed, bars from different groups will be grouped automatically. + - when color is specified, bars from different groups will be grouped automatically. - only use facet if the cardinality of color field is small (less than 5). 
- - (line) Line Charts: x: Temporal (preferred) or ordinal, y: Quantitative, color: Categorical (optional for creating multiple lines), + - (line) Line Charts: x: Temporal (preferred) or ordinal, y: Quantitative, color: Categorical (optional for creating multiple lines), - best for: Trends over time, continuous data, forecasting, regression analysis - note that when there are multiple rows in the data belong to the same group (same x and color values) but different y values, the line will not look correct. - consider to use an aggregated field for y values, or introduce facets so that each group is visualized in a separate line. - - (area) Area Charts: x: Temporal (preferred) or ordinal, y: Quantitative, color: Categorical (optional for creating stacked areas), + - (area) Area Charts: x: Temporal (preferred) or ordinal, y: Quantitative, color: Categorical (optional for creating stacked areas), - best for: Trends over time, continuous data - - (heatmap) Heatmaps: x,y: Categorical (you need to convert quantitative to nominal), color: Quantitative intensity, + - (heatmap) Heatmaps: x,y: Categorical (you need to convert quantitative to nominal), color: Quantitative intensity, - best for: Pattern discovery in matrix data - - (boxplot) Box plots: x: Categorical (nominal/ordinal), y: Quantitative, color: Categorical (optional for creating grouped boxplots), + - (boxplot) Box plots: x: Categorical (nominal/ordinal), y: Quantitative, color: Categorical (optional for creating grouped boxplots), - best for: Distribution of a quantitative field - use x values directly if x values are categorical, and transform the data into bins if the field values are quantitative. - when color is specified, the boxplot will be grouped automatically (items with the same x values will be grouped). + - (worldmap) World Map: longitude: Quantitative (geographic longitude -180 to 180), latitude: Quantitative (geographic latitude -90 to 90), color: Categorical/Quantitative (optional), size: Quantitative (optional) + - best for: Geographic data visualization on a world map + - use when the data contains geographic coordinates (longitude, latitude) for locations around the world + - the data must have longitude and latitude fields representing geographic coordinates + - color can be used to show categories (e.g., country, region) or quantitative values (e.g., population, sales) + - size can be used to show quantitative values (e.g., magnitude, count) + - example use cases: plotting cities, earthquakes, sales by location, etc. 
+ - projection options: "mercator", "equalEarth" (default), "naturalEarth1", "orthographic", "stereographic", "albers", "conicEqualArea" + - projection_center: set [longitude, latitude] to center the map on a specific region: + * China: [105, 35], USA: [-98, 39], Europe: [10, 50], Japan: [139, 36], India: [78, 22] + * Brazil: [-55, -10], Australia: [134, -25], Russia: [100, 60], South Africa: [25, -29] + - (usmap) US Map: longitude: Quantitative (geographic longitude), latitude: Quantitative (geographic latitude), color: Categorical/Quantitative (optional), size: Quantitative (optional) + - best for: Geographic data visualization focused on the United States + - use when the data is specifically about US locations + - uses albersUsa projection optimized for US geography (includes Alaska and Hawaii) + - the data must have longitude and latitude fields representing US geographic coordinates - facet channel is available for all chart types, it supports a categorical field with small cardinality to visualize the data in different facets. - if you really need additional legend fields: - you can use opacity for legend (support Quantitative and Categorical). - - visualization fields require tidy data. - - similar to VegaLite and ggplot2 so that each field is mapped to a visualization axis or legend. + - visualization fields require tidy data. + - similar to VegaLite and ggplot2 so that each field is mapped to a visualization axis or legend. - consider data transformations if you want to visualize multiple fields together: - exapmle 1: suggest reshaping the data into long format in data transformation description (if these fields are all of the same type, e.g., they are all about sales, price, two columns about min/max-values, etc. don't mix different types of fields in reshaping) so we can visualize multiple fields as categories or in different facets. - exapmle 2: calculate some derived fields from these fields(e.g., correlation, difference, profit etc.) in data transformation description to visualize them in one visualization. @@ -141,37 +186,72 @@ - when the user asks for clustering: - the output should be a long format table where actual x, y pairs with a third column "cluster_id" that indicates the cluster id of the data point. - the recommended chart should be scatter plot (quantitative x, y) - - 2. Then, write a python function based on the inferred goal, the function input is a dataframe "df" (or multiple dataframes based on tables presented in the [CONTEXT] section) and the output is the transformed dataframe "transformed_df". -"transformed_df" should contain all "output_fields" from the refined user intent in the json object. -The python function must follow the template provided in [TEMPLATE]. The function should be as simple as possible and easily readable. -If there is no data transformation needed based on "output_fields", the transformation function can simply "return df". + - specify "output_variable", the name of the Python variable that will contain the final DataFrame result. + The name should be descriptive and reflect the data content (e.g., "sales_by_region", "monthly_trends", "customer_segments"). + Avoid generic names like "result_df", "output", or "data". Use snake_case naming convention. + +2. Then, write a Python script based on the inferred goal. The script should transform input data into the desired output table containing all "output_fields" from the refined goal. +The script should be as simple as possible and easily readable. 
If there is no data transformation needed based on "output_fields", the script can simply load and assign the data. -[TEMPLATE] +3. The output must only contain two items: + - a json object (wrapped in ```json```) representing the refined goal (including "mode", "recommendation", "output_fields", "chart_type", "chart_encodings", "output_variable") + - a python code block (wrapped in ```python```) representing the transformation script, do not add any extra text explanation. + +**Example data loading patterns:** + +```python +# Option 1: Load with DuckDB SQL +import pandas as pd +import duckdb + +df = duckdb.sql(""" + SELECT + student, + major, + (math + reading + writing) / 3.0 AS average_score, + RANK() OVER (ORDER BY (math + reading + writing) / 3.0 DESC) AS rank + FROM read_parquet('student_exam.parquet') + ORDER BY average_score DESC +""").df() + +result_df = df +``` ```python +# Option 2: Load with pandas then transform import pandas as pd -import collections -import numpy as np -# from sklearn import ... # import from sklearn if you need it. -def transform_data(df1, df2, ...): - # complete the template here - return transformed_df +df = pd.read_parquet('student_exam.parquet') +df['average_score'] = (df['math'] + df['reading'] + df['writing']) / 3.0 +df['rank'] = df['average_score'].rank(ascending=False, method='min') +df = df.sort_values('average_score', ascending=False) + +result_df = df[['student', 'major', 'average_score', 'rank']] ``` -note: -- decide the function signature based on the number of tables you decided in the previous step "input_tables": - - if you decide there will only be one input table, then function signature should be `def transform_data(df1)` - - if you decided there will be k input tables, then function signature should be `def transform_data(df_1, df_2, ..., df_k)`. - - instead of using generic names like df1, df2, ..., try to use intuitive table names for function arguments, for example, if you have input_tables: ["City", "Weather"]`, you can use `transform_data(df_city, df_weather)` to refer to the two dataframes. - - **VERY IMPORTANT** the number of arguments in the function signature must be the same as the number of tables provided in "input_tables", and the order of arguments must match the order of tables provided in "input_tables". -- datetime objects handling: - - if the output field is year, convert it to number, if it is year-month / year-month-day, convert it to string object (e.g., "2020-01" / "2020-01-01"). - - if the output is time only: convert hour to number if it's just the hour (e.g., 10), but convert hour:min or h:m:s to string object (e.g., "10:30", "10:30:45") - - never return datetime object directly, convert it to either number (if it only contains year) or string so it's readable. - - 3. The output must only contain a json object representing inferred user intent and a python code block representing the transformation code, do not add any extra text explanation. +```python +# Option 3: Hybrid - DuckDB for aggregation, pandas for reshaping +import pandas as pd +import duckdb + +# Aggregate with DuckDB +df = duckdb.sql(""" + SELECT category, SUM(value) as total + FROM read_parquet('data.parquet') + GROUP BY category +""").df() + +# Reshape with pandas +result_df = df.pivot(columns='category', values='total') +``` + +**Important notes:** +- In DuckDB, escape single quotes by doubling them ('') not with backslash (\') +- DuckDB does NOT support Unicode escape sequences like \\u0400-\\u04FF. 
Use character ranges directly: [а-яА-Я] for Cyrillic +- When using date/time functions in DuckDB, cast date columns to explicit types to avoid ambiguity: + * Use `CAST(date_column AS DATE)` for date operations + * Use `CAST(datetime_column AS TIMESTAMP)` for timestamp operations +- For complex datetime operations, consider loading data first then using pandas datetime functions ''' example = """ @@ -181,23 +261,24 @@ def transform_data(df1, df2, ...): Here are our datasets, here are their field summaries and samples: -df1 (student_exam) fields: - student -- type: int64, values: 1, 2, 3, ..., 997, 998, 999, 1000 - major -- type: object, values: liberal arts, science - math -- type: int64, values: 0, 8, 18, ..., 97, 98, 99, 100 - reading -- type: int64, values: 17, 23, 24, ..., 96, 97, 99, 100 - writing -- type: int64, values: 10, 15, 19, ..., 97, 98, 99, 100 +## Table 1: student_exam (student_exam.parquet) +(1000 rows × 5 columns) -df1 (student_exam) sample: +### Schema (5 fields) + - student -- type: int64, values: 1, 2, 3, ..., 997, 998, 999, 1000 + - major -- type: object, values: liberal arts, science + - math -- type: int64, values: 0, 8, 18, ..., 97, 98, 99, 100 + - reading -- type: int64, values: 17, 23, 24, ..., 96, 97, 99, 100 + - writing -- type: int64, values: 10, 15, 19, ..., 97, 98, 99, 100 +### Sample Data (first 5 rows) ``` -|student|major|math|reading|writing -0|1|liberal arts|72|72|74 -1|2|liberal arts|69|90|88 -2|3|liberal arts|90|95|93 -3|4|science|47|57|44 -4|5|science|76|78|75 -...... + student major math reading writing +0 1 liberal arts 72 72 74 +1 2 liberal arts 69 90 88 +2 3 liberal arts 90 95 93 +3 4 science 47 57 44 +4 5 science 76 78 75 ``` [GOAL] @@ -206,121 +287,146 @@ def transform_data(df1, df2, ...): [OUTPUT] -{ +```json +{ + "input_tables": ["student_exam"], "recap": "Rank students based on their average scores", - "display_instruction": "Rank students by average scores", + "display_instruction": "Rank students by **average scores**", "mode": "infer", - "recommendation": "To rank students based on their average scores, we need to calculate the average score for each student, then sort the data, and finally assign a rank to each student based on their average score.", - "input_tables": ["student_exam"], - "output_fields": ["student", "major", "average_score", "rank"], - "chart_type": "bar", - "chart_encodings": {"x": "student", "y": "average_score"}, -} + "recommendation": "To rank students based on their average scores, we need to calculate the average score for each student, then sort the data, and finally assign a rank to each student based on their average score.", + "output_fields": ["student", "major", "average_score", "rank"], + "chart_type": "bar", + "chart_encodings": {"x": "student", "y": "average_score"}, + "output_variable": "student_rankings" +} +``` ```python -import pandas as pd -import collections -import numpy as np - -def transform_data(df): - df['average_score'] = df[['math', 'reading', 'writing']].mean(axis=1) - df = df.sort_values(by='average_score', ascending=False) - df['rank'] = df['average_score'].rank(ascending=False, method='dense').astype(int) - transformed_df = df[['student', 'major', 'average_score', 'rank']] - return transformed_df +import pandas as pd +import duckdb + +# Use DuckDB for efficient ranking and aggregation +student_rankings = duckdb.sql(''' + SELECT + student, + major, + (math + reading + writing) / 3.0 AS average_score, + RANK() OVER (ORDER BY (math + reading + writing) / 3.0 DESC) AS rank + FROM 
read_parquet('student_exam.parquet') + ORDER BY average_score DESC +''').df() ``` """ -class PythonDataRecAgent(object): - def __init__(self, client, system_prompt=None, exec_python_in_subprocess=False, agent_coding_rules=""): +class DataRecAgent(object): + + def __init__(self, client, workspace, system_prompt=None, agent_coding_rules="", max_display_rows=5000): self.client = client - + self.workspace = workspace + self.max_display_rows = max_display_rows + # Incorporate agent coding rules into system prompt if provided if system_prompt is not None: self.system_prompt = system_prompt else: base_prompt = SYSTEM_PROMPT if agent_coding_rules and agent_coding_rules.strip(): - self.system_prompt = base_prompt + "\n\n[AGENT CODING RULES]\nPlease follow these rules when generating code. Note: if the user instruction conflicts with these rules, you should priortize user instructions.\n\n" + agent_coding_rules.strip() + self.system_prompt = base_prompt + "\n\n[AGENT CODING RULES]\nPlease follow these rules when generating code. Note: if the user instruction conflicts with these rules, you should prioritize user instructions.\n\n" + agent_coding_rules.strip() else: self.system_prompt = base_prompt - - self.exec_python_in_subprocess = exec_python_in_subprocess def process_gpt_response(self, input_tables, messages, response): - """process gpt response to handle execution""" - - #log = {'messages': messages, 'response': response.model_dump(mode='json')} + """Process GPT response to handle Python code execution""" if isinstance(response, Exception): result = {'status': 'other error', 'content': str(response.body)} return [result] - + candidates = [] for choice in response.choices: - + logger.info("\n=== Data recommendation result ===>\n") logger.info(choice.message.content + "\n") - + json_blocks = extract_json_objects(choice.message.content + "\n") if len(json_blocks) > 0: refined_goal = json_blocks[0] + output_variable = refined_goal.get('output_variable', 'result_df') else: - refined_goal = { 'mode': "", 'recommendation': "", 'input_tables': [], 'output_fields': [], 'chart_encodings': {}, 'chart_type': "" } + refined_goal = {'mode': "", 'recommendation': "", 'output_fields': [], 'chart_encodings': {}, 'chart_type': "", 'output_variable': 'result_df'} + output_variable = 'result_df' code_blocks = extract_code_from_gpt_response(choice.message.content + "\n", "python") if len(code_blocks) > 0: - code_str = code_blocks[-1] + code = code_blocks[-1] try: - # Check if input_tables is available - if not input_tables: - result = {'status': 'error', 'code': code_str, 'content': "No input tables available."} - else: - # Determine which tables to use based on refined_goal - if 'input_tables' in refined_goal and isinstance(refined_goal['input_tables'], list) and len(refined_goal['input_tables']) > 0: - # Use only specified tables - validate all exist - table_name_map = {t['name']: t for t in input_tables} - tables_to_use = [] - missing_tables = [] - - for table_name in refined_goal['input_tables']: - if table_name in table_name_map: - tables_to_use.append(table_name_map[table_name]) - else: - missing_tables.append(table_name) - - # Error if any specified table is missing - if missing_tables: - available_table_names = [t['name'] for t in input_tables] - result = {'status': 'error', 'code': code_str, 'content': f"Table(s) '{', '.join(missing_tables)}' specified in 'input_tables' not found. 
Available tables: {', '.join(available_table_names)}"} - else: - result = py_sandbox.run_transform_in_sandbox2020(code_str, [pd.DataFrame.from_records(t['rows']) for t in tables_to_use], self.exec_python_in_subprocess) + # Import the sandbox execution function + from data_formulator.sandbox.py_sandbox import run_unified_transform_in_sandbox + from flask import current_app + + # Get exec_python_in_subprocess setting + exec_python_in_subprocess = current_app.config.get('CLI_ARGS', {}).get('exec_python_in_subprocess', False) + + # Execute the Python script in sandbox + execution_result = run_unified_transform_in_sandbox( + code=code, + workspace_path=self.workspace._path, + output_variable=output_variable, + exec_python_in_subprocess=exec_python_in_subprocess + ) + + if execution_result['status'] == 'ok': + full_df = execution_result['content'] + row_count = len(full_df) + + # Generate unique table name for workspace storage + output_table_name = self.workspace.get_fresh_name(f"d-{output_variable}") + + # Write full result to workspace as parquet + self.workspace.write_parquet(full_df, output_table_name) + + # Limit rows for response payload + if row_count > self.max_display_rows: + query_output = full_df.head(self.max_display_rows) else: - # No input_tables specified in refined_goal, use all input_tables - result = py_sandbox.run_transform_in_sandbox2020(code_str, [pd.DataFrame.from_records(t['rows']) for t in input_tables], self.exec_python_in_subprocess) - - result['code'] = code_str - - if result['status'] == 'ok': - result_df = result['content'] - result['content'] = { - 'rows': json.loads(result_df.to_json(orient='records')), + query_output = full_df + + # Remove duplicate columns to avoid orient='records' error + query_output = query_output.loc[:, ~query_output.columns.duplicated()] + + result = { + "status": "ok", + "code": code, + "content": { + 'rows': json.loads(query_output.to_json(orient='records')), + 'virtual': { + 'table_name': output_table_name, + 'row_count': row_count + } + }, } else: - logger.info(result['content']) + # Execution error + error_message = execution_result.get('content', execution_result.get('error_message', 'Unknown error')) + result = { + 'status': 'error', + 'code': code, + 'content': error_message + } + except Exception as e: - logger.warning('other error:') + logger.warning('Error occurred during code execution:') error_message = traceback.format_exc() logger.warning(error_message) - result = {'status': 'other error', 'code': code_str, 'content': f"Unexpected error executing the code, please try again."} + result = {'status': 'other error', 'code': code, 'content': f"Unexpected error: {error_message}"} else: result = {'status': 'error', 'code': "", 'content': "No code block found in the response. 
The model is unable to generate code to complete the task."} - + result['dialog'] = [*messages, {"role": choice.message.role, "content": choice.message.content}] - result['agent'] = 'PythonDataRecAgent' + result['agent'] = 'DataRecAgent' result['refined_goal'] = refined_goal candidates.append(result) @@ -333,16 +439,21 @@ def process_gpt_response(self, input_tables, messages, response): logger.info(f"## {key}:\n{value}") return candidates - - def run(self, input_tables, description, n=1, prev_messages: list[dict] = []): - data_summary = generate_data_summary(input_tables, include_data_samples=True) + def run(self, input_tables, description, n=1, prev_messages: list[dict] = []): + """ + Args: + input_tables: list[dict], each dict contains 'name' (table name in workspace) and 'rows' + description: str, the description of what the user wants + n: int, the number of candidates + prev_messages: list[dict], the previous messages + """ + # Generate data summary with file references + data_summary = generate_data_summary(input_tables, workspace=self.workspace) user_query = f"[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{description}" - if len(prev_messages) > 0: - user_query = f"The user wants a new recommendation based off the following updated context and goal:\n\n[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{description}" logger.info(user_query) @@ -353,27 +464,32 @@ def run(self, input_tables, description, n=1, prev_messages: list[dict] = []): messages = [{"role":"system", "content": self.system_prompt}, *filtered_prev_messages, {"role":"user","content": user_query}] - + response = self.client.get_completion(messages = messages) - + return self.process_gpt_response(input_tables, messages, response) - + def followup(self, input_tables, dialog, latest_data_sample, new_instruction: str, n=1): - """extend the input data (in json records format) to include new fields - latest_data_sample: the latest data sample that the user is working on, it's a json object that contains the data sample of the current table - new_instruction: the new instruction that the user wants to add to the latest data sample """ - + Followup recommendation based on previous dialog and new instruction. + + Args: + input_tables: list of input tables + dialog: previous conversation history + latest_data_sample: sample of the latest transformation result + new_instruction: new user instruction for followup + n: number of candidates + """ logger.info(f"GOAL: \n\n{new_instruction}") - # get the current table name + # Format sample data sample_data_str = pd.DataFrame(latest_data_sample).head(10).to_string() + '\n......' 
- messages = [*dialog, - {"role":"user", - "content": f"This is the result from the latest python code:\n\n{sample_data_str}\n\nUpdate the code above based on the following instruction:\n\n{new_instruction}"}] + messages = [*dialog, + {"role":"user", + "content": f"This is the result from the latest transformation:\n\n{sample_data_str}\n\nUpdate the Python script above based on the following instruction:\n\n{new_instruction}"}] response = self.client.get_completion(messages = messages) - return self.process_gpt_response(input_tables, messages, response) \ No newline at end of file + return self.process_gpt_response(input_tables, messages, response) diff --git a/py-src/data_formulator/agents/agent_data_transform.py b/py-src/data_formulator/agents/agent_data_transform.py new file mode 100644 index 00000000..baa6ac12 --- /dev/null +++ b/py-src/data_formulator/agents/agent_data_transform.py @@ -0,0 +1,429 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import json +import os + +from data_formulator.agents.agent_utils import extract_json_objects, extract_code_from_gpt_response +import pandas as pd + +import logging +import re +# Replace/update the logger configuration +logger = logging.getLogger(__name__) + +SYSTEM_PROMPT = '''You are a data scientist to help user to transform data that will be used for visualization. +The user will provide you information about what data would be needed, and your job is to create a Python script based on the input data summary, transformation instruction and expected fields. +The users' instruction includes "chart_type" and "chart_encodings" that describe the visualization they want, and natural language instructions "goal" that describe what data is needed. + +**Important:** +- NEVER make assumptions or judgments about a person's gender, biological sex, sexuality, religion, race, nationality, ethnicity, political stance, socioeconomic status, mental health, invisible disabilities, medical conditions, personality type, social impressions, emotional state, and cognitive state. +- NEVER create formulas that could be used to discriminate based on age. Ageism of any form (explicit and implicit) is strictly prohibited. +- If above issue occurs, generate columns with NULL or np.nan. + +**About the execution environment:** +- You can use BOTH DuckDB SQL and pandas operations in the same script +- The script will run in the workspace data directory where all files are located +- You can reference files directly by their filename (e.g., 'sales_data.parquet') +- **Allowed libraries:** pandas, numpy, duckdb, math, datetime, json, statistics, collections, re, sklearn, scipy, random, itertools, functools, operator, time +- **Not allowed:** matplotlib, plotly, seaborn, requests, subprocess, os, sys, io, or any other library not listed above. Do NOT import them — the sandbox will reject the import. +- File system access (open, write) and network access are also forbidden. + +**When to use DuckDB vs pandas:** +- For large datasets (parquet files with many rows): prefer DuckDB SQL for aggregations, filtering, joins, window functions, and groupby — DuckDB can process parquet files efficiently without loading all data into memory. +- For small datasets (even if stored as parquet): prefer pandas for readability and simplicity. +- Use pandas for: complex transformations, time series operations, ML features, reshaping. +- You can combine both: use DuckDB for initial data loading/filtering on large files, then pandas for complex operations. 
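+
+A compact sketch of that combined pattern (the file name `orders.parquet` and its columns are illustrative placeholders):
+
+```python
+import duckdb
+import pandas as pd
+
+# Filter the large parquet file inside DuckDB so only the needed rows reach pandas
+orders = duckdb.sql("""
+    SELECT order_date, region, amount
+    FROM read_parquet('orders.parquet')
+    WHERE amount > 0
+""").df()
+
+# Continue in pandas for the more intricate reshaping
+orders['order_month'] = pd.to_datetime(orders['order_date']).dt.strftime('%Y-%m')
+monthly_totals = orders.groupby(['order_month', 'region'], as_index=False)['amount'].sum()
+```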
+
+**Code structure:**
+- The script should be standalone (no function wrapper)
+- Import statements at the top
+- Data loading using DuckDB or pandas
+- Transformations combining SQL and pandas as needed
+- Assign the final result to a variable (you will specify the variable name in JSON)
+
+Concretely, you should first refine users' goal and then create a Python script in the output section based off the [CONTEXT] and [GOAL]:
+
+    1. First, refine users' [GOAL]. The main objective in this step is to check if "chart_type" and "chart_encodings" provided by the user are sufficient to achieve their "goal". Concretely:
+    - based on the user's "goal" and "chart_type" and "chart_encodings", elaborate the goal into a "detailed_instruction".
+    - determine "input_tables", the names of a subset of input tables from [CONTEXT] section that will be used to achieve the user's goal.
+        - **IMPORTANT** Note that the Table 1 in [CONTEXT] section is the table the user is currently viewing, it should take precedence if the user refers to insights about the "current table".
+        - At the same time, leverage table information to determine which tables are relevant to the user's goal and should be used.
+    - "display_instruction" is a short verb phrase describing the users' goal.
+        - it would be a short verbal description of user intent as a verb phrase (<12 words).
+        - generate it based on detailed_instruction and the suggested chart_type and chart_encodings, but don't need to mention the chart details.
+        - should capture key computation ideas: by reading the display, the user can understand the purpose and what's derived from the data.
+        - if the user specification follows up the previous instruction, the 'display_instruction' should only describe how it builds up the previous instruction without repeating information from previous steps.
+        - the phrase can be presented in different styles, e.g., question (what's xxx), instruction (show xxx), description, etc.
+        - if you mention column names from the input or the output data, highlight the text in **bold**.
+            * the column can either be a column in the input data, or a new column that will be computed in the output data.
+            * the mention doesn't have to be an exact match, it can be semantically matching, e.g., if you mentioned "average score" in the text while the column to be computed is "Avg_Score", you should still highlight "**average score**" in the text.
+    - determine "output_fields", the desired fields that the output data should have to achieve the user's goal, it's a good idea to include intermediate fields here.
+    - then decide "chart_encodings", which maps visualization channels (x, y, color, size, opacity, facet, etc.) to a subset of "output_fields" that will be visualized,
+        - the "chart_encodings" should be created to support the user's "chart_type".
+        - first, determine whether the user has provided sufficient fields in "chart_encodings" that are needed to achieve their goal:
+            - if the user's "chart_encodings" are sufficient, simply copy it.
+            - if the user didn't provide sufficient fields in "chart_encodings", add missing fields in "chart_encodings" (order them based on whether the field will be used in x,y axes or legends);
+        - "chart_encodings" should only include fields that will be visualized (do not include other intermediate fields from "output_fields")
+        - when adding new fields to "chart_encodings", be efficient and add only a minimal number of fields that are needed to achieve the user's goal.
+ - generally, the total number of fields in "chart_encodings" should be no more than 3 for x,y,legend. + - if the user's "chart_encodings" is sufficient but can be optimized, you can reorder encodings to visualize the data more effectively. + - sometimes, user may provide instruction to update visualizations fields they provided. You should leverage the user's goal to resolve the conflict and decide the final "chart_encodings" + - e.g., they may mention "use B metric instead" while A metric is in provided fields, in this case, you should update "chart_encodings" to update A metric with B metric. + - if the user provides latitude and longitude as visual channels, use "latitude" and "longitude" as visual channels in "chart_encodings" as opposed to "x" and "y". + - guide on statistical analysis: + - when the user asks for forecasting or regression analysis, you should consider the following: + - the output should be a long format table where actual x, y pairs and predicted x, y pairs are included in the X, Y columns, they are differentiated with a third column "is_predicted". + - i.e., if the user ask for forecasting based on two columns T and Y, the output should be three columns: T, Y, is_predicted, where + - T, Y columns contain BOTH original values from the data and predicted values from the data. + - is_predicted is a boolean field to indicate whether the x, y pairs are original values from the data or predicted / regression values from the data. + - the recommended chart should be line chart (time series) or scatter plot (quantitative x, y) + - if the user asks for forecasting, it's good to include predicted x, y pairs for both x in the original data and future x values (i.e., combine regression and forecasting results) + - in this case, is_predicted should be of three values 'original', 'regression', 'forecasting' + - when the user asks for clustering: + - the output should be a long format table where actual x, y pairs with a third column "cluster_id" that indicates the cluster id of the data point. + - the recommended chart should be scatter plot (quantitative x, y) + - specify "output_variable", the name of the Python variable that will contain the final DataFrame result. + The name should be descriptive and reflect the data content (e.g., "sales_by_region", "monthly_trends", "customer_segments"). + Avoid generic names like "result_df", "output", or "data". Use snake_case naming convention. + + Prepare the result in the following json format: + +```json +{ + "input_tables": ["student_exam"], + "detailed_instruction": "...", // string, elaborate user instruction with details + "display_instruction": "...", // string, the short verb phrase describing the users' goal + "output_fields": [...], // string[], describe the desired output fields that the output data should have based on the user's goal + "chart_encodings": { + "x": "", + "y": "", + "color": "", + "size": "", + "opacity": "", + "facet": "", + ... // other visualization channels user used + }, // object: map visualization channels (x, y, color, size, opacity, facet, etc.) to a subset of "output_fields" that will be visualized + "output_variable": "...", // string, the name of the Python variable containing the final result. + // Should be descriptive and informative (e.g., "sales_by_region", "monthly_revenue", "top_10_products"), + // not generic names like "result_df" or "output". Use snake_case. + "reason": "..." // string, explain why this refinement is made +} +``` + + 2. 
Then, write a Python script based on the refined goal. The script should transform input data into the desired output table containing all "output_fields" from the refined goal. +The script should be as simple as possible and easily readable. If there is no data transformation needed based on "output_fields", the script can simply load and assign the data. + + 3. The output must only contain two items: + - a json object (wrapped in ```json```) representing the refined goal (including "detailed_instruction", "output_fields", "chart_encodings", "output_variable" and "reason") + - a python code block (wrapped in ```python```) representing the transformation script, do not add any extra text explanation. + +**Datetime handling notes:** +- If the output field is year, convert it to number. If it is year-month / year-month-day, convert it to string (e.g., "2020-01" / "2020-01-01"). +- If the output is time only: convert hour to number if it's just the hour (e.g., 10), but convert hour:min or h:m:s to string (e.g., "10:30", "10:30:45"). +- Never return datetime objects directly; convert to either number (if it only contains year) or string so it's readable. + +**Example data loading patterns:** + +```python +# Option 1: Load with DuckDB SQL +import pandas as pd +import duckdb + +df = duckdb.sql(""" + SELECT + date, + SUM(sales) as total_sales + FROM read_parquet('sales_data.parquet') + GROUP BY date +""").df() + +# Option 2: Load with pandas +import pandas as pd +df = pd.read_parquet('sales_data.parquet') + +# Option 3: Hybrid - DuckDB for aggregation, pandas for time series +import pandas as pd +import duckdb + +df = duckdb.sql(""" + SELECT date, SUM(value) as total + FROM read_parquet('data.parquet') + GROUP BY date +""").df() + +df['rolling_avg'] = df['total'].rolling(7).mean() +result_df = df +``` + +**Important notes:** +- In DuckDB, escape single quotes by doubling them ('') not with backslash (\') +- DuckDB does NOT support Unicode escape sequences like \\u0400-\\u04FF. 
Use character ranges directly: [а-яА-Я] for Cyrillic +- When using date/time functions in DuckDB, cast date columns to explicit types to avoid ambiguity: + * Use `CAST(date_column AS DATE)` for date operations + * Use `CAST(datetime_column AS TIMESTAMP)` for timestamp operations + * Example: `CAST(strftime('%Y', CAST(date_column AS DATE)) AS INTEGER) AS year` +- For complex datetime operations, consider loading data first then using pandas datetime functions +''' + +EXAMPLE=''' +[CONTEXT] + +Here are 1 dataset with their summaries: + +## Table 1: weather_seattle_atlanta (weather_seattle_atlanta.parquet) +(548 rows × 3 columns) + +### Schema (3 fields) + - Date -- type: VARCHAR, values: 1/1/2020, 1/10/2020, 1/11/2020, ..., 9/7/2020, 9/8/2020, 9/9/2020 + - City -- type: VARCHAR, values: Atlanta, Seattle + - Temperature -- type: INTEGER, range: [30, 86] + +### Sample Data (first 5 rows) +``` + Date City Temperature +0 1/1/2020 Seattle 51 +1 1/1/2020 Atlanta 45 +2 1/2/2020 Seattle 45 +3 1/2/2020 Atlanta 47 +4 1/3/2020 Seattle 48 +``` + +[GOAL] + +{ + "instruction": "create a scatter plot with seattle and atlanta temperatures on x,y axes, color points by which city is warmer", + "chart_type": "scatter", + "chart_encodings": {"x": "Seattle Temperature", "y": "Atlanta Temperature", "color": "Warmer City"} +} + +[OUTPUT] + +```json +{ + "input_tables": ["weather_seattle_atlanta"], + "detailed_instruction": "Create a scatter plot to compare Seattle and Atlanta temperatures with Seattle temperatures on the x-axis and Atlanta temperatures on the y-axis. Color the points by which city is warmer.", + "display_instruction": "Compare **Seattle** and **Atlanta** temperatures", + "output_fields": ["Date", "Seattle Temperature", "Atlanta Temperature", "Warmer City"], + "chart_encodings": {"x": "Seattle Temperature", "y": "Atlanta Temperature", "color": "Warmer City"}, + "output_variable": "city_temp_comparison", + "reason": "To compare Seattle and Atlanta temperatures, we need to pivot the data to have separate temperature columns for each city, then compute which city is warmer." +} +``` + +```python +import pandas as pd +import duckdb + +# Use DuckDB for pivot operation +city_temp_comparison = duckdb.sql(""" + WITH pivoted AS ( + SELECT + Date, + MAX(CASE WHEN City = 'Seattle' THEN Temperature END) AS "Seattle Temperature", + MAX(CASE WHEN City = 'Atlanta' THEN Temperature END) AS "Atlanta Temperature" + FROM read_parquet('weather_seattle_atlanta.parquet') + GROUP BY Date + ) + SELECT + Date, + "Seattle Temperature", + "Atlanta Temperature", + CASE WHEN "Seattle Temperature" > "Atlanta Temperature" THEN 'Seattle' ELSE 'Atlanta' END AS "Warmer City" + FROM pivoted +""").df() +``` +''' + + +class DataTransformationAgent(object): + + def __init__(self, client, workspace, system_prompt=None, agent_coding_rules="", max_display_rows=5000): + self.client = client + self.workspace = workspace + self.max_display_rows = max_display_rows + + # Incorporate agent coding rules into system prompt if provided + if system_prompt is not None: + self.system_prompt = system_prompt + else: + base_prompt = SYSTEM_PROMPT + if agent_coding_rules and agent_coding_rules.strip(): + self.system_prompt = base_prompt + "\n\n[AGENT CODING RULES]\nPlease follow these rules when generating code. 
Note: if the user instruction conflicts with these rules, you should prioritize user instructions.\n\n" + agent_coding_rules.strip() + else: + self.system_prompt = base_prompt + + + def process_gpt_response(self, response, messages): + """Process GPT response to handle Python code execution""" + + if isinstance(response, Exception): + result = {'status': 'other error', 'content': str(response.body)} + return [result] + + candidates = [] + for choice in response.choices: + logger.info("=== Python script result ===>") + logger.info(choice.message.content + "\n") + + json_blocks = extract_json_objects(choice.message.content + "\n") + if len(json_blocks) > 0: + refined_goal = json_blocks[0] + output_variable = refined_goal.get('output_variable', 'result_df') + else: + refined_goal = {'chart_encodings': {}, 'instruction': '', 'reason': '', 'output_variable': 'result_df'} + output_variable = 'result_df' + + code_blocks = extract_code_from_gpt_response(choice.message.content + "\n", "python") + + if len(code_blocks) > 0: + code = code_blocks[-1] + + try: + # Import the sandbox execution function + from data_formulator.sandbox.py_sandbox import run_unified_transform_in_sandbox + from flask import current_app + + # Get exec_python_in_subprocess setting + exec_python_in_subprocess = current_app.config.get('CLI_ARGS', {}).get('exec_python_in_subprocess', False) + + # Execute the Python script in sandbox + execution_result = run_unified_transform_in_sandbox( + code=code, + workspace_path=self.workspace._path, + output_variable=output_variable, + exec_python_in_subprocess=exec_python_in_subprocess + ) + + if execution_result['status'] == 'ok': + full_df = execution_result['content'] + row_count = len(full_df) + + # Generate unique table name for workspace storage + output_table_name = self.workspace.get_fresh_name(f"d-{output_variable}") + + # Write full result to workspace as parquet + self.workspace.write_parquet(full_df, output_table_name) + + # Limit rows for response payload + if row_count > self.max_display_rows: + query_output = full_df.head(self.max_display_rows) + else: + query_output = full_df + + # Remove duplicate columns to avoid orient='records' error + query_output = query_output.loc[:, ~query_output.columns.duplicated()] + + result = { + "status": "ok", + "code": code, + "content": { + 'rows': json.loads(query_output.to_json(orient='records')), + 'virtual': { + 'table_name': output_table_name, + 'row_count': row_count + } + }, + } + else: + # Execution error + result = { + 'status': 'error', + 'code': code, + 'content': execution_result['content'] + } + + except Exception as e: + logger.warning('Error occurred during code execution:') + logger.warning(f"Error type: {type(e).__name__}, message: {str(e)}") + error_message = f"An error occurred during code execution. Error type: {type(e).__name__}, message: {str(e)}" + result = {'status': 'error', 'code': code, 'content': error_message} + + else: + result = {'status': 'error', 'code': "", 'content': "No code block found in the response. 
The model is unable to generate code to complete the task."} + + result['dialog'] = [*messages, {"role": choice.message.role, "content": choice.message.content}] + result['agent'] = 'DataTransformationAgent' + result['refined_goal'] = refined_goal + candidates.append(result) + + logger.info("=== Transform Candidates ===>") + for candidate in candidates: + for key, value in candidate.items(): + if key in ['dialog', 'content']: + logger.info(f"##{key}:\n{str(value)[:1000]}...") + else: + logger.info(f"## {key}:\n{value}") + + return candidates + + + def run(self, input_tables, description, chart_type: str, chart_encodings: dict, prev_messages: list[dict] = [], n=1): + """Args: + input_tables: list[dict], each dict contains 'name' (table name in workspace) + description: str, the description of the data transformation + chart_type: str, the chart type for visualization + chart_encodings: dict, the chart encodings mapping visualization channels to fields + prev_messages: list[dict], the previous messages + n: int, the number of candidates + """ + # Generate data summary with file references + from data_formulator.agents.agent_utils import generate_data_summary + data_summary = generate_data_summary(input_tables, workspace=self.workspace) + + goal = { + "instruction": description, + "chart_type": chart_type, + "chart_encodings": chart_encodings, + } + + user_query = f"[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{json.dumps(goal, indent=4)}" + if len(prev_messages) > 0: + user_query = f"The user wants a new transformation based off the following updated context and goal:\n\n[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{description}" + + logger.info(user_query) + + # Filter out system messages from prev_messages + filtered_prev_messages = [msg for msg in prev_messages if msg.get("role") != "system"] + + messages = [{"role":"system", "content": self.system_prompt}, + *filtered_prev_messages, + {"role":"user","content": user_query}] + + response = self.client.get_completion(messages = messages) + + return self.process_gpt_response(response, messages) + + + def followup(self, input_tables, dialog, latest_data_sample, chart_type: str, chart_encodings: dict, new_instruction: str, n=1): + """ + Followup transformation based on previous dialog and new instruction. + + Args: + input_tables: list of input tables + dialog: previous conversation history + latest_data_sample: sample of the latest transformation result + chart_type: chart type + chart_encodings: chart encodings + new_instruction: new user instruction for followup + n: number of candidates + """ + goal = { + "followup_instruction": new_instruction, + "chart_type": chart_type, + "chart_encodings": chart_encodings + } + + logger.info(f"GOAL: \n\n{goal}") + + updated_dialog = [{"role":"system", "content": self.system_prompt}, *dialog[1:]] + + # Format sample data + sample_data_str = pd.DataFrame(latest_data_sample).head(10).to_string() + '\n......' 
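+        # Only the first 10 rows are included; the trailing '......' marks that the sample is truncated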
+ + messages = [*updated_dialog, {"role":"user", + "content": f"This is the result from the latest transformation:\n\n{sample_data_str}\n\nUpdate the Python script above based on the following instruction:\n\n{json.dumps(goal, indent=4)}"}] + + response = self.client.get_completion(messages = messages) + + return self.process_gpt_response(response, messages) diff --git a/py-src/data_formulator/agents/agent_exploration.py b/py-src/data_formulator/agents/agent_exploration.py index 73515c2e..fc4951de 100644 --- a/py-src/data_formulator/agents/agent_exploration.py +++ b/py-src/data_formulator/agents/agent_exploration.py @@ -6,7 +6,6 @@ import base64 from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary -from data_formulator.agents.agent_sql_data_transform import generate_sql_data_summary logger = logging.getLogger(__name__) @@ -95,10 +94,10 @@ class ExplorationAgent(object): - def __init__(self, client, agent_exploration_rules="", db_conn=None): + def __init__(self, client, workspace, agent_exploration_rules=""): self.agent_exploration_rules = agent_exploration_rules self.client = client - self.db_conn = db_conn + self.workspace = workspace def process_gpt_response(self, messages, response): """Process GPT response to extract exploration plan""" @@ -150,11 +149,7 @@ def get_chart_message(self, visualization): return {"type": "text", "text": "The visualization is not available."} def get_data_summary(self, input_tables): - if self.db_conn: - data_summary = generate_sql_data_summary(self.db_conn, input_tables) - else: - data_summary = generate_data_summary(input_tables) - return data_summary + return generate_data_summary(input_tables, workspace=self.workspace) def suggest_followup(self, input_tables, completed_steps: list[dict], next_steps: list[str]): """ diff --git a/py-src/data_formulator/agents/agent_interactive_explore.py b/py-src/data_formulator/agents/agent_interactive_explore.py index c9151db9..472afaf0 100644 --- a/py-src/data_formulator/agents/agent_interactive_explore.py +++ b/py-src/data_formulator/agents/agent_interactive_explore.py @@ -6,7 +6,6 @@ import pandas as pd from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary -from data_formulator.agents.agent_sql_data_transform import generate_sql_data_summary logger = logging.getLogger(__name__) @@ -115,17 +114,10 @@ class InteractiveExploreAgent(object): - def __init__(self, client, agent_exploration_rules="", db_conn=None): + def __init__(self, client, workspace, agent_exploration_rules=""): self.client = client self.agent_exploration_rules = agent_exploration_rules - self.db_conn = db_conn - - def get_data_summary(self, input_tables, table_name_prefix="Table"): - if self.db_conn: - data_summary = generate_sql_data_summary(self.db_conn, input_tables, table_name_prefix=table_name_prefix) - else: - data_summary = generate_data_summary(input_tables, include_data_samples=False, table_name_prefix=table_name_prefix) - return data_summary + self.workspace = workspace # when set (SQL/datalake mode), use parquet tables for summary def run(self, input_tables, start_question=None, exploration_thread=None, current_data_sample=None, current_chart=None, mode='interactive'): @@ -144,18 +136,19 @@ def run(self, input_tables, start_question=None, exploration_thread=None, """ # Generate data summary - data_summary = self.get_data_summary(input_tables) + data_summary = generate_data_summary(input_tables, self.workspace) # Build context including exploration thread if available 
context = f"[DATASETS] These are the datasets the user is working with:\n\n{data_summary}" if exploration_thread: - thread_summary = self.get_data_summary( + thread_summary = generate_data_summary( [{ 'name': table.get('name', f'Table {i}'), 'rows': table.get('rows', []), 'attached_metadata': table.get('description', ''), } for i, table in enumerate(exploration_thread, 1)], + self.workspace, table_name_prefix="Thread Table" ) context += f"\n\n[EXPLORATION THREAD] These are the sequence of tables the user created in this exploration thread, in the order they were created, and what questions are asked to create them:\n\n{thread_summary}" diff --git a/py-src/data_formulator/agents/agent_py_concept_derive.py b/py-src/data_formulator/agents/agent_py_concept_derive.py deleted file mode 100644 index 9d2ba055..00000000 --- a/py-src/data_formulator/agents/agent_py_concept_derive.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import time -import json - -from data_formulator.agents.agent_utils import generate_data_summary, extract_code_from_gpt_response -import data_formulator.py_sandbox as py_sandbox - -import traceback - -import logging - -logger = logging.getLogger(__name__) - - -SYSTEM_PROMPT = '''You are a data scientist to help user to derive new column based on existing columns in a dataset. -Your job is to write a python function based on input data summary, instruction and output column name. -Complete a python function based off the [CONTEXT], [TEMPLATE] and [GOAL] provided, the function's input arguments is a dataframe, and the new column derived from the dataframe is returned. -The function should be as simple as possible. - -Allowed imports, if you need any of them, import yourself, otherwise, do not import (other libraries will be blocked): -- pandas (import pandas as pd is always included) -- numpy -- math -- datetime -- json -- statistics -- random -- collections -- re -- itertools -- functools -- operator - -[TEMPLATE] - -```python -import pandas as pd -import re -import datetime - -def derive_new_column(df): - # complete code here - return col -``` - -For example: - -[CONTEXT] - -Here are our datasets, here are their field summaries and samples: - -table_0 (us_covid_cases) fields: - Date -- type: object, values: 1/1/2021, 1/1/2022, 1/1/2023, ..., 9/8/2022, 9/9/2020, 9/9/2021, 9/9/2022 - Cases -- type: int64, values: -23999, -14195, -6940, ..., 1018935, 1032159, 1178403, 1433977 - -table_0 (us_covid_cases) sample: -``` -|Date|Cases -0|1/21/2020|1 -1|1/22/2020|0 -2|1/23/2020|0 -3|1/24/2020|1 -4|1/25/2020|1 -...... 
-``` - -[GOAL] - -{ - "input_fields": ["Date"], - "output_field": "month", - "description": "extract month from Date" -} - -[OUTPUT] - -```python -import re -import datetime - -def derive_new_column(df): - df['month'] = df['Date'].apply(lambda x: datetime.datetime.strptime(x, '%m/%d/%Y').month) - return df['month'] -``` -''' - - -class PyConceptDeriveAgent(object): - - def __init__(self, client, exec_python_in_subprocess=False): - self.client = client - self.exec_python_in_subprocess = exec_python_in_subprocess - - def run(self, input_table, input_fields, output_field, description): - """derive a new concept based on input table, input fields, and output field name, (and description) - """ - - data_summary = generate_data_summary([input_table], include_data_samples=True) - - objective = { - "input_fields": input_fields, - "output_field": output_field, - "description": description - } - - user_query = f"[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{objective}\n\n[OUTPUT]\n" - - logger.info(user_query) - - messages = [{"role":"system", "content": SYSTEM_PROMPT}, - {"role":"user","content": user_query}] - - time_start = time.time() - ###### the part that calls open_ai - response = self.client.get_completion(messages = messages) - time_end = time.time() - logger.info(f"time taken to get response: {time_end - time_start} seconds") - - #log = {'messages': messages, 'response': response.model_dump(mode='json')} - - candidates = [] - for choice in response.choices: - - logger.info("\n=== Python Data Derive Agent ===>\n") - logger.info(choice.message.content + "\n") - - code_blocks = extract_code_from_gpt_response(choice.message.content + "\n", "python") - - if len(code_blocks) > 0: - code_str = code_blocks[-1] - try: - result = py_sandbox.run_derive_concept(code_str, output_field, input_table['rows'], self.exec_python_in_subprocess) - - if result['status'] == 'ok': - result['content'] = { - 'rows': json.loads(result['content'].to_json(orient='records')), - } - else: - print(result['content']) - result['code'] = code_str - except Exception as e: - print('other error:') - error_message = traceback.format_exc() - print(error_message) - result = {'status': 'other error', 'content': error_message} - else: - result = {'status': 'other error', 'content': 'unable to extract code from response'} - - result['dialog'] = [*messages, {"role": choice.message.role, "content": choice.message.content}] - result['agent'] = 'PyConceptDeriveAgent' - candidates.append(result) - - time_end = time.time() - logger.info(f"time taken to get candidates: {time_end - time_start} seconds") - - return candidates \ No newline at end of file diff --git a/py-src/data_formulator/agents/agent_py_data_transform.py b/py-src/data_formulator/agents/agent_py_data_transform.py deleted file mode 100644 index a227d106..00000000 --- a/py-src/data_formulator/agents/agent_py_data_transform.py +++ /dev/null @@ -1,398 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import json - -from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary, extract_code_from_gpt_response -import data_formulator.py_sandbox as py_sandbox -import pandas as pd - -import logging - -# Replace/update the logger configuration -logger = logging.getLogger(__name__) - -SYSTEM_PROMPT = '''You are a data scientist to help user to transform data that will be used for visualization. 
-The user will provide you information about what data would be needed, and your job is to create a python function based on the input data summary, transformation instruction and expected fields. -The users' instruction includes "chart_type" and "chart_encodings" that describe the visualization they want, and natural language instructions "goal" that describe what data is needed. - -**Important:** -- NEVER make assumptions or judgments about a person's gender, biological sex, sexuality, religion, race, nationality, ethnicity, political stance, socioeconomic status, mental health, invisible disabilities, medical conditions, personality type, social impressions, emotional state, and cognitive state. -- NEVER create formulas that could be used to discriminate based on age. Ageism of any form (explicit and implicit) is strictly prohibited. -- If above issue occurs, generate columns with np.nan. - -Concretely, you should first refine users' goal and then create a python function in the output section based off the [CONTEXT] and [GOAL]: - - 1. First, refine users' [GOAL]. The main objective in this step is to check if "chart_type" and "chart_encodings" provided by the user are sufficient to achieve their "goal". Concretely: - - based on the user's "goal" and "chart_type" and "chart_encodings", elaborate the goal into a "detailed_instruction". - - "display_instruction" is a short verb phrase describing the users' goal. - - it would be a short verbal description of user intent as a verb phrase (<12 words). - - generate it based on detailed_instruction and the suggested chart_type and chart_encodings, but don't need to mention the chart details. - - should capture key computation ideas: by reading the display, the user can understand the purpose and what's derived from the data. - - if the user specification follows up the previous instruction, the 'display_instruction' should only describe how it builds up the previous instruction without repeating information from previous steps. - - the phrase can be presented in different styles, e.g., question (what's xxx), instruction (show xxx), description, etc. - - if you mention column names from the input or the output data, highlight the text in **bold**. - * the column can either be a column in the input data, or a new column that will be computed in the output data. - * the mention don't have to be exact match, it can be semantically matching, e.g., if you mentioned "average score" in the text while the column to be computed is "Avg_Score", you should still highlight "**average score**" in the text. - - determine "input_tables", the names of a subset of input tables from [CONTEXT] section that will be used to achieve the user's goal. - - **IMPORTANT** Note that the Table 1 in [CONTEXT] section is the table the user is currently viewing, it should take precedence if the user refers to insights about the "current table". - - At the same time, leverage table information to determine which tables are relevant to the user's goal and should be used. - - determine "output_fields", the desired fields that the output data should have to achieve the user's goal, it's a good idea to include intermediate fields here. - - then decide "chart_encodings", which maps visualization channels (x, y, color, size, opacity, facet, latitude, longitude, etc.) to a subset of "output_fields" that will be visualized, - - the "chart_encodings" should be created to support the user's "chart_type". 
- - first, determine whether the user has provided sufficient fields in "chart_encodings" that are needed to achieve their goal: - - if the user's "chart_encodings" are sufficient, simply copy it. - - if the user didn't provide sufficient fields in "chart_encodings", add missing fields in "chart_encodings" (ordered them based on whether the field will be used in x,y axes or legends); - - "chart_encodings" should only include fields that will be visualized (do not include other intermediate fields from "output_fields") - - when adding new fields to "chart_encodings", be efficient and add only a minimal number of fields that are needed to achive the user's goal. - - generally, the total number of fields in "chart_encodings" should be no more than 3 for x,y,legend. - - if the user's "chart_encodings" is sufficient but can be optimized, you can reorder encodings to visualize the data more effectively. - - sometimes, user may provide instruction to update visualizations fields they provided. You should leverage the user's goal to resolve the conflict and decide the final "chart_encodings" - - e.g., they may mention "use B metric instead" while A metric is in provided fields, in this case, you should update "chart_encodings" to update A metric with B metric. - - if the user provides latitude and longitude as visual channels, use "latitude" and "longitude" as visual channels in "chart_encodings" as opposed to "x" and "y". - - guide on statistical analysis: - - when the user asks for forecasting or regression analysis, you should consider the following: - - the output should be a long format table where actual x, y pairs and predicted x, y pairs are included in the X, Y columns, they are differentiated with a third column "is_predicted". - - i.e., if the user ask for forecasting based on two columns T and Y, the output should be three columns: T, Y, is_predicted, where - - T, Y columns contain BOTH original values from the data and predicted values from the data. - - is_predicted is a boolean field to indicate whether the x, y pairs are original values from the data or predicted / regression values from the data. - - the recommended chart should be line chart (time series) or scatter plot (quantitative x, y) - - if the user asks for forecasting, it's good to include predicted x, y pairs for both x in the original data and future x values (i.e., combine regression and forecasting results) - - in this case, is_predicted should be of three values 'original', 'regression', 'forecasting' - - when the user asks for clustering: - - the output should be a long format table where actual x, y pairs with a third column "cluster_id" that indicates the cluster id of the data point. - - the recommended chart should be scatter plot (quantitative x, y) - - Prepare the result in the following json format: - -``` -{ - "detailed_instruction": "..." // string, elaborate user instruction with details if the user - "display_instruction": "..." // string, the short verb phrase describing the users' goal. - "input_tables": [...] // string[], describe names of the input tables that will be used in the transformation. - "output_fields": [...] // string[], describe the desired output fields that the output data should have based on the user's goal, it's a good idea to preserve intermediate fields here (i.e., the goal of transformed data) - "chart_encodings": { - "x": "", - "y": "", - "color": "", - "size": "", - "opacity": "", - "facet": "", - ... 
// other visualization channels user used - } // object: map visualization channels (x, y, color, size, opacity, facet, etc.) to a subset of "output_fields" that will be visualized. - "reason": "..." // string, explain why this refinement is made -} -``` - - 2. Then, write a python function based on the refined goal, the function input is a dataframe "df" (or multiple dataframes based on tables described in "input_tables") and the output is the transformed dataframe "transformed_df". "transformed_df" should contain all "output_fields" from the refined goal. -The python function must follow the template provided in [TEMPLATE], only import libraries allowed in the template, do not modify function name. The function should be as simple as possible and easily readable. -If there is no data transformation needed based on "output_fields", the transformation function can simply "return df". - -[TEMPLATE] - -```python -import pandas as pd -import collections -import numpy as np -# from sklearn import ... # import from sklearn if you need it. - -def transform_data(df1, df2, ...): - # complete the template here - return transformed_df -``` - -note: -- decide the function signature based on the number of tables you decided in the previous step "input_tables": - - if you decide there will only be one input table, then function signature should be `def transform_data(df1)` - - if you decided there will be k input tables, then function signature should be `def transform_data(df_1, df_2, ..., df_k)`. - - instead of using generic names like df1, df2, ..., try to use intuitive table names for function arguments, for example, if you have input_tables: ["City", "Weather"]`, you can use `transform_data(df_city, df_weather)` to refer to the two dataframes. - - **VERY IMPORTANT** the number of arguments in the function signature must be the same as the number of tables provided in "input_tables", and the order of arguments must match the order of tables provided in "input_tables". -- datetime objects handling: - - if the output field is year, convert it to number, if it is year-month / year-month-day, convert it to string object (e.g., "2020-01" / "2020-01-01"). - - if the output is time only: convert hour to number if it's just the hour (e.g., 10), but convert hour:min or h:m:s to string object (e.g., "10:30", "10:30:45") - - never return datetime object directly, convert it to either number (if it only contains year) or string so it's readable. - - 3. The output must only contain a json object representing the refined goal and a python code block representing the transformation code, do not add any extra text explanation. -''' - -EXAMPLE=''' - -For example: - -[CONTEXT] - -Here are our datasets, here are their field summaries and samples: - -df1 (us_covid_cases) fields: - Date -- type: object, values: 1/1/2021, 1/1/2022, 1/1/2023, ..., 9/8/2022, 9/9/2020, 9/9/2021, 9/9/2022 - Cases -- type: int64, values: -23999, -14195, -6940, ..., 1018935, 1032159, 1178403, 1433977 - -df1 (us_covid_cases) sample: -``` -|Date|Cases -0|1/21/2020|1 -1|1/22/2020|0 -2|1/23/2020|0 -3|1/24/2020|1 -4|1/25/2020|1 -...... 
-``` - -[GOAL] - -{ - "instruction": "calculate 7-day moving average", - "chart_type": "line", - "chart_encodings": {"x": "Date", "y": "7-day average cases"} -} - -[OUTPUT] - -{ - "detailed_instruction": "Calculate the 7-day moving average of COVID-19 cases over time.", - "display_instruction": "Calculate 7-day moving average of COVID-19 cases", - "output_fields": ["Date", "Cases", "7-day average cases"], - "chart_encodings": {"x": "Date", "y": "7-day average cases"}, - "reason": "To calculate the 7-day moving average, the 'Cases' field is required, but it is not needed for visualization. The provided fields are sufficient to achieve the goal." -} - -```python -import pandas as pd -import collections -import numpy as np - -def transform_data(df): - # Convert Date column to datetime - df['Date'] = pd.to_datetime(df['Date']) - - # Sort the dataframe by Date - df = df.sort_values('Date') - - # Calculate the 7-day moving average of cases - df['7-day average cases'] = df['Cases'].rolling(window=7).mean() - - # Select the output fields - transformed_df = df[['Date', 'Cases', '7-day average cases']] - - return transformed_df -``` - -[CONTEXT] - -Here are our datasets, here are their field summaries and samples: - -df1 (weather_seattle_atlanta) fields: - Date -- type: object, values: 1/1/2020, 1/10/2020, 1/11/2020, ..., 9/6/2020, 9/7/2020, 9/8/2020, 9/9/2020 - City -- type: object, values: Atlanta, Seattle - Temperature -- type: int64, values: 30, 31, 32, ..., 83, 84, 85, 86 - -df1 (weather_seattle_atlanta) sample: -``` -|Date|City|Temperature -0|1/1/2020|Seattle|51 -1|1/1/2020|Atlanta|45 -2|1/2/2020|Seattle|45 -3|1/2/2020|Atlanta|47 -4|1/3/2020|Seattle|48 -...... -``` - -[GOAL] - -{ - "instruction": "create a scatter plot to with seattle and atlanta temperatures on x,y axes, color points by which city is warmer", - "chart_type": "scatter", - "chart_encodings": {"x": "Seattle Temperature", "y": "Atlanta Temperature", "color": "Warmer City"} -} - -[OUTPUT] - -{ - "detailed_instruction": "Create a scatter plot to compare Seattle and Atlanta temperatures with Seattle temperatures on the x-axis and Atlanta temperatures on the y-axis. Color the points by which city is warmer.", - "input_tables": ["weather_seattle_atlanta"], - "output_fields": ["Date", "Seattle Temperature", "Atlanta Temperature", "Warmer City"], - "chart_encodings": {"x": "Seattle Temperature", "y": "Atlanta Temperature", "color": "Warmer City"}, - "reason": "To compare Seattle and Atlanta temperatures with Seattle temperatures on the x-axis and Atlanta temperatures on the y-axis, and color points by which city is warmer, separate temperature fields for Seattle and Atlanta are required. Additionally, a new field 'Warmer City' is needed to indicate which city is warmer." 
-} - -```python -import pandas as pd -import collections -import numpy as np - -def transform_data(df_weather_seattle_atlanta): - # Pivot the dataframe to have separate columns for Seattle and Atlanta temperatures - df_pivot = df.pivot(index='Date', columns='City', values='Temperature').reset_index() - df_pivot.columns = ['Date', 'Atlanta Temperature', 'Seattle Temperature'] - - # Determine which city is warmer for each date - df_pivot['Warmer City'] = df_pivot.apply(lambda row: 'Atlanta' if row['Atlanta Temperature'] > row['Seattle Temperature'] else 'Seattle', axis=1) - - # Select the output fields - transformed_df = df_pivot[['Date', 'Seattle Temperature', 'Atlanta Temperature', 'Warmer City']] - - return transformed_df -``` -''' - -class PythonDataTransformationAgent(object): - - def __init__(self, client, system_prompt=None, exec_python_in_subprocess=False, agent_coding_rules=""): - self.client = client - - # Incorporate agent coding rules into system prompt if provided - if system_prompt is not None: - self.system_prompt = system_prompt - else: - base_prompt = SYSTEM_PROMPT - if agent_coding_rules and agent_coding_rules.strip(): - self.system_prompt = base_prompt + "\n\n[AGENT CODING RULES]\nPlease follow these rules when generating code. Note: if the user instruction conflicts with these rules, you should priortize user instructions.\n\n" + agent_coding_rules.strip() - else: - self.system_prompt = base_prompt - - self.exec_python_in_subprocess = exec_python_in_subprocess - - def process_gpt_response(self, input_tables, messages, response): - """process gpt response to handle execution""" - - if isinstance(response, Exception): - result = {'status': 'other error', 'content': str(response.body)} - return [result] - - candidates = [] - for choice in response.choices: - logger.info("=== Data transformation result ===>") - logger.info(choice.message.content + "\n") - - json_blocks = extract_json_objects(choice.message.content + "\n") - if len(json_blocks) > 0: - refined_goal = json_blocks[0] - else: - refined_goal = {'chart_encodings': {}, 'instruction': '', 'reason': '', 'input_tables': []} - - code_blocks = extract_code_from_gpt_response(choice.message.content + "\n", "python") - - if len(code_blocks) > 0: - code_str = code_blocks[-1] - - try: - # Check if input_tables is available - if not input_tables: - result = {'status': 'error', 'code': code_str, 'content': "No input tables available."} - else: - # Determine which tables to use based on refined_goal - if 'input_tables' in refined_goal and isinstance(refined_goal['input_tables'], list) and len(refined_goal['input_tables']) > 0: - # Use only specified tables - validate all exist - table_name_map = {t['name']: t for t in input_tables} - tables_to_use = [] - missing_tables = [] - - for table_name in refined_goal['input_tables']: - if table_name in table_name_map: - tables_to_use.append(table_name_map[table_name]) - else: - missing_tables.append(table_name) - - # Error if any specified table is missing - if missing_tables: - available_table_names = [t['name'] for t in input_tables] - result = {'status': 'error', 'code': code_str, 'content': f"Table(s) '{', '.join(missing_tables)}' specified in 'input_tables' not found. 
Available tables: {', '.join(available_table_names)}"} - else: - result = py_sandbox.run_transform_in_sandbox2020(code_str, [pd.DataFrame.from_records(t['rows']) for t in tables_to_use], self.exec_python_in_subprocess) - else: - # No input_tables specified in refined_goal, use all input_tables - result = py_sandbox.run_transform_in_sandbox2020(code_str, [pd.DataFrame.from_records(t['rows']) for t in input_tables], self.exec_python_in_subprocess) - - result['code'] = code_str - - if result['status'] == 'ok': - # parse the content - result_df = result['content'] - result['content'] = { - 'rows': json.loads(result_df.to_json(orient='records')), - } - else: - logger.info(result['content']) - except Exception as e: - logger.warning('Error occurred during code execution:') - error_message = f"An error occurred during code execution. Error type: {type(e).__name__}" - logger.warning(error_message) - result = {'status': 'error', 'code': code_str, 'content': error_message} - else: - result = {'status': 'error', 'code': "", 'content': "No code block found in the response. The model is unable to generate code to complete the task."} - - result['dialog'] = [*messages, {"role": choice.message.role, "content": choice.message.content}] - result['agent'] = 'PythonDataTransformationAgent' - result['refined_goal'] = refined_goal - candidates.append(result) - - logger.info("=== Transform Candidates ===>") - for candidate in candidates: - for key, value in candidate.items(): - if key in ['dialog', 'content']: - logger.info(f"##{key}:\n{str(value)[:1000]}...") - else: - logger.info(f"## {key}:\n{value}") - - return candidates - - - def run(self, input_tables, description, chart_type: str, chart_encodings: dict, prev_messages: list[dict] = [], n=1): - - data_summary = generate_data_summary(input_tables, include_data_samples=True) - - goal = { - "instruction": description, - "chart_type": chart_type, - "chart_encodings": chart_encodings, - } - - user_query = f"[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{json.dumps(goal, indent=4)}" - if len(prev_messages) > 0: - user_query = f"The user wants a new transformation based off the following updated context and goal:\n\n[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{description}" - - logger.info(user_query) - - # Filter out system messages from prev_messages - filtered_prev_messages = [msg for msg in prev_messages if msg.get("role") != "system"] - - messages = [{"role":"system", "content": self.system_prompt}, - *filtered_prev_messages, - {"role":"user", "content": user_query}] - - response = self.client.get_completion(messages = messages) - - return self.process_gpt_response(input_tables, messages, response) - - - def followup(self, input_tables, dialog, latest_data_sample, chart_type: str, chart_encodings: dict, new_instruction: str, n=1): - """ - extend the input data (in json records format) to include new fields - latest_data_sample: the latest data sample that the user is working on, it's a json object that contains the data sample of the current table - chart_type: the chart type that the user wants to use - chart_encodings: the chart encodings that the user wants to use - new_instruction: the new instruction that the user wants to add to the latest data sample - """ - - goal = { - "followup_instruction": new_instruction, - "chart_type": chart_type, - "chart_encodings": chart_encodings - } - - logger.info(f"GOAL: \n\n{goal}") - - #logger.info(dialog) - - updated_dialog = [{"role":"system", "content": self.system_prompt}, *dialog[1:]] - - # get the current table name - 
sample_data_str = pd.DataFrame(latest_data_sample).head(10).to_string() + '\n......' - - messages = [*updated_dialog, - {"role":"user", - "content": f"This is the result from the latest python code:\n\n{sample_data_str}\n\nUpdate the code above based on the following instruction:\n\n{json.dumps(goal, indent=4)}"}] - - response = self.client.get_completion(messages = messages) - - return self.process_gpt_response(input_tables, messages, response) diff --git a/py-src/data_formulator/agents/agent_report_gen.py b/py-src/data_formulator/agents/agent_report_gen.py index f1dac083..bcee8be3 100644 --- a/py-src/data_formulator/agents/agent_report_gen.py +++ b/py-src/data_formulator/agents/agent_report_gen.py @@ -4,7 +4,6 @@ import json from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary -from data_formulator.agents.agent_sql_data_transform import generate_sql_data_summary import logging @@ -53,16 +52,12 @@ class ReportGenAgent(object): - def __init__(self, client, conn): + def __init__(self, client, workspace): self.client = client - self.conn = conn + self.workspace = workspace def get_data_summary(self, input_tables): - if self.conn: - data_summary = generate_sql_data_summary(self.conn, input_tables) - else: - data_summary = generate_data_summary(input_tables) - return data_summary + return generate_data_summary(input_tables, self.workspace) def stream(self, input_tables, charts=[], style="blog post"): """derive a new concept based on the raw input data diff --git a/py-src/data_formulator/agents/agent_sql_data_rec.py b/py-src/data_formulator/agents/agent_sql_data_rec.py deleted file mode 100644 index bc48c053..00000000 --- a/py-src/data_formulator/agents/agent_sql_data_rec.py +++ /dev/null @@ -1,363 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import json - -from data_formulator.agents.agent_utils import extract_json_objects, extract_code_from_gpt_response -from data_formulator.agents.agent_sql_data_transform import generate_sql_data_summary - -import random -import string - -import traceback -import duckdb -import pandas as pd - -import logging - -logger = logging.getLogger(__name__) - - -SYSTEM_PROMPT = '''You are a data scientist to help user to recommend data that will be used for visualization. -The user will provide you information about what visualization they would like to create, and your job is to recommend a transformed data that can be used to create the visualization and write a SQL query to transform the data. -The recommendation and transformation function should be based on the [CONTEXT] and [GOAL] provided by the user. -The [CONTEXT] shows what the current dataset is, and the [GOAL] describes what the user wants the data for. - -**Important:** -- NEVER make assumptions or judgments about a person's gender, biological sex, sexuality, religion, race, nationality, ethnicity, political stance, socioeconomic status, mental health, invisible disabilities, medical conditions, personality type, social impressions, emotional state, and cognitive state. -- NEVER create formulas that could be used to discriminate based on age. Ageism of any form (explicit and implicit) is strictly prohibited. -- If above issue occurs, generate columns with NULL. - -Concretely, you should infer the appropriate data and create a SQL query based off the [CONTEXT] and [GOAL] in two steps: - -1. First, based on users' [GOAL]. Create a json object that represents the inferred user intent. 
The json object should have the following format: - -{ - "mode": "" // string, one of "infer", "overview", "distribution", "summary" - "recap": "..." // string, a short summary of the user's goal. - "display_instruction": "..." // string, the even shorter verb phrase describing the users' goal. - "recommendation": "..." // string, explain why this recommendation is made - "input_tables": [...] // string[], describe names of the input tables that will be used in the transformation. - "output_fields": [...] // string[], describe the desired output fields that the output data should have (i.e., the goal of transformed data), it's a good idea to preseve intermediate fields here - "chart_type": "" // string, one of "point", "bar", "line", "area", "heatmap", "group_bar". "chart_type" should either be inferred from user instruction, or recommend if the user didn't specify any. - "chart_encodings": { - "x": "", - "y": "", - "color": "", - "size": "", - "opacity": "", - "facet": "", - } // object: map visualization channels (x, y, color, size, opacity, facet, etc.) to a subset of output fields, appropriate visual channels for different chart types are defined below. -} - -Concretely: - - recap what the user's goal is in a short summary in "recap". - - If the user's [GOAL] is clear already, simply infer what the user mean. Set "mode" as "infer" and create "output_fields" and "chart_encodings" based off user description. - - If the user's [GOAL] is not clear, make recommendations to the user: - - choose one of "distribution", "overview", "summary" in "mode": - * if it is "overview" and the data is in wide format, reshape it into long format. - * if it is "distribution", select a few fields that would be interesting to visualize together. - * if it is "summary", calculate some aggregated statistics to show intresting facts of the data. - - describe the recommendation reason in "recommendation" - - based on the recommendation, determine what is an ideal output data. Note, the output data must be in tidy format. - - then suggest recommendations of chart encoding that should be used to create the visualization. - - "display_instruction" should be a short verb phrase describing the users' goal, it should be even shorter than "recap". - - it would be a short verbal description of user intent as a verb phrase (<12 words). - - generate based on "recap" and the suggested visualization, but don't need to mention the visualization details. - - should capture key computation ideas: by reading the display, the user can understand the purpose and what's derived from the data. - - if the user instruction builds up the previous instruction, the 'display_instruction' should only describe how it builds up the previous instruction without repeating information from previous steps. - - the phrase can be presented in different styles, e.g., question (what's xxx), instruction (show xxx), description, etc. - - if you mention column names from the input or the output data, highlight the text in **bold**. - * the column can either be a column in the input data, or a new column that will be computed in the output data. - * the mention don't have to be exact match, it can be semantically matching, e.g., if you mentioned "average score" in the text while the column to be computed is "Avg_Score", you should still highlight "**average score**" in the text. - - determine "input_tables", the names of a subset of input tables from [CONTEXT] section that will be used to achieve the user's goal. 
- - **IMPORTANT** Note that the Table 1 in [CONTEXT] section is the table the user is currently viewing, it should take precedence if the user refers to insights about the "current table". - - At the same time, leverage table information to determine which tables are relevant to the user's goal and should be used. - - "chart_type" must be one of "point", "bar", "line", "area", "heatmap", "group_bar" - - "chart_encodings" should specify which fields should be used to create the visualization - - decide which visual channels should be used to create the visualization appropriate for the chart type. - - point: x, y, color, size, facet - - histogram: x, color, facet - - bar: x, y, color, facet - - line: x, y, color, facet - - area: x, y, color, facet - - heatmap: x, y, color, facet - - group_bar: x, y, color, facet - - note that all fields used in "chart_encodings" should be included in "output_fields". - - all fields you need for visualizations should be transformed into the output fields! - - "output_fields" should include important intermediate fields that are not used in visualization but are used for data transformation. - - typically only 2-3 fields should be used to create the visualization (x, y, color/size), facet use be added if it's a faceted visualization (totally 4 fields used). - - Guidelines for choosing chart type and visualization fields: - - Consider chart types as follows: - - (point) Scatter Plots: x,y: Quantitative/Categorical, color: Categorical (optional), size: Quantitative (optional for creating bubble chart), - - best for: Relationships, correlations, distributions - - scatter plots are good default way to visualize data when other chart types are not applicable. - - use color to visualize points from different categories. - - use size to visualize data points with an additional quantitative dimension of the data points. - - (histogram) Histograms: x: Quantitative/Categorical, color: Categorical (optional for creating grouped histogram), - - best for: Distribution of a quantitative field - - use x values directly if x values are categorical, and transform the data into bins if the field values are quantitative. - - when color is specified, the histogram will be grouped automatically (items with the same x values will be grouped). - - (bar) Bar Charts: x: Categorical (nominal/ordinal), y: Quantitative, color: Categorical/Quantitative (for stacked bar chart / showing additional quantitative dimension), - - best for: Comparisons across categories - - use (bar) for simple bar chart or stacked bar chart (when it makes sense to add up Y values for each category with the same X value), - - when color is specified, the bar will be stacked automatically (items with the same x values will be stacked). - - note that when there are multiple rows in the data with same x values, the bar will be stacked automatically. - - 1. consider to use an aggregated field for y values if the value is not suitable for stacking. - - 2. consider to introduce facets so that each group is visualized in a separate bar. - - (group_bar) for grouped bar chart, x: Categorical (nominal/ordinal), y: Quantitative, color: Categorical - - when color is specifed, bars from different groups will be grouped automatically. - - only use facet if the cardinality of color field is small (less than 5). 
- - (line) Line Charts: x: Temporal (preferred) or ordinal, y: Quantitative, color: Categorical (optional for creating multiple lines), - - best for: Trends over time, continuous data - - note that when there are multiple rows in the data belong to the same group (same x and color values) but different y values, the line will not look correct. - - consider to use an aggregated field for y values, or introduce facets so that each group is visualized in a separate line. - - (area) Area Charts: x: Temporal (preferred) or ordinal, y: Quantitative, color: Categorical (optional for creating stacked areas), - - best for: Trends over time, continuous data - - (heatmap) Heatmaps: x,y: Categorical (you need to convert quantitative to nominal), color: Quantitative intensity, - - best for: Pattern discovery in matrix data - - facet channel is available for all chart types, it supports a categorical field with small cardinality to visualize the data in different facets. - - if you really need additional legend fields: - - you can use opacity for legend (support Quantitative and Categorical). - - visualization fields require tidy data. - - similar to VegaLite and ggplot2 so that each field is mapped to a visualization axis or legend. - - consider data transformations if you want to visualize multiple fields together: - - exapmle 1: suggest reshaping the data into long format in data transformation description (if these fields are all of the same type, e.g., they are all about sales, price, two columns about min/max-values, etc. don't mix different types of fields in reshaping) so we can visualize multiple fields as categories or in different facets. - - exapmle 2: calculate some derived fields from these fields(e.g., correlation, difference, profit etc.) in data transformation description to visualize them in one visualization. - - example 3: create a visualization only with a subset of the fields, you don't have to visualize all of them in one chart, you can later create a visualization with the rest of the fields. With the subset of charts, you can also consider reshaping or calculate some derived value. - - again, it does not make sense to have five fields like [item, A, B, C, D, E] in visualization fields, you should consider data transformation to reduce the number of fields. - - when reshaping data to long format, only fields of the same semantic type should be rehaped into the same column. - - 2. Then, write a SQL query based on the inferred goal, the query input are tables (or multiple tables presented in the [CONTEXT] section) and the output is the transformed data. The output data should contain all "output_fields" from the refined goal. -The query should be as simple as possible and easily readable. If there is no data transformation needed based on "output_fields", the transformation function can simply "SELECT * FROM table". -note: - - the sql query should be written in the style of duckdb. - - if the user provided multiple tables, you should consider the join between tables to derive the output. - - 3. The output must only contain two items: - - a json object (wrapped in ```json```) representing the refined goal (including "mode", "recommendation", "output_fields", "chart_type", "chart_encodings") - - a sql query block (wrapped in ```sql```) representing the transformation code, do not add any extra text explanation. - -some notes: -- in DuckDB, you escape a single quote within a string by doubling it ('') rather than using a backslash (\'). 
-- in DuckDB, you need to use proper date functions to perform date operations. -- Critical: When using date/time functions in DuckDB, always cast date columns to explicit types to avoid function overload ambiguity: - * Use `CAST(date_column AS DATE)` for date operations - * Use `CAST(datetime_column AS TIMESTAMP)` for timestamp operations - * Use `CAST(datetime_column AS TIMESTAMP_NS)` for nanosecond precision timestamps - * Common patterns: - - Extract year: `CAST(strftime('%Y', CAST(date_column AS DATE)) AS INTEGER) AS year` - - Extract month: `CAST(strftime('%m', CAST(date_column AS DATE)) AS INTEGER) AS month` - - Format date: `strftime('%Y-%m-%d', CAST(date_column AS DATE)) AS formatted_date` - - Date arithmetic: `CAST(date_column AS DATE) + INTERVAL 1 DAY` - * This prevents "Could not choose a best candidate function" errors in DuckDB -- Critical: DuckDB regex limitations: - * Does NOT support Unicode escape sequences like \\u0400-\\u04FF - * For Unicode character detection, use character ranges directly: [а-яА-Я] for Cyrillic, [一-龥] for Chinese, etc. - * Alternative: Use ASCII ranges or specific character sets that DuckDB supports - * Example: Instead of quote ~ '[\\u0400-\\u04FF]', use quote ~ '[а-яА-ЯёЁ]' -''' - -example = """ -For example: - -[CONTEXT] - -Here are our datasets, here are their field summaries and samples: - -table_0 (student_exam) fields: - student -- type: int64, values: 1, 2, 3, ..., 997, 998, 999, 1000 - major -- type: object, values: liberal arts, science - math -- type: int64, values: 0, 8, 18, ..., 97, 98, 99, 100 - reading -- type: int64, values: 17, 23, 24, ..., 96, 97, 99, 100 - writing -- type: int64, values: 10, 15, 19, ..., 97, 98, 99, 100 - -table_0 (student_exam) sample: - -``` -|student|major|math|reading|writing -0|1|liberal arts|72|72|74 -1|2|liberal arts|69|90|88 -2|3|liberal arts|90|95|93 -3|4|science|47|57|44 -4|5|science|76|78|75 -...... -``` - -[GOAL] - -{"goal": "Rank students based on their average scores"} - -[OUTPUT] - -```json -{ - "input_tables": ["student_exam"], - "recap": "Rank students based on their average scores", - "display_instruction": "Rank students by **average scores**", - "mode": "infer", - "recommendation": "To rank students based on their average scores, we need to calculate the average score for each student, then sort the data, and finally assign a rank to each student based on their average score.", - "output_fields": ["student", "major", "average_score", "rank"], - "chart_type": "bar", - "chart_encodings": {"x": "student", "y": "average_score"} -} -``` - -```sql -SELECT - student, - major, - (math + reading + writing) / 3.0 AS average_score, - RANK() OVER (ORDER BY (math + reading + writing) / 3.0 DESC) AS rank -FROM - student_exam -ORDER BY average_score DESC; -``` -""" - -class SQLDataRecAgent(object): - - def __init__(self, client, conn, system_prompt=None, agent_coding_rules=""): - self.client = client - self.conn = conn - - # Incorporate agent coding rules into system prompt if provided - if system_prompt is not None: - self.system_prompt = system_prompt - else: - base_prompt = SYSTEM_PROMPT - if agent_coding_rules and agent_coding_rules.strip(): - self.system_prompt = base_prompt + "\n\n[AGENT CODING RULES]\nPlease follow these rules when generating code. 
Note: if the user instruction conflicts with these rules, you should priortize user instructions.\n\n" + agent_coding_rules.strip() - else: - self.system_prompt = base_prompt - - def process_gpt_response(self, input_tables, messages, response): - """process gpt response to handle execution""" - - #log = {'messages': messages, 'response': response.model_dump(mode='json')} - - if isinstance(response, Exception): - result = {'status': 'other error', 'content': str(response.body)} - return [result] - - candidates = [] - for choice in response.choices: - - logger.info("\n=== Data recommendation result ===>\n") - logger.info(choice.message.content + "\n") - - json_blocks = extract_json_objects(choice.message.content + "\n") - if len(json_blocks) > 0: - refined_goal = json_blocks[0] - else: - refined_goal = { 'mode': "", 'recommendation': "", 'output_fields': [], 'chart_encodings': {}, 'chart_type': "" } - - code_blocks = extract_code_from_gpt_response(choice.message.content + "\n", "sql") - - if len(code_blocks) > 0: - code_str = code_blocks[-1] - - try: - random_suffix = ''.join(random.choices(string.ascii_lowercase, k=4)) - table_name = f"view_{random_suffix}" - - create_query = f"CREATE VIEW IF NOT EXISTS {table_name} AS {code_str}" - self.conn.execute(create_query) - self.conn.commit() - - # Check how many rows are in the table - row_count = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0] - - # Only limit to 5000 if there are more rows - query_output = self.conn.execute(f"SELECT * FROM {table_name} LIMIT 5000").fetch_df() - - result = { - "status": "ok", - "code": code_str, - "content": { - 'rows': json.loads(query_output.to_json(orient='records')), - 'virtual': { - 'table_name': table_name, - 'row_count': row_count - } - }, - } - except duckdb.BinderException as e: - error_str = str(e) - if "Could not choose a best candidate function" in error_str: - logger.warning(f"DuckDB type ambiguity error: {error_str}") - result = { - 'status': 'sql_error', - 'code': code_str, - 'content': f"SQL type casting required. DuckDB needs explicit type casting for date/time functions. Error: {error_str}. Please cast date columns to specific types (DATE, TIMESTAMP, etc.) before using date functions." - } - else: - logger.warning(f"DuckDB binder error: {error_str}") - result = { - 'status': 'sql_error', - 'code': code_str, - 'content': f"SQL error: {error_str}" - } - except Exception as e: - logger.warning('other error:') - error_message = traceback.format_exc() - logger.warning(error_message) - result = {'status': 'other error', 'code': code_str, 'content': f"Unexpected error: {error_message}"} - else: - result = {'status': 'error', 'code': "", 'content': "No code block found in the response. 
The model is unable to generate code to complete the task."} - - result['dialog'] = [*messages, {"role": choice.message.role, "content": choice.message.content}] - result['agent'] = 'SQLDataRecAgent' - result['refined_goal'] = refined_goal - candidates.append(result) - - logger.info("=== Recommendation Candidates ===>") - for candidate in candidates: - for key, value in candidate.items(): - if key in ['dialog', 'content']: - logger.info(f"##{key}:\n{str(value)[:1000]}...") - else: - logger.info(f"## {key}:\n{value}") - - return candidates - - - def run(self, input_tables, description, n=1, prev_messages: list[dict] = []): - data_summary = generate_sql_data_summary(self.conn, input_tables) - - user_query = f"[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{description}" - if len(prev_messages) > 0: - user_query = f"The user wants a new recommendation based off the following updated context and goal:\n\n[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{description}" - - logger.info(user_query) - - # Filter out system messages from prev_messages - filtered_prev_messages = [msg for msg in prev_messages if msg.get("role") != "system"] - - messages = [{"role":"system", "content": self.system_prompt}, - *filtered_prev_messages, - {"role":"user","content": user_query}] - - response = self.client.get_completion(messages = messages) - - return self.process_gpt_response(input_tables, messages, response) - - - def followup(self, input_tables, dialog, latest_data_sample, new_instruction: str, n=1): - """extend the input data (in json records format) to include new fields - latest_data_sample: the latest data sample that the user is working on, it's a json object that contains the data sample of the current table - new_instruction: the new instruction that the user wants to add to the latest data sample - """ - - logger.info(f"GOAL: \n\n{new_instruction}") - - # get the current table name - sample_data_str = pd.DataFrame(latest_data_sample).head(10).to_string() + '\n......' - - messages = [*dialog, - {"role":"user", - "content": f"This is the result from the latest sql query:\n\n{sample_data_str}\n\nUpdate the sql query above based on the following instruction:\n\n{new_instruction}"}] - - response = self.client.get_completion(messages = messages) - - return self.process_gpt_response(input_tables, messages, response) \ No newline at end of file diff --git a/py-src/data_formulator/agents/agent_sql_data_transform.py b/py-src/data_formulator/agents/agent_sql_data_transform.py deleted file mode 100644 index 89f158ee..00000000 --- a/py-src/data_formulator/agents/agent_sql_data_transform.py +++ /dev/null @@ -1,516 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import json -import random -import string - -from data_formulator.agents.agent_utils import extract_json_objects, extract_code_from_gpt_response -import pandas as pd - -import logging -import re -# Replace/update the logger configuration -logger = logging.getLogger(__name__) - -SYSTEM_PROMPT = '''You are a data scientist to help user to transform data that will be used for visualization. -The user will provide you information about what data would be needed, and your job is to create a sql query based on the input data summary, transformation instruction and expected fields. -The users' instruction includes "chart_type" and "chart_encodings" that describe the visualization they want, and natural language instructions "goal" that describe what data is needed. 
- -**Important:** -- NEVER make assumptions or judgments about a person's gender, biological sex, sexuality, religion, race, nationality, ethnicity, political stance, socioeconomic status, mental health, invisible disabilities, medical conditions, personality type, social impressions, emotional state, and cognitive state. -- NEVER create formulas that could be used to discriminate based on age. Ageism of any form (explicit and implicit) is strictly prohibited. -- If above issue occurs, generate columns with NULL. - -Concretely, you should first refine users' goal and then create a sql query in the output section based off the [CONTEXT] and [GOAL]: - - 1. First, refine users' [GOAL]. The main objective in this step is to check if "chart_type" and "chart_encodings" provided by the user are sufficient to achieve their "goal". Concretely: - - based on the user's "goal" and "chart_type" and "chart_encodings", elaborate the goal into a "detailed_instruction". - - determine "input_tables", the names of a subset of input tables from [CONTEXT] section that will be used to achieve the user's goal. - - **IMPORTANT** Note that the Table 1 in [CONTEXT] section is the table the user is currently viewing, it should take precedence if the user refers to insights about the "current table". - - At the same time, leverage table information to determine which tables are relevant to the user's goal and should be used. - - "display_instruction" is a short verb phrase describing the users' goal. - - it would be a short verbal description of user intent as a verb phrase (<12 words). - - generate it based on detailed_instruction and the suggested chart_type and chart_encodings, but don't need to mention the chart details. - - should capture key computation ideas: by reading the display, the user can understand the purpose and what's derived from the data. - - if the user specification follows up the previous instruction, the 'display_instruction' should only describe how it builds up the previous instruction without repeating information from previous steps. - - the phrase can be presented in different styles, e.g., question (what's xxx), instruction (show xxx), description, etc. - - if you mention column names from the input or the output data, highlight the text in **bold**. - * the column can either be a column in the input data, or a new column that will be computed in the output data. - * the mention don't have to be exact match, it can be semantically matching, e.g., if you mentioned "average score" in the text while the column to be computed is "Avg_Score", you should still highlight "**average score**" in the text. - - determine "output_fields", the desired fields that the output data should have to achieve the user's goal, it's a good idea to include intermediate fields here. - - then decide "chart_encodings", which maps visualization channels (x, y, color, size, opacity, facet, etc.) to a subset of "output_fields" that will be visualized, - - the "chart_encodings" should be created to support the user's "chart_type". - - first, determine whether the user has provided sufficient fields in "chart_encodings" that are needed to achieve their goal: - - if the user's "chart_encodings" are sufficient, simply copy it. 
- - if the user didn't provide sufficient fields in "chart_encodings", add missing fields in "chart_encodings" (ordered them based on whether the field will be used in x,y axes or legends); - - "chart_encodings" should only include fields that will be visualized (do not include other intermediate fields from "output_fields") - - when adding new fields to "chart_encodings", be efficient and add only a minimal number of fields that are needed to achive the user's goal. - - generally, the total number of fields in "chart_encodings" should be no more than 3 for x,y,legend. - - if the user's "chart_encodings" is sufficient but can be optimized, you can reorder encodings to visualize the data more effectively. - - sometimes, user may provide instruction to update visualizations fields they provided. You should leverage the user's goal to resolve the conflict and decide the final "chart_encodings" - - e.g., they may mention "use B metric instead" while A metric is in provided fields, in this case, you should update "chart_encodings" to update A metric with B metric. - - Prepare the result in the following json format: - -``` -{ - "input_tables": ["student_exam"], - "detailed_instruction": "..." // string, elaborate user instruction with details if the user - "display_instruction": "..." // string, the short verb phrase describing the users' goal. - "output_fields": [...] // string[], describe the desired output fields that the output data should have based on the user's goal, it's a good idea to preserve intermediate fields here (i.e., the goal of transformed data) - "chart_encodings": { - "x": "", - "y": "", - "color": "", - "size": "", - "opacity": "", - "facet": "", - ... // other visualization channels user used - } // object: map visualization channels (x, y, color, size, opacity, facet, etc.) to a subset of "output_fields" that will be visualized. - "reason": "..." // string, explain why this refinement is made -} -``` - - 2. Then, write a sql query based on the refined goal, the query input are table (or multiple tables presented in the [CONTEXT] section) and the output is the desired table. The output table should contain all "output_fields" from the refined goal. -The query should be as simple as possible and easily readable. If there is no data transformation needed based on "output_fields", the transformation function can simply "SELECT * FROM table". -note: - - the sql query should be written in the style of duckdb. - - 3. The output must only contain two items: - - a json object (wrapped in ```json```) representing the refined goal (including "detailed_instruction", "output_fields", "chart_encodings" and "reason") - - a sql query block (wrapped in ```sql```) representing the transformation code, do not add any extra text explanation. - -some notes: -- in DuckDB, you escape a single quote within a string by doubling it ('') rather than using a backslash (\'). -- in DuckDB, you need to use proper date functions to perform date operations. 
-- Critical: When using date/time functions in DuckDB, always cast date columns to explicit types to avoid function overload ambiguity: - * Use `CAST(date_column AS DATE)` for date operations - * Use `CAST(datetime_column AS TIMESTAMP)` for timestamp operations - * Use `CAST(datetime_column AS TIMESTAMP_NS)` for nanosecond precision timestamps - * Common patterns: - - Extract year: `CAST(strftime('%Y', CAST(date_column AS DATE)) AS INTEGER) AS year` - - Extract month: `CAST(strftime('%m', CAST(date_column AS DATE)) AS INTEGER) AS month` - - Format date: `strftime('%Y-%m-%d', CAST(date_column AS DATE)) AS formatted_date` - - Date arithmetic: `CAST(date_column AS DATE) + INTERVAL 1 DAY` - * This prevents "Could not choose a best candidate function" errors in DuckDB -- Critical: DuckDB regex limitations: - * Does NOT support Unicode escape sequences like \\u0400-\\u04FF - * For Unicode character detection, use character ranges directly: [а-яА-Я] for Cyrillic, [一-龥] for Chinese, etc. - * Alternative: Use ASCII ranges or specific character sets that DuckDB supports - * Example: Instead of quote ~ '[\\u0400-\\u04FF]', use quote ~ '[а-яА-ЯёЁ]' -''' - -EXAMPLE=''' -[CONTEXT] - -Here are 1 dataset with their summaries: - -## Table 1: weather_seattle_atlanta (548 rows × 3 columns) - -### Schema (3 fields) - - Date -- type: VARCHAR, values: 1/1/2020, 1/10/2020, 1/11/2020, ..., 9/7/2020, 9/8/2020, 9/9/2020 - - City -- type: VARCHAR, values: Atlanta, Seattle - - Temperature -- type: INTEGER, range: [30, 86] - -### Sample Data (first 5 rows) -``` - Date City Temperature -0 1/1/2020 Seattle 51 -1 1/1/2020 Atlanta 45 -2 1/2/2020 Seattle 45 -3 1/2/2020 Atlanta 47 -4 1/3/2020 Seattle 48 -``` - -[GOAL] - -{ - "instruction": "create a scatter plot to with seattle and atlanta temperatures on x,y axes, color points by which city is warmer", - "chart_type": "scatter", - "chart_encodings": {"x": "Seattle Temperature", "y": "Atlanta Temperature", "color": "Warmer City"} -} - -[OUTPUT] - -{ - "input_tables": ["weather_seattle_atlanta"], - "detailed_instruction": "Create a scatter plot to compare Seattle and Atlanta temperatures with Seattle temperatures on the x-axis and Atlanta temperatures on the y-axis. Color the points by which city is warmer.", - "display_instruction": "Create a scatter plot to compare Seattle and Atlanta temperatures", - "output_fields": ["Date", "Seattle Temperature", "Atlanta Temperature", "Warmer City"], - "chart_encodings": {"x": "Seattle Temperature", "y": "Atlanta Temperature", "color": "Warmer City"}, - "reason": "To compare Seattle and Atlanta temperatures with Seattle temperatures on the x-axis and Atlanta temperatures on the y-axis, and color points by which city is warmer, separate temperature fields for Seattle and Atlanta are required. Additionally, a new field 'Warmer City' is needed to indicate which city is warmer." 
-} - -```sql -WITH pivoted AS ( - SELECT - Date, - MAX(CASE WHEN City = 'Seattle' THEN Temperature END) AS "Seattle Temperature", - MAX(CASE WHEN City = 'Atlanta' THEN Temperature END) AS "Atlanta Temperature" - FROM weather_seattle_atlanta - GROUP BY Date -) -SELECT - Date, - "Seattle Temperature", - "Atlanta Temperature", - CASE WHEN "Seattle Temperature" > "Atlanta Temperature" THEN 'Seattle' ELSE 'Atlanta' END AS "Warmer City" -FROM pivoted; -``` -''' - -def sanitize_table_name(table_name: str) -> str: - """Sanitize table name to be used in SQL queries""" - # Replace spaces with underscores - sanitized_name = table_name.replace(" ", "_") - sanitized_name = sanitized_name.replace("-", "_") - # Allow alphanumeric, underscore, dot, dash, and dollar sign - sanitized_name = re.sub(r'[^a-zA-Z0-9_\.$]', '', sanitized_name) - return sanitized_name - -class SQLDataTransformationAgent(object): - - def __init__(self, client, conn, system_prompt=None, agent_coding_rules=""): - self.client = client - self.conn = conn # duckdb connection - - # Incorporate agent coding rules into system prompt if provided - if system_prompt is not None: - self.system_prompt = system_prompt - else: - base_prompt = SYSTEM_PROMPT - if agent_coding_rules and agent_coding_rules.strip(): - self.system_prompt = base_prompt + "\n\n[AGENT CODING RULES]\nPlease follow these rules when generating code. Note: if the user instruction conflicts with these rules, you should priortize user instructions.\n\n" + agent_coding_rules.strip() - else: - self.system_prompt = base_prompt - - - def process_gpt_sql_response(self, response, messages): - """process gpt response to handle execution""" - - #log = {'messages': messages, 'response': response.model_dump(mode='json')} - #logger.info("=== prompt_filter_results ===>") - #logger.info(response.prompt_filter_results) - - if isinstance(response, Exception): - result = {'status': 'other error', 'content': str(response.body)} - return [result] - - candidates = [] - for choice in response.choices: - logger.info("=== SQL query result ===>") - logger.info(choice.message.content + "\n") - - json_blocks = extract_json_objects(choice.message.content + "\n") - if len(json_blocks) > 0: - refined_goal = json_blocks[0] - else: - refined_goal = {'chart_encodings': {}, 'instruction': '', 'reason': ''} - - query_blocks = extract_code_from_gpt_response(choice.message.content + "\n", "sql") - - if len(query_blocks) > 0: - query_str = query_blocks[-1] - - try: - # Generate unique table name directly with timestamp and random suffix - random_suffix = ''.join(random.choices(string.ascii_lowercase, k=4)) - table_name = f"view_{random_suffix}" - - create_query = f"CREATE VIEW IF NOT EXISTS {table_name} AS {query_str}" - self.conn.execute(create_query) - self.conn.commit() - - # Check how many rows are in the table - row_count = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0] - - # Only limit to 5000 if there are more rows - if row_count > 5000: - query_output = self.conn.execute(f"SELECT * FROM {table_name} LIMIT 5000").fetch_df() - else: - query_output = self.conn.execute(f"SELECT * FROM {table_name}").fetch_df() - - result = { - "status": "ok", - "code": query_str, - "content": { - 'rows': json.loads(query_output.to_json(orient='records')), - 'virtual': { - 'table_name': table_name, - 'row_count': row_count - } - }, - } - - except Exception as e: - logger.warning('Error occurred during code execution:') - error_message = f"An error occurred during code execution. 
Error type: {type(e).__name__}" - logger.warning(error_message) - result = {'status': 'error', 'code': query_str, 'content': error_message} - - else: - result = {'status': 'error', 'code': "", 'content': "No code block found in the response. The model is unable to generate code to complete the task."} - - result['dialog'] = [*messages, {"role": choice.message.role, "content": choice.message.content}] - result['agent'] = 'SQLDataTransformationAgent' - result['refined_goal'] = refined_goal - candidates.append(result) - - logger.info("=== Transform Candidates ===>") - for candidate in candidates: - for key, value in candidate.items(): - if key in ['dialog', 'content']: - logger.info(f"##{key}:\n{str(value)[:1000]}...") - else: - logger.info(f"## {key}:\n{value}") - - return candidates - - - def run(self, input_tables, description, chart_type: str, chart_encodings: dict, prev_messages: list[dict] = [], n=1): - """Args: - input_tables: list[dict], each dict contains 'name' and 'rows' - description: str, the description of the data transformation - chart_type: str, the chart type for visualization - chart_encodings: dict, the chart encodings mapping visualization channels to fields - prev_messages: list[dict], the previous messages - n: int, the number of candidates - """ - - for table in input_tables: - table_name = sanitize_table_name(table['name']) - - # Check if table exists in the connection - try: - self.conn.execute(f"DESCRIBE {table_name}") - except Exception: - # Table doesn't exist, create it from the dataframe - df = pd.DataFrame(table['rows']) - - # Register the dataframe as a temporary view - self.conn.register(f'df_temp', df) - # Create a permanent table from the temporary view - self.conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df_temp") - # Drop the temporary view - self.conn.execute(f"DROP VIEW df_temp") - - r = self.conn.execute(f"SELECT * FROM {table_name} LIMIT 10").fetch_df() - print(r) - # Log the creation of the table - logger.info(f"Created table {table_name} from dataframe") - - - data_summary = generate_sql_data_summary(self.conn, input_tables) - - goal = { - "instruction": description, - "chart_type": chart_type, - "chart_encodings": chart_encodings, - } - - user_query = f"[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{json.dumps(goal, indent=4)}" - if len(prev_messages) > 0: - user_query = f"The user wants a new transformation based off the following updated context and goal:\n\n[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{description}" - - logger.info(user_query) - - # Filter out system messages from prev_messages - filtered_prev_messages = [msg for msg in prev_messages if msg.get("role") != "system"] - - messages = [{"role":"system", "content": self.system_prompt}, - *filtered_prev_messages, - {"role":"user","content": user_query}] - - response = self.client.get_completion(messages = messages) - - return self.process_gpt_sql_response(response, messages) - - - def followup(self, input_tables, dialog, latest_data_sample, chart_type: str, chart_encodings: dict, new_instruction: str, n=1): - """ - extend the input data (in json records format) to include new fields - latest_data_sample: the latest data sample that the user is working on, it's a json object that contains the data sample of the current table - chart_type: the chart type that the user wants to use - chart_encodings: the chart encodings that the user wants to use - new_instruction: the new instruction that the user wants to add to the latest data sample - """ - - goal = { - "followup_instruction": 
new_instruction, - "chart_type": chart_type, - "chart_encodings": chart_encodings - } - - logger.info(f"GOAL: \n\n{goal}") - - #logger.info(dialog) - - updated_dialog = [{"role":"system", "content": self.system_prompt}, *dialog[1:]] - - # get the current table name - sample_data_str = pd.DataFrame(latest_data_sample).head(10).to_string() + '\n......' - - messages = [*updated_dialog, {"role":"user", - "content": f"This is the result from the latest sql query:\n\n{sample_data_str}\n\nUpdate the sql query above based on the following instruction:\n\n{json.dumps(goal, indent=4)}"}] - - response = self.client.get_completion(messages = messages) - - return self.process_gpt_sql_response(response, messages) - - -def generate_sql_data_summary(conn, input_tables: list[dict], - row_sample_size: int = 5, - field_sample_size: int = 7, - max_val_chars: int = 140, - table_name_prefix: str = "Table" - ) -> str: - """ - Generate a natural, well-organized summary of SQL input tables. - This is the SQL equivalent of generate_data_summary for pandas DataFrames. - - Organization approach: - - Each table is clearly separated with a header - - Information flows logically: Overview → Schema → Examples - - Consistent section ordering for better readability - - Args: - conn: DuckDB connection - input_tables: list of dicts, each containing 'name' key for the table name - row_sample_size: number of rows to sample in the data preview - field_sample_size: number of example values for each field - max_val_chars: max characters to show for each value - table_name_prefix: prefix for table headers (default "Table") - - Returns: - A formatted string summary of all tables - """ - table_summaries = [] - - for idx, table in enumerate(input_tables): - table_name = sanitize_table_name(table['name']) - description = table.get("attached_metadata", "") - table_summary_str = get_sql_table_statistics_str( - conn, table_name, - row_sample_size=row_sample_size, - field_sample_size=field_sample_size, - max_val_chars=max_val_chars, - table_name_prefix=table_name_prefix, - table_idx=idx, - description=description - ) - table_summaries.append(table_summary_str) - - # Add visual separator between tables (except for the last one) - separator = "\n" + "─" * 60 + "\n\n" - joined_summaries = separator.join(table_summaries) - - return joined_summaries - - -def get_sql_table_statistics_str(conn, table_name: str, - row_sample_size: int = 5, # number of rows to be sampled in the sample data part - field_sample_size: int = 7, # number of example values for each field to be sampled - max_val_chars: int = 140, # max number of characters to be shown for each example value - table_name_prefix: str = "Table", - table_idx: int = 0, - description: str = "" - ) -> str: - """ - Get a string representation of the table statistics in markdown format. - - Organization: - - Header with table name and dimensions - - Description (if available) - - Schema section with field summaries - - Sample data section with code block - """ - - table_name = sanitize_table_name(table_name) - - # Get column information and row count - columns = conn.execute(f"DESCRIBE {table_name}").fetchall() - row_count = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0] - num_cols = len(columns) - - # Build sections in logical order: Overview → Description → Schema → Examples - sections = [] - - # 1. 
Table Header with basic stats - header = f"## {table_name_prefix} {table_idx + 1}: {table_name}" - if row_count > 0: - header += f" ({row_count:,} rows × {num_cols} columns)" - sections.append(header) - sections.append("") # Empty line for spacing - - # 2. Description (if available) - provides context first - if description: - sections.append(f"### Description\n{description}\n") - - # 3. Schema/Fields - core structure information - field_summaries = [] - for col in columns: - col_name = col[0] - col_type = col[1] - - # Properly quote column names to avoid SQL keywords issues - quoted_col_name = f'"{col_name}"' - - # Get sample values for the field - if col_type in ['INTEGER', 'BIGINT', 'DOUBLE', 'DECIMAL', 'FLOAT', 'REAL']: - # For numeric types, get min/max as value range indicator - range_query = f""" - SELECT MIN({quoted_col_name}), MAX({quoted_col_name}) - FROM {table_name} - WHERE {quoted_col_name} IS NOT NULL - """ - range_result = conn.execute(range_query).fetchone() - if range_result[0] is not None: - min_val, max_val = range_result - val_str = f"range: [{min_val}, {max_val}]" - else: - val_str = "all null" - else: - # For non-numeric types, get sample values similar to Python version - query_for_sample_values = f""" - SELECT DISTINCT {quoted_col_name} - FROM {table_name} - WHERE {quoted_col_name} IS NOT NULL - ORDER BY {quoted_col_name} - LIMIT {field_sample_size * 2} - """ - - try: - sample_values_result = conn.execute(query_for_sample_values).fetchall() - sample_values = [row[0] for row in sample_values_result] - - # Format values similar to Python version - def sample_val_cap(val): - s = str(val) - if len(s) > max_val_chars: - s = s[:max_val_chars] + "..." - if ',' in s: - s = f'"{s}"' - return s - - if len(sample_values) <= field_sample_size: - val_sample = sample_values - else: - half = field_sample_size // 2 - val_sample = sample_values[:half] + ["..."] + sample_values[-(field_sample_size - half):] - - val_str = "values: " + ', '.join([sample_val_cap(v) for v in val_sample]) - except Exception: - val_str = "values: N/A" - - field_summaries.append(f" - {col_name} -- type: {col_type}, {val_str}") - - fields_summary = '\n'.join(field_summaries) - sections.append(f"### Schema ({num_cols} fields)\n{fields_summary}\n") - - # 4. Sample data - concrete examples last - if row_count > 0: - sample_data = conn.execute(f"SELECT * FROM {table_name} LIMIT {row_sample_size}").fetch_df() - sections.append(f"### Sample Data (first {min(row_sample_size, row_count)} rows)\n```\n{sample_data.to_string()}\n```\n") - - return '\n'.join(sections) diff --git a/py-src/data_formulator/agents/agent_utils.py b/py-src/data_formulator/agents/agent_utils.py index 1d53d03f..1669cf76 100644 --- a/py-src/data_formulator/agents/agent_utils.py +++ b/py-src/data_formulator/agents/agent_utils.py @@ -3,9 +3,7 @@ import json import keyword -import pandas as pd import numpy as np - import re def string_to_py_varname(var_str): @@ -187,58 +185,88 @@ def sample_val_cap(val): return f"{field_name} -- type: {df[field_name].dtype}, values: {val_str}" -def generate_data_summary(input_tables, include_data_samples=True, field_sample_size=7, max_val_chars=140, table_name_prefix="Table"): +def generate_data_summary( + input_tables, + workspace, + include_data_samples=True, + field_sample_size=7, + row_sample_size=5, + max_val_chars=140, + table_name_prefix="Table" +): """ - Generate a natural, well-organized summary of input tables. 
- + Generate a natural, well-organized summary of input tables by reading workspace parquet files. + + All tables (including temp tables) should be in the workspace before calling this function. + Use WorkspaceWithTempData context manager to mount temp tables to workspace. + Organization approach: - Each table is clearly separated with a header - Information flows logically: Overview → Schema → Examples - Consistent section ordering for better readability + - Shows filename for workspace tables + + Args: + input_tables: list of dicts with 'name' key + workspace: Workspace instance with all tables mounted (including temp data) + include_data_samples: whether to include sample data + field_sample_size: number of example values per field + row_sample_size: number of sample rows to show + max_val_chars: max characters per value + table_name_prefix: prefix for table headers + + Returns: + Formatted string summary of all tables """ - - def assemble_table_summary(input_table, idx): - name = string_to_py_varname(input_table["name"]) - rows = input_table["rows"] - description = input_table.get("attached_metadata", "") - - df = pd.DataFrame(rows) + def assemble_table_summary(table, idx): + table_name = table['name'] + description = table.get("attached_metadata", "") + + # Read data into DataFrame (handles parquet, csv, excel, json, etc.) + df = workspace.read_data_as_df(table_name) + + # Get filename for display (LLM uses this to generate read_parquet/read_csv calls) + data_file_path = workspace.get_relative_data_file_path(table_name) + num_rows = len(df) num_cols = len(df.columns) - - # Build sections in logical order: Overview → Schema → Examples + + # Build sections in logical order: Overview → Description → Schema → Examples sections = [] - + # 1. Table Header with basic stats - header = f"## {table_name_prefix} {idx + 1}: {name}" + header = f"## {table_name_prefix} {idx + 1}: {table_name} (file: {data_file_path})" if num_rows > 0: header += f" ({num_rows:,} rows × {num_cols} columns)" sections.append(header) sections.append("") # Empty line for spacing - + # 2. Description (if available) - provides context first if description: sections.append(f"### Description\n{description}\n") - + # 3. Schema/Fields - core structure information - fields_summary = '\n'.join([' - ' + get_field_summary(fname, df, field_sample_size, max_val_chars) - for fname in list(df.columns.values)]) + fields_summary = '\n'.join([ + ' - ' + get_field_summary(fname, df, field_sample_size, max_val_chars) + for fname in df.columns + ]) sections.append(f"### Schema ({num_cols} fields)\n{fields_summary}\n") - + # 4. 
Sample data (if requested) - concrete examples last if include_data_samples and num_rows > 0: - sample_df = pd.DataFrame(rows[:5]) - sections.append(f"### Sample Data (first 5 rows)\n```\n{sample_df.to_string()}\n```\n") - + sample_df = df.head(row_sample_size) + sections.append( + f"### Sample Data (first {min(row_sample_size, num_rows)} rows)\n" + f"```\n{sample_df.to_string()}\n```\n" + ) + return '\n'.join(sections) - # Join tables with clear separators - table_summaries = [assemble_table_summary(input_table, i) for i, input_table in enumerate(input_tables)] - - # Add visual separator between tables (except for the last one) + # Build summaries for all tables + table_summaries = [assemble_table_summary(table, i) for i, table in enumerate(input_tables)] + + # Join with visual separators separator = "\n" + "─" * 60 + "\n\n" - joined_summaries = separator.join(table_summaries) - - full_summary = joined_summaries - return full_summary + return separator.join(table_summaries) + diff --git a/py-src/data_formulator/agents/agent_utils_sql.py b/py-src/data_formulator/agents/agent_utils_sql.py new file mode 100644 index 00000000..d9946951 --- /dev/null +++ b/py-src/data_formulator/agents/agent_utils_sql.py @@ -0,0 +1,43 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +SQL-related utility functions for agents. +These functions are used across multiple agents for DuckDB operations and SQL data summaries. +""" + +import re + + +def sanitize_table_name(table_name: str) -> str: + """Sanitize table name to be used in SQL queries""" + # Replace spaces with underscores + sanitized_name = table_name.replace(" ", "_") + sanitized_name = sanitized_name.replace("-", "_") + # Allow alphanumeric, underscore, dot, dash, and dollar sign + sanitized_name = re.sub(r'[^a-zA-Z0-9_\.$]', '', sanitized_name) + return sanitized_name + + +def create_duckdb_conn_with_parquet_views(workspace, input_tables: list[dict]): + """ + Create an in-memory DuckDB connection with a view for each parquet table in the workspace. + Input tables are expected to be parquet-backed tables in the datalake (parquet-to-parquet). + + Args: + workspace: Workspace instance + input_tables: list of dicts with 'name' key for the table name + + Returns: + DuckDB connection with views created for all input tables + """ + import duckdb + + conn = duckdb.connect(":memory:") + for table in input_tables: + name = table["name"] + view_name = sanitize_table_name(name) + path = workspace.get_parquet_path(name) + path_escaped = str(path).replace("\\", "\\\\").replace("'", "''") + conn.execute(f'CREATE VIEW "{view_name}" AS SELECT * FROM read_parquet(\'{path_escaped}\')') + return conn diff --git a/py-src/data_formulator/agents/client_utils.py b/py-src/data_formulator/agents/client_utils.py index 4ccce9d0..43cf0ee3 100644 --- a/py-src/data_formulator/agents/client_utils.py +++ b/py-src/data_formulator/agents/client_utils.py @@ -1,27 +1,7 @@ import litellm import openai from azure.identity import DefaultAzureCredential, get_bearer_token_provider -from typing import Dict, Optional, Union -class OpenAIClientAdapter(object): - """ - Wrapper around OpenAI or AzureOpenAI client that provides the same interface as Client. - """ - def __init__(self, openai_client: Union[openai.OpenAI, openai.AzureOpenAI], model: str): - self._openai_client = openai_client - self.model = model - self.params = {} - - def get_completion(self, messages): - """ - Returns a completion using the wrapped OpenAI client. 
- """ - completion_params = { - "model": self.model, - "messages": messages, - } - - return self._openai_client.chat.completions.create(**completion_params) class Client(object): """ @@ -69,7 +49,7 @@ def __init__(self, endpoint, model, api_key=None, api_base=None, api_version=No self.model = f"ollama/{model}" @classmethod - def from_config(cls, model_config: Dict[str, str]): + def from_config(cls, model_config: dict[str, str]): """ Create a client instance from model configuration. @@ -132,7 +112,7 @@ def get_completion(self, messages, stream=False): ) - def get_response(self, messages: list[dict], tools: Optional[list] = None): + def get_response(self, messages: list[dict], tools: list | None = None): """ Returns a response using OpenAI's Response API approach. """ diff --git a/py-src/data_formulator/agents/semantic_types.py b/py-src/data_formulator/agents/semantic_types.py new file mode 100644 index 00000000..13d9ee7b --- /dev/null +++ b/py-src/data_formulator/agents/semantic_types.py @@ -0,0 +1,309 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +============================================================================= +SEMANTIC TYPE SYSTEM +============================================================================= + +Semantic types classify data fields for intelligent chart recommendations. +Uses strings for flexibility and easy JSON serialization. + +DESIGN GOALS: +1. Comprehensive: Cover common data types seen in real-world datasets +2. Visualization-aware: Map to Vega-Lite encoding types (Q, O, N, T) +3. Hierarchical: Support generalization via lattice structure +4. Simple: Use strings with helper functions, no complex enums + +============================================================================= +""" + +from typing import Dict, List, Optional, Set + +# --------------------------------------------------------------------------- +# All Semantic Types (as string constants) +# --------------------------------------------------------------------------- + +# TEMPORAL TYPES - Time-related concepts +DATETIME = "DateTime" # Full date and time: "2024-01-15T14:30:00" +DATE = "Date" # Date only: "2024-01-15" +TIME = "Time" # Time only: "14:30:00" + +YEAR = "Year" # "2024" (as a time unit, not a measure) +QUARTER = "Quarter" # "Q1", "Q2", "2024-Q1" +MONTH = "Month" # "January", "Jan", 1-12 +WEEK = "Week" # "Week 1", 1-52 +DAY = "Day" # "Monday", "Mon", 1-31 +HOUR = "Hour" # 0-23 + +YEAR_MONTH = "YearMonth" # "2024-01", "Jan 2024" +YEAR_QUARTER = "YearQuarter"# "2024-Q1" +YEAR_WEEK = "YearWeek" # "2024-W01" +DECADE = "Decade" # "1990s", "2000s" + +DURATION = "Duration" # Time span: "2 hours", "3 days", milliseconds +TIME_RANGE = "TimeRange" # Time interval: "9am-5pm", "2020-2024" + +# NUMERIC MEASURE TYPES - Continuous values for aggregation +QUANTITY = "Quantity" # Generic continuous measure +COUNT = "Count" # Discrete count of items +AMOUNT = "Amount" # Monetary or general amounts +PRICE = "Price" # Unit price +REVENUE = "Revenue" # Total revenue/sales +COST = "Cost" # Expenses/costs +PERCENTAGE = "Percentage" # 0-100% or 0-1 ratio +RATE = "Rate" # Rate of change, interest rate +RATIO = "Ratio" # Proportion between values +DISTANCE = "Distance" # Length, height, width +AREA = "Area" # Square units +VOLUME = "Volume" # Cubic units +WEIGHT = "Weight" # Mass +TEMPERATURE = "Temperature" # Degrees +SPEED = "Speed" # Velocity + +# NUMERIC DISCRETE TYPES - Numbers with ordinal/identifier meaning +RANK = "Rank" # Position in ordered list: 1st, 2nd, 
3rd +INDEX = "Index" # Row number, sequence number +ID = "ID" # Unique identifier (not for aggregation!) +SCORE = "Score" # Rating score: 1-5, 1-10, 0-100 +RATING = "Rating" # Star rating, letter grade +LEVEL = "Level" # Discrete levels: 1, 2, 3 + +# GEOGRAPHIC TYPES - Location-based data +LATITUDE = "Latitude" # -90 to 90 +LONGITUDE = "Longitude" # -180 to 180 +COORDINATES = "Coordinates" # Lat/Long pair +COUNTRY = "Country" # Country name or code +STATE = "State" # State/Province +CITY = "City" # City name +REGION = "Region" # Geographic region +ADDRESS = "Address" # Street address +ZIP_CODE = "ZipCode" # Postal code +LOCATION = "Location" # Generic location (fallback) + +# CATEGORICAL ENTITY TYPES - Named entities +PERSON_NAME = "PersonName" # Full name, first/last name +USERNAME = "Username" # Account username +EMAIL = "Email" # Email address +COMPANY = "Company" # Company/Organization name +BRAND = "Brand" # Brand name +DEPARTMENT = "Department" # Organizational unit +PRODUCT = "Product" # Product name +SKU = "SKU" # Product identifier +CATEGORY = "Category" # Product/item category +NAME = "Name" # Generic named entity (fallback) + +# CATEGORICAL CODED TYPES - Discrete categories/statuses +STATUS = "Status" # State: "Active", "Pending", "Closed" +TYPE = "Type" # Type classification +BOOLEAN = "Boolean" # True/False, Yes/No +BINARY = "Binary" # Two-value categorical +CODE = "Code" # Coded value: "A", "B", "C" + +# BINNED/RANGE TYPES - Discretized continuous values +RANGE = "Range" # Numeric range: "10000-20000", "<50", "50+" +AGE_GROUP = "AgeGroup" # Age range: "18-24", "25-34" +BUCKET = "Bucket" # Generic binned value + +# FALLBACK TYPES +STRING = "String" # Generic string (categorical fallback) +NUMBER = "Number" # Generic number (measure fallback) +UNKNOWN = "Unknown" # Cannot determine type + + +# --------------------------------------------------------------------------- +# All Semantic Types List (for prompt generation) +# --------------------------------------------------------------------------- + +ALL_SEMANTIC_TYPES: List[str] = [ + # Temporal + DATETIME, DATE, TIME, + YEAR, QUARTER, MONTH, WEEK, DAY, HOUR, + YEAR_MONTH, YEAR_QUARTER, YEAR_WEEK, DECADE, + DURATION, TIME_RANGE, + # Numeric measures + QUANTITY, COUNT, AMOUNT, PRICE, REVENUE, COST, + PERCENTAGE, RATE, RATIO, + DISTANCE, AREA, VOLUME, WEIGHT, TEMPERATURE, SPEED, + # Numeric discrete + RANK, INDEX, ID, SCORE, RATING, LEVEL, + # Geographic + LATITUDE, LONGITUDE, COORDINATES, + COUNTRY, STATE, CITY, REGION, ADDRESS, ZIP_CODE, LOCATION, + # Entity names + PERSON_NAME, USERNAME, EMAIL, COMPANY, BRAND, DEPARTMENT, + PRODUCT, SKU, CATEGORY, NAME, + # Coded + STATUS, TYPE, BOOLEAN, BINARY, CODE, + # Ranges + RANGE, AGE_GROUP, BUCKET, + # Fallbacks + STRING, NUMBER, UNKNOWN, +] + + +# --------------------------------------------------------------------------- +# Type Sets for Classification +# --------------------------------------------------------------------------- + +TIMESERIES_X_TYPES: Set[str] = { + DATETIME, DATE, TIME, + YEAR_MONTH, YEAR_QUARTER, YEAR_WEEK, + YEAR, QUARTER, MONTH, WEEK, DAY, HOUR, DECADE, +} + +MEASURE_TYPES: Set[str] = { + QUANTITY, COUNT, AMOUNT, PRICE, REVENUE, COST, + PERCENTAGE, RATE, RATIO, + DISTANCE, AREA, VOLUME, WEIGHT, TEMPERATURE, SPEED, + DURATION, NUMBER, +} + +NON_MEASURE_NUMERIC_TYPES: Set[str] = { + RANK, INDEX, ID, SCORE, RATING, LEVEL, + YEAR, MONTH, DAY, HOUR, + LATITUDE, LONGITUDE, +} + +CATEGORICAL_TYPES: Set[str] = { + NAME, PERSON_NAME, USERNAME, EMAIL, + COMPANY, BRAND, 
DEPARTMENT, PRODUCT, CATEGORY, + STATUS, TYPE, BOOLEAN, BINARY, CODE, + LOCATION, COUNTRY, STATE, CITY, REGION, + RANGE, AGE_GROUP, BUCKET, + STRING, +} + +ORDINAL_TYPES: Set[str] = { + YEAR, QUARTER, MONTH, WEEK, DAY, HOUR, DECADE, + RANK, SCORE, RATING, LEVEL, + RANGE, AGE_GROUP, BUCKET, TIME_RANGE, +} + +GEO_TYPES: Set[str] = { + LATITUDE, LONGITUDE, COORDINATES, + LOCATION, COUNTRY, STATE, CITY, REGION, ADDRESS, ZIP_CODE, +} + + +# --------------------------------------------------------------------------- +# Grouped by Category (for prompt generation) +# --------------------------------------------------------------------------- + +SEMANTIC_TYPE_CATEGORIES: Dict[str, List[str]] = { + "Temporal (point-in-time)": [DATETIME, DATE, TIME], + "Temporal (granules)": [YEAR, QUARTER, MONTH, WEEK, DAY, HOUR], + "Temporal (combined)": [YEAR_MONTH, YEAR_QUARTER, YEAR_WEEK, DECADE], + "Temporal (duration)": [DURATION, TIME_RANGE], + "Numeric measures": [ + QUANTITY, COUNT, AMOUNT, PRICE, REVENUE, COST, + PERCENTAGE, RATE, RATIO, + DISTANCE, AREA, VOLUME, WEIGHT, TEMPERATURE, SPEED + ], + "Numeric discrete": [RANK, INDEX, ID, SCORE, RATING, LEVEL], + "Geographic coordinates": [LATITUDE, LONGITUDE, COORDINATES], + "Geographic locations": [COUNTRY, STATE, CITY, REGION, ADDRESS, ZIP_CODE, LOCATION], + "Entity names": [PERSON_NAME, USERNAME, EMAIL, COMPANY, BRAND, DEPARTMENT, PRODUCT, SKU, CATEGORY, NAME], + "Categorical codes": [STATUS, TYPE, BOOLEAN, BINARY, CODE], + "Binned ranges": [RANGE, AGE_GROUP, BUCKET], + "Fallback": [STRING, NUMBER, UNKNOWN], +} + + +# --------------------------------------------------------------------------- +# Helper Functions +# --------------------------------------------------------------------------- + +def is_measure_type(semantic_type: str) -> bool: + """Check if a semantic type is a true measure (suitable for quantitative encoding).""" + return semantic_type in MEASURE_TYPES + + +def is_timeseries_type(semantic_type: str) -> bool: + """Check if a semantic type is suitable for time-series X axis.""" + return semantic_type in TIMESERIES_X_TYPES + + +def is_categorical_type(semantic_type: str) -> bool: + """Check if a semantic type is categorical (suitable for color/grouping).""" + return semantic_type in CATEGORICAL_TYPES + + +def is_ordinal_type(semantic_type: str) -> bool: + """Check if a semantic type is ordinal (has inherent order).""" + return semantic_type in ORDINAL_TYPES + + +def is_geo_type(semantic_type: str) -> bool: + """Check if a semantic type is geographic.""" + return semantic_type in GEO_TYPES + + +def is_non_measure_numeric(semantic_type: str) -> bool: + """Check if a semantic type is numeric but should not be aggregated.""" + return semantic_type in NON_MEASURE_NUMERIC_TYPES + + +# --------------------------------------------------------------------------- +# Prompt Generation +# --------------------------------------------------------------------------- + +def generate_semantic_types_prompt() -> str: + """Generate the semantic types section for the LLM prompt.""" + + lines = ["Semantic types to consider (grouped by category):"] + lines.append("") + + for category, types in SEMANTIC_TYPE_CATEGORIES.items(): + lines.append(f" {category}:") + lines.append(f" {', '.join(types)}") + + lines.append("") + lines.append("Guidelines for choosing semantic types:") + lines.append("") + lines.append("1. 
TEMPORAL types:") + lines.append(" - Use DateTime/Date/Time for full timestamps or dates") + lines.append(" - Use YearMonth, YearQuarter for combined temporal like '2024-01' or '2024-Q1'") + lines.append(" - Use Year, Month, Day for discrete time units (even if stored as numbers)") + lines.append(" - Use Duration for time spans (e.g., '2 hours', milliseconds)") + lines.append(" - Use TimeRange for intervals (e.g., '9am-5pm', '2020-2024')") + lines.append("") + lines.append("2. NUMERIC MEASURE types (can be aggregated/averaged):") + lines.append(" - Use Quantity for generic continuous measures") + lines.append(" - Use specific types like Price, Revenue, Percentage when applicable") + lines.append(" - Use Count for discrete counts of items") + lines.append("") + lines.append("3. NUMERIC DISCRETE types (should NOT be aggregated):") + lines.append(" - Use Rank for positions (1st, 2nd, 3rd)") + lines.append(" - Use ID for unique identifiers") + lines.append(" - Use Score/Rating for evaluation scores (1-5, A-F)") + lines.append(" - IMPORTANT: A column named 'year' with values like 2020, 2021 is Year, not Number!") + lines.append("") + lines.append("4. GEOGRAPHIC types:") + lines.append(" - Use Latitude/Longitude for coordinates") + lines.append(" - Use Country, State, City for named locations") + lines.append(" - Use Location as fallback for any geographic entity") + lines.append("") + lines.append("5. CATEGORICAL types:") + lines.append(" - Use specific entity types (PersonName, Company, Product) when applicable") + lines.append(" - Use Category for classification fields") + lines.append(" - Use Status for state/status fields ('Active', 'Pending')") + lines.append(" - Use Boolean for true/false, yes/no fields") + lines.append("") + lines.append("6. RANGE types:") + lines.append(" - Use Range for binned numeric values ('10000-20000', '<50', '50+')") + lines.append(" - Use AgeGroup specifically for age ranges ('18-24', '25-34')") + lines.append("") + lines.append("7. FALLBACK types:") + lines.append(" - Use String for generic text when no specific type applies") + lines.append(" - Use Number for generic numeric when no specific measure type applies") + + return "\n".join(lines) + + +# For backward compatibility with existing code +LEGACY_SEMANTIC_TYPES = [ + "Location", "Decade", "Year", "Month", "YearMonth", "Day", + "Date", "Time", "DateTime", "TimeRange", "Range", "Duration", + "Name", "Percentage", "String", "Number" +] diff --git a/py-src/data_formulator/agents/web_utils.py b/py-src/data_formulator/agents/web_utils.py index 1fd3aaea..a04f6f48 100644 --- a/py-src/data_formulator/agents/web_utils.py +++ b/py-src/data_formulator/agents/web_utils.py @@ -3,7 +3,6 @@ import requests from bs4 import BeautifulSoup -from typing import Optional, Union import logging from urllib.parse import urlparse import tempfile @@ -111,7 +110,7 @@ def _validate_url_for_ssrf(url: str) -> str: return url -def download_html_content(url: str, timeout: int = 30, headers: Optional[dict] = None) -> str: +def download_html_content(url: str, timeout: int = 30, headers: dict | None = None) -> str: """ Download HTML content from a given URL with SSRF protection. @@ -254,7 +253,7 @@ def html_to_text(html_content: str, remove_scripts: bool = True, remove_styles: # Fallback: return the raw content if parsing fails return html_content -def get_html_title(html_content: str) -> Optional[str]: +def get_html_title(html_content: str) -> str | None: """ Extract the title from HTML content. 
@@ -276,7 +275,7 @@ def get_html_title(html_content: str) -> Optional[str]: return None -def get_html_meta_description(html_content: str) -> Optional[str]: +def get_html_meta_description(html_content: str) -> str | None: """ Extract the meta description from HTML content. diff --git a/py-src/data_formulator/app.py b/py-src/data_formulator/app.py index a767d277..4908b28e 100644 --- a/py-src/data_formulator/app.py +++ b/py-src/data_formulator/app.py @@ -2,22 +2,19 @@ # Licensed under the MIT License. import argparse -import random import sys import os import mimetypes -from functools import lru_cache mimetypes.add_type('application/javascript', '.js') mimetypes.add_type('application/javascript', '.mjs') import flask -from flask import Flask, request, send_from_directory, session +from flask import Flask, request, send_from_directory from flask import stream_with_context, Response import webbrowser import threading import numpy as np -import datetime import time import logging @@ -28,28 +25,14 @@ from dotenv import load_dotenv import secrets import base64 -APP_ROOT = Path(Path(__file__).parent).absolute() - -import os - -# blueprints -from data_formulator.tables_routes import tables_bp -from data_formulator.agent_routes import agent_bp -from data_formulator.demo_stream_routes import demo_stream_bp, limiter as demo_stream_limiter -from data_formulator.db_manager import db_manager -from data_formulator.example_datasets_config import EXAMPLE_DATASETS -import queue -from typing import Dict, Any +APP_ROOT = Path(Path(__file__).parent).absolute() +# Create Flask app (lightweight, no heavy imports yet) app = Flask(__name__, static_url_path='', static_folder=os.path.join(APP_ROOT, "dist")) -app.secret_key = secrets.token_hex(16) # Generate a random secret key for sessions +app.secret_key = secrets.token_hex(16) app.json.sort_keys = False -# Initialize rate limiter for demo stream routes that call external APIs -# The limiter is defined in demo_stream_routes.py to avoid circular imports -demo_stream_limiter.init_app(app) - class CustomJSONEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, np.int64): @@ -65,28 +48,21 @@ def default(self, obj): load_dotenv(os.path.join(APP_ROOT, 'api-keys.env')) load_dotenv(os.path.join(APP_ROOT, '.env')) -# Add this line to store args at app level +# Default config from env (can be overridden by CLI args) app.config['CLI_ARGS'] = { 'exec_python_in_subprocess': os.environ.get('EXEC_PYTHON_IN_SUBPROCESS', 'false').lower() == 'true', 'disable_display_keys': os.environ.get('DISABLE_DISPLAY_KEYS', 'false').lower() == 'true', 'disable_database': os.environ.get('DISABLE_DATABASE', 'false').lower() == 'true', 'disable_file_upload': os.environ.get('DISABLE_FILE_UPLOAD', 'false').lower() == 'true', - 'project_front_page': os.environ.get('PROJECT_FRONT_PAGE', 'false').lower() == 'true' + 'project_front_page': os.environ.get('PROJECT_FRONT_PAGE', 'false').lower() == 'true', + 'max_display_rows': int(os.environ.get('MAX_DISPLAY_ROWS', '5000')), } -# register blueprints -# Only register tables blueprint if database is not disabled -if not app.config['CLI_ARGS']['disable_database']: - app.register_blueprint(tables_bp) -app.register_blueprint(agent_bp) -app.register_blueprint(demo_stream_bp) - # Get logger for this module (logging config moved to run_app function) logger = logging.getLogger(__name__) def configure_logging(): """Configure logging for the Flask application.""" - # Configure root logger for general application logging logging.basicConfig( 
level=logging.ERROR, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', @@ -98,14 +74,38 @@ def configure_logging(): logging.getLogger('litellm').setLevel(logging.WARNING) logging.getLogger('openai').setLevel(logging.WARNING) - # Configure Flask app logger to use the same settings app.logger.handlers = [] for handler in logging.getLogger().handlers: app.logger.addHandler(handler) +def _register_blueprints(disable_database: bool): + """ + Import and register blueprints. This is where heavy imports happen. + Called from run_app() with progress feedback. + """ + # Import tables routes (imports database connectors) + print(" Loading data connectors...", flush=True) + from data_formulator.tables_routes import tables_bp + + # Import agent routes (imports AI/ML libraries: litellm, sklearn, etc.) + print(" Loading AI agents...", flush=True) + from data_formulator.agent_routes import agent_bp + + # Import demo stream routes + from data_formulator.demo_stream_routes import demo_stream_bp, limiter as demo_stream_limiter + demo_stream_limiter.init_app(app) + + # Register blueprints + if not disable_database: + app.register_blueprint(tables_bp) + app.register_blueprint(agent_bp) + app.register_blueprint(demo_stream_bp) + + @app.route('/api/example-datasets') def get_sample_datasets(): + from data_formulator.example_datasets_config import EXAMPLE_DATASETS return flask.jsonify(EXAMPLE_DATASETS) @@ -116,116 +116,22 @@ def index_alt(path): @app.errorhandler(404) def page_not_found(e): - # your processing here logger.info(app.static_folder) - return send_from_directory(app.static_folder, "index.html") #'Hello 404!' #send_from_directory(app.static_folder, "index.html") - -###### test functions ###### - -@app.route('/api/hello') -def hello(): - values = [ - {"a": "A", "b": 28}, {"a": "B", "b": 55}, {"a": "C", "b": 43}, - {"a": "D", "b": 91}, {"a": "E", "b": 81}, {"a": "F", "b": 53}, - {"a": "G", "b": 19}, {"a": "H", "b": 87}, {"a": "I", "b": 52} - ] - spec = { - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", - "description": "A simple bar chart with embedded data.", - "data": { "values": values }, - "mark": "bar", - "encoding": { - "x": {"field": "a", "type": "nominal", "axis": {"labelAngle": 0}}, - "y": {"field": "b", "type": "quantitative"} - } - } - return json.dumps(spec) - -@app.route('/api/hello-stream') -def streamed_response(): - def generate(): - values = [ - {"a": "A", "b": 28}, {"a": "B", "b": 55}, {"a": "C", "b": 43}, - {"a": "D", "b": 91}, {"a": "E", "b": 81}, {"a": "F", "b": 53}, - {"a": "G", "b": 19}, {"a": "H", "b": 87}, {"a": "I", "b": 52} - ] - spec = { - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", - "description": "A simple bar chart with embedded data.", - "data": { "values": [] }, - "mark": "bar", - "encoding": { - "x": {"field": "a", "type": "nominal", "axis": {"labelAngle": 0}}, - "y": {"field": "b", "type": "quantitative"} - } - } - for i in range(3): - time.sleep(3) - spec["data"]["values"] = values[i:] - yield json.dumps(spec) - return Response(stream_with_context(generate())) + return send_from_directory(app.static_folder, "index.html") -@app.route('/api/get-session-id', methods=['GET', 'POST']) -def get_session_id(): - """Endpoint to get or confirm a session ID from the client""" - # if it is a POST request, we expect a session_id in the body - # if it is a GET request, we do not expect a session_id in the query params - - current_session_id = None - if request.is_json: - content = request.get_json() - current_session_id = 
content.get("session_id", None) - - # Check if database is disabled - database_disabled = app.config['CLI_ARGS']['disable_database'] - - if database_disabled: - # When database is disabled, don't use Flask sessions (cookies) - # Just return the provided session_id or generate a new one - if current_session_id is None: - current_session_id = secrets.token_hex(16) - logger.info(f"Generated session ID for disabled database: {current_session_id}") - else: - logger.info(f"Using provided session ID for disabled database: {current_session_id}") - - return flask.jsonify({ - "status": "ok", - "session_id": current_session_id - }) - else: - # When database is enabled, use Flask sessions (cookies) as before - if current_session_id is None: - if 'session_id' not in session: - session['session_id'] = secrets.token_hex(16) - session.permanent = True - logger.info(f"Created new session: {session['session_id']}") - else: - # override the session_id - session['session_id'] = current_session_id - session.permanent = True - - return flask.jsonify({ - "status": "ok", - "session_id": session['session_id'] - }) @app.route('/api/app-config', methods=['GET']) def get_app_config(): """Provide frontend configuration settings from CLI arguments""" args = app.config['CLI_ARGS'] - # When database is disabled, don't try to access session - session_id = None - if not args['disable_database']: - session_id = session.get('session_id', None) - config = { "EXEC_PYTHON_IN_SUBPROCESS": args['exec_python_in_subprocess'], "DISABLE_DISPLAY_KEYS": args['disable_display_keys'], "DISABLE_DATABASE": args['disable_database'], "DISABLE_FILE_UPLOAD": args['disable_file_upload'], "PROJECT_FRONT_PAGE": args['project_front_page'], - "SESSION_ID": session_id + "MAX_DISPLAY_ROWS": args['max_display_rows'], } return flask.jsonify(config) @@ -238,7 +144,6 @@ def database_disabled_fallback(path): "message": "Database functionality is disabled. Use --disable-database=false to enable table operations." }), 503 else: - # If database is not disabled but we're hitting this route, it means the tables blueprint wasn't registered return flask.jsonify({ "status": "error", "message": "Table routes are not available" @@ -258,38 +163,41 @@ def parse_args() -> argparse.Namespace: help="Disable file upload functionality. 
This prevents the app from uploading files to the server.") parser.add_argument("--project-front-page", action='store_true', default=False, help="Project the front page as the main page instead of the app.") + parser.add_argument("--max-display-rows", type=int, + default=int(os.environ.get('MAX_DISPLAY_ROWS', '10000')), + help="Maximum number of rows to send to the frontend for display (default: 10000)") parser.add_argument("--dev", action='store_true', default=False, help="Launch the app in development mode (prevents the app from opening the browser automatically)") return parser.parse_args() def run_app(): - # Configure logging only when actually running the app - configure_logging() + print("Starting Data Formulator...", flush=True) + configure_logging() args = parse_args() - # Add this line to make args available to routes - # override the args from the env file + + # Override config from CLI args app.config['CLI_ARGS'] = { 'exec_python_in_subprocess': args.exec_python_in_subprocess, 'disable_display_keys': args.disable_display_keys, 'disable_database': args.disable_database, 'disable_file_upload': args.disable_file_upload, - 'project_front_page': args.project_front_page + 'project_front_page': args.project_front_page, + 'max_display_rows': args.max_display_rows, } - # Update database manager state - db_manager._disabled = args.disable_database + # Register blueprints (this is where heavy imports happen) + _register_blueprints(args.disable_database) + url = "http://localhost:{0}".format(args.port) + print(f"Ready! Open {url} in your browser.", flush=True) + if not args.dev: - url = "http://localhost:{0}".format(args.port) - threading.Timer(2, lambda: webbrowser.open(url, new=2)).start() + threading.Timer(1.5, lambda: webbrowser.open(url, new=2)).start() - # Enable debug mode and auto-reload in development mode debug_mode = args.dev app.run(host='0.0.0.0', port=args.port, debug=debug_mode, use_reloader=debug_mode) if __name__ == '__main__': - #app.run(debug=True, host='127.0.0.1', port=5000) - #use 0.0.0.0 for public run_app() diff --git a/py-src/data_formulator/auth.py b/py-src/data_formulator/auth.py new file mode 100644 index 00000000..8b28ea56 --- /dev/null +++ b/py-src/data_formulator/auth.py @@ -0,0 +1,88 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Authentication and identity management for Data Formulator. + +This module provides a hybrid identity system that supports both anonymous +browser-based users and authenticated users (via Azure App Service or JWT). + +Security Model: +- Anonymous users: Browser UUID from X-Identity-Id header (prefixed with "browser:") +- Authenticated users: Verified identity from Azure headers or JWT (prefixed with "user:") +- Namespacing ensures authenticated user data cannot be accessed by spoofing headers +""" + +import logging +from flask import request, current_app + +logger = logging.getLogger(__name__) + + +def get_identity_id() -> str: + """ + Get identity ID with proper security priority: + + 1. Verified user from Azure App Service auth headers (trusted, set by Azure) + 2. Verified user from JWT bearer token (trusted, cryptographically verified) + 3. Browser ID from X-Identity-Id header (untrusted, for anonymous users only) + + The key insight: for anonymous users, we trust X-Identity-Id because there's + no security risk (who cares if someone "steals" a random UUID?). For authenticated + users, we MUST extract identity from verified sources, not client-provided headers. 
+ + Identity is namespaced as "user:" or "browser:" to ensure authenticated + user data is never accessible via anonymous browser identity spoofing. + + Returns: + str: The namespaced identity ID string (e.g., "user:alice@..." or "browser:550e8400-...") + + Raises: + ValueError: If no identity could be determined + """ + + # Priority 1: Azure App Service Authentication (EasyAuth) + # When deployed to Azure with authentication enabled, Azure injects these headers. + # These are SET BY AZURE (not the client) after verifying the user's identity. + azure_principal_id = request.headers.get('X-MS-CLIENT-PRINCIPAL-ID') + if azure_principal_id: + logger.debug(f"Using Azure principal ID: {azure_principal_id[:8]}...") + return f"user:{azure_principal_id}" + + # Priority 2: JWT Bearer Token (for custom auth implementations) + # If you implement your own auth, verify the JWT here and extract user ID. + # Example (uncomment and configure when implementing JWT auth): + # + # auth_header = request.headers.get('Authorization', '') + # if auth_header.startswith('Bearer '): + # token = auth_header[7:] + # try: + # import jwt + # payload = jwt.decode(token, current_app.config['JWT_SECRET'], algorithms=['HS256']) + # user_id = payload.get('sub') or payload.get('user_id') + # if user_id: + # logger.debug(f"Using JWT user ID: {user_id[:8]}...") + # return f"user:{user_id}" + # except Exception as e: + # logger.warning(f"Invalid JWT token: {e}") + # # Fall through to browser identity + + # Priority 3: Anonymous browser identity (UNTRUSTED - from client header) + # SECURITY: We NEVER trust the namespace prefix from X-Identity-Id header. + # Even if client sends "user:alice@...", we force "browser:" prefix. + # Only verified auth (Azure headers, JWT) can result in "user:" prefix. + client_identity = request.headers.get('X-Identity-Id') + if client_identity: + # Extract the ID part, ignoring any client-provided prefix + # e.g., "browser:550e8400-..." → "550e8400-..." + # e.g., "user:alice@..." → "alice@..." (but forced to browser: namespace) + if ':' in client_identity: + # Strip the prefix - we don't trust client-provided namespaces + identity_value = client_identity.split(':', 1)[1] + else: + identity_value = client_identity + + # Always use browser: prefix for client-provided identities + return f"browser:{identity_value}" + + raise ValueError("X-Identity-Id header is required. Please refresh the page.") \ No newline at end of file diff --git a/py-src/data_formulator/data_loader/README.md b/py-src/data_formulator/data_loader/README.md index 660e59b3..70079ec9 100644 --- a/py-src/data_formulator/data_loader/README.md +++ b/py-src/data_formulator/data_loader/README.md @@ -1,43 +1,51 @@ ## Data Loader Module -This module provides a framework for loading data from various external sources into DuckDB. It follows an abstract base class pattern to ensure consistent implementation across different data sources. +This module provides a framework for loading data from various external sources into the **workspace** (parquet files). It follows an abstract base class pattern so all loaders behave consistently. + +### Design + +- **Storage**: Ingested data is written as **parquet** in the workspace. DuckDB is **not** used for storage; it is only the computation engine elsewhere in the application. +- **Data flow**: **External source → PyArrow Table → Parquet (workspace)**. +- **Format**: Loaders use **PyArrow** as the standard in-memory format for speed and interoperability. 
Database loaders (PostgreSQL, MySQL, MSSQL) use **connectorx** for Arrow-native reads where applicable. ### Building a New Data Loader -The abstract class `ExternalDataLoader` defines the data loader interface. Each concrete implementation (e.g., `KustoDataLoader`, `MySQLDataLoader`) handles specific data source connections and data ingestion. +The abstract class `ExternalDataLoader` defines the interface. Each concrete implementation (e.g., `MySQLDataLoader`, `S3DataLoader`) handles one data source. -To create a new data loader: +To add a new data loader: -1. Create a new class that inherits from `ExternalDataLoader` -2. Implement the required abstract methods: - - `list_params()`: Define required connection parameters - - `__init__()`: Initialize connection to data source - - `list_tables()`: List available tables/views - - `ingest_data()`: Load data from source - - `view_query_sample()`: Preview query results - - `ingest_data_from_query()`: Load data from custom query -3. Register the new class into `__init__.py` so that the front-end can automatically discover the new data loader. +1. Create a class that inherits from `ExternalDataLoader`. +2. Implement the required pieces: + - **`list_params()`** (static): Connection parameters (names, types, defaults, descriptions). + - **`auth_instructions()`** (static): Short instructions for obtaining credentials/setup. + - **`__init__(self, params)`**: Validate params and establish or verify connection to the source. No `duck_db_conn`; storage is workspace-only. + - **`fetch_data_as_arrow(source_table, size=..., sort_columns=..., sort_order=...)`**: Fetch data from the source and return a `pyarrow.Table`. Only `source_table` (table/collection/file identifier) is supported; raw query strings are not accepted for security and dialect consistency. + - **`list_tables(table_filter=None)`**: Return a list of `{"name": ..., "metadata": {...}}` for tables/files the user can select. Metadata typically includes `row_count`, `columns`, and `sample_rows`. +3. Register the new class in the package `__init__.py` so the front-end can discover it. -The UI automatically provide the query completion option to help user generate queries for the given data loader (from NL or partial queries). +The base class provides **`ingest_to_workspace(workspace, ...)`**, which calls `fetch_data_as_arrow()` and writes the result to the workspace as parquet. You do not implement ingest logic in the loader. + +The UI uses the same loaders for connection setup, table listing, and ingestion into the workspace. 
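+Below is a minimal sketch of a new loader, assuming only the interface described above. The `CsvFolderDataLoader` class, its `folder_path` parameter, and the local-CSV layout are hypothetical and exist purely to illustrate the required pieces; adapt the names, validation, and error handling to your actual source.
+
+```python
+# Hypothetical example: expose CSV files from a local folder as tables.
+import os
+from typing import Any
+
+import pyarrow as pa
+import pyarrow.csv as pa_csv
+
+from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name
+
+
+class CsvFolderDataLoader(ExternalDataLoader):
+    @staticmethod
+    def list_params() -> list[dict[str, Any]]:
+        return [
+            {"name": "folder_path", "type": "string", "required": True, "default": "",
+             "description": "Local folder containing .csv files"},
+        ]
+
+    @staticmethod
+    def auth_instructions() -> str:
+        return "No credentials needed; the folder must be readable by the server process."
+
+    def __init__(self, params: dict[str, Any]):
+        self.folder_path = params.get("folder_path", "")
+        if not os.path.isdir(self.folder_path):
+            raise ValueError(f"Folder not found: {self.folder_path}")
+
+    def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]:
+        # One entry per CSV file, with the metadata shape the UI expects.
+        results = []
+        for fname in sorted(os.listdir(self.folder_path)):
+            if not fname.endswith(".csv"):
+                continue
+            name = fname[:-4]
+            if table_filter and table_filter.lower() not in name.lower():
+                continue
+            tbl = pa_csv.read_csv(os.path.join(self.folder_path, fname))
+            results.append({
+                "name": sanitize_table_name(name),
+                "metadata": {
+                    "row_count": tbl.num_rows,
+                    "columns": tbl.column_names,
+                    "sample_rows": tbl.slice(0, 5).to_pylist(),
+                },
+            })
+        return results
+
+    def fetch_data_as_arrow(self, source_table: str, size: int = 1000000,
+                            sort_columns: list[str] | None = None,
+                            sort_order: str = 'asc') -> pa.Table:
+        # Read the requested file, apply the optional sort, and respect the size limit.
+        path = os.path.join(self.folder_path, f"{source_table}.csv")
+        table = pa_csv.read_csv(path)
+        if sort_columns:
+            order = "descending" if sort_order == "desc" else "ascending"
+            table = table.sort_by([(col, order) for col in sort_columns])
+        return table.slice(0, size)
+```
+
+With the class in place, register it in `DATA_LOADERS` in the package `__init__.py` (step 3 above) so the front-end can offer it; ingestion itself is handled by the inherited `ingest_to_workspace()`.
+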
### Example Implementations -- `AthenaDataLoader`: AWS Athena integration (SQL queries on S3 data lakes) -- `BigQueryDataLoader`: Google BigQuery integration -- `KustoDataLoader`: Azure Data Explorer (Kusto) integration -- `MySQLDataLoader`: MySQL database integration -- `PostgreSQLDataLoader`: PostgreSQL database integration -- `MSSQLDataLoader`: Microsoft SQL Server integration -- `S3DataLoader`: Amazon S3 file integration (CSV, Parquet, JSON) -- `AzureBlobDataLoader`: Azure Blob Storage integration -- `MongoDBDataLoader`: MongoDB integration +- **`AthenaDataLoader`**: AWS Athena (SQL on S3 data lakes) +- **`BigQueryDataLoader`**: Google BigQuery +- **`KustoDataLoader`**: Azure Data Explorer (Kusto) +- **`MySQLDataLoader`**: MySQL (connectorx) +- **`PostgreSQLDataLoader`**: PostgreSQL (connectorx) +- **`MSSQLDataLoader`**: Microsoft SQL Server (connectorx) +- **`S3DataLoader`**: Amazon S3 files (CSV, Parquet, JSON) via PyArrow S3 filesystem +- **`AzureBlobDataLoader`**: Azure Blob Storage via PyArrow +- **`MongoDBDataLoader`**: MongoDB ### Testing -Ensure your implementation: -- Handles connection errors gracefully -- Properly sanitizes table names -- Respects size limits for data ingestion -- Returns consistent metadata format +When implementing or changing a loader: + +- Handle connection and read errors clearly (e.g., raise `ValueError` with a clear message). +- Sanitize or validate table/object names where appropriate. +- Respect the `size` limit (and optional sort) in `fetch_data_as_arrow`. +- Return the same metadata shape from `list_tables()` (e.g., `row_count`, `columns`, `sample_rows`) so the UI behaves consistently. -Launch the front-end and test the data loader. \ No newline at end of file +Test via the front-end: configure the loader, list tables, and run an ingest into the workspace; then confirm parquet appears in the workspace and DuckDB (or other engines) can read it for computation. 
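+
+For a quick scripted smoke test (a sketch only: `workspace` is assumed to be an existing Workspace instance and `"my_table"` a table you just ingested), you can confirm the stored parquet is readable by DuckDB:
+
+```python
+import duckdb
+
+# Locate the parquet file the loader wrote into the workspace.
+path = str(workspace.get_parquet_path("my_table"))
+
+# Query it with an in-memory DuckDB connection, as the app does for SQL-mode computation.
+# (Paths containing quotes or backslashes would need escaping, as in
+# agent_utils_sql.create_duckdb_conn_with_parquet_views.)
+conn = duckdb.connect(":memory:")
+row_count = conn.execute(f"SELECT COUNT(*) FROM read_parquet('{path}')").fetchone()[0]
+print(f"my_table has {row_count} rows")
+```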
diff --git a/py-src/data_formulator/data_loader/__init__.py b/py-src/data_formulator/data_loader/__init__.py index f61a6851..898c50f5 100644 --- a/py-src/data_formulator/data_loader/__init__.py +++ b/py-src/data_formulator/data_loader/__init__.py @@ -21,4 +21,15 @@ "athena": AthenaDataLoader } -__all__ = ["ExternalDataLoader", "MySQLDataLoader", "MSSQLDataLoader", "KustoDataLoader", "S3DataLoader", "AzureBlobDataLoader", "PostgreSQLDataLoader", "MongoDBDataLoader", "BigQueryDataLoader", "AthenaDataLoader", "DATA_LOADERS"] \ No newline at end of file +__all__ = [ + "ExternalDataLoader", + "MySQLDataLoader", + "MSSQLDataLoader", + "KustoDataLoader", + "S3DataLoader", + "AzureBlobDataLoader", + "PostgreSQLDataLoader", + "MongoDBDataLoader", + "BigQueryDataLoader", + "AthenaDataLoader", + "DATA_LOADERS"] \ No newline at end of file diff --git a/py-src/data_formulator/data_loader/athena_data_loader.py b/py-src/data_formulator/data_loader/athena_data_loader.py index 8ba617fe..1d546513 100644 --- a/py-src/data_formulator/data_loader/athena_data_loader.py +++ b/py-src/data_formulator/data_loader/athena_data_loader.py @@ -1,19 +1,14 @@ -import json import logging import re import time -import duckdb +import pyarrow as pa +import pyarrow.csv as pa_csv +import boto3 +import botocore.exceptions +from pyarrow import fs as pa_fs from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name -from typing import Dict, Any, List, Optional -from data_formulator.security import validate_sql_query - -try: - import boto3 - import botocore.exceptions - BOTO3_AVAILABLE = True -except ImportError: - BOTO3_AVAILABLE = False +from typing import Any log = logging.getLogger(__name__) @@ -54,22 +49,16 @@ def _validate_s3_url(url: str) -> None: raise ValueError(f"Invalid S3 URL format: '{url}'. Expected format: 's3://bucket/path'") -def _escape_sql_string(value: Optional[str]) -> str: - """Escape single quotes in SQL string values.""" - if value is None: - return "" - return value.replace("'", "''") - - class AthenaDataLoader(ExternalDataLoader): """AWS Athena data loader implementation. - Executes SQL queries on Athena and loads results from S3 into DuckDB. - The output bucket is automatically fetched from the workgroup configuration. + Executes SQL queries on Athena and reads results from S3 via PyArrow. + Output location is taken from the workgroup configuration or the output_location param. + Use ingest_to_workspace() to store results as parquet in the workspace. """ @staticmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> list[dict[str, Any]]: params_list = [ {"name": "aws_profile", "type": "string", "required": False, "default": "", "description": "AWS profile name from ~/.aws/credentials (if set, access key and secret are not required)"}, {"name": "aws_access_key_id", "type": "string", "required": False, "default": "", "description": "AWS access key ID (not required if using aws_profile)"}, @@ -160,15 +149,8 @@ def auth_instructions() -> str: **Security:** Never share secret keys, rotate regularly, use least privilege permissions. """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): - if not BOTO3_AVAILABLE: - raise ImportError( - "boto3 is required for Athena connections. 
" - "Install with: pip install boto3" - ) - + def __init__(self, params: dict[str, Any]): self.params = params - self.duck_db_conn = duck_db_conn # Extract parameters self.aws_profile = params.get("aws_profile", "") @@ -219,7 +201,7 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti session = boto3.Session(profile_name=self.aws_profile, region_name=self.region_name) self.athena_client = session.client('athena') - # Get credentials from profile for DuckDB S3 access + # Get credentials from profile for PyArrow S3 access credentials = session.get_credentials() if credentials is None: raise ValueError( @@ -290,16 +272,14 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti # Get output location: prefer user-provided, then try workgroup self.output_location = self._get_output_location() - # Install and load the httpfs extension for S3 access - self.duck_db_conn.install_extension("httpfs") - self.duck_db_conn.load_extension("httpfs") - - # Set AWS credentials for DuckDB - self.duck_db_conn.execute(f"SET s3_region='{self.region_name}'") - self.duck_db_conn.execute(f"SET s3_access_key_id='{self.aws_access_key_id}'") - self.duck_db_conn.execute(f"SET s3_secret_access_key='{self.aws_secret_access_key}'") - if self.aws_session_token: - self.duck_db_conn.execute(f"SET s3_session_token='{self.aws_session_token}'") + # Setup PyArrow S3 filesystem for reading results + self.s3_fs = pa_fs.S3FileSystem( + access_key=self.aws_access_key_id, + secret_key=self.aws_secret_access_key, + session_token=self.aws_session_token if self.aws_session_token else None, + region=self.region_name + ) + log.info("Initialized PyArrow S3 filesystem for Athena results") def _get_output_location(self) -> str: """Get the output location for query results. @@ -398,7 +378,56 @@ def _execute_query(self, query: str) -> str: wait_time = min(2 ** (elapsed // 10), 10) time.sleep(wait_time) - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: + def fetch_data_as_arrow( + self, + source_table: str, + size: int = 1000000, + sort_columns: list[str] | None = None, + sort_order: str = 'asc' + ) -> pa.Table: + """ + Fetch data from Athena as a PyArrow Table. + + Executes the query on Athena and reads the CSV results from S3 + using PyArrow's S3 filesystem. 
+ """ + if not source_table: + raise ValueError("source_table must be provided") + + _validate_athena_table_name(source_table) + base_query = f"SELECT * FROM {source_table}" + + # Add ORDER BY if sort columns specified + order_by_clause = "" + if sort_columns and len(sort_columns) > 0: + for col in sort_columns: + _validate_column_name(col) + order_direction = "DESC" if sort_order == 'desc' else "ASC" + sanitized_cols = [f'"{col}" {order_direction}' for col in sort_columns] + order_by_clause = f" ORDER BY {', '.join(sanitized_cols)}" + + query = f"{base_query}{order_by_clause} LIMIT {size}" + + log.info(f"Executing Athena query: {query[:200]}...") + + # Execute query and get result location + result_location = self._execute_query(query) + _validate_s3_url(result_location) + + log.info(f"Reading Athena results from: {result_location}") + + # Parse S3 URL: s3://bucket/key -> bucket/key + s3_path = result_location[5:] if result_location.startswith("s3://") else result_location + + # Athena outputs CSV files + with self.s3_fs.open_input_file(s3_path) as f: + arrow_table = pa_csv.read_csv(f) + + log.info(f"Fetched {arrow_table.num_rows} rows from Athena [Arrow-native]") + + return arrow_table + + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: """List tables from Athena catalog (Glue Data Catalog).""" results = [] @@ -468,92 +497,3 @@ def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: log.info(f"Returning {len(results)} tables") return results - - def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): - """Ingest data from an Athena table by executing a SELECT query.""" - # Validate table name to prevent SQL injection - _validate_athena_table_name(table_name) - - if name_as is None: - # Extract table name from "database.table" format - name_as = table_name.split('.')[-1] - - name_as = sanitize_table_name(name_as) - - # Validate and build ORDER BY clause if sort_columns are specified - order_by_clause = "" - if sort_columns and len(sort_columns) > 0: - # Validate each column name - for col in sort_columns: - _validate_column_name(col) - order_direction = "DESC" if sort_order == 'desc' else "ASC" - sanitized_cols = [f'"{col}" {order_direction}' for col in sort_columns] - order_by_clause = f"ORDER BY {', '.join(sanitized_cols)}" - - # Validate size is a positive integer - if not isinstance(size, int) or size <= 0: - raise ValueError(f"Size must be a positive integer, got: {size}") - - # Build and execute the query - query = f"SELECT * FROM {table_name} {order_by_clause} LIMIT {size}" - log.info(f"Executing Athena query for table '{name_as}': {query}") - - result_location = self._execute_query(query) - - # Validate the result location is a proper S3 URL - _validate_s3_url(result_location) - - # Load results from S3 into DuckDB - log.info(f"Loading query results from {result_location}") - self.duck_db_conn.execute(f""" - CREATE OR REPLACE TABLE main.{name_as} AS - SELECT * FROM read_csv_auto('{_escape_sql_string(result_location)}') - """) - - log.info(f"Successfully ingested data into table '{name_as}'") - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - """Execute query and return sample results.""" - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - # Add LIMIT if not present to avoid large result sets - query_upper = query.upper() - if "LIMIT" not in query_upper: - query = 
f"{query.rstrip().rstrip(';')} LIMIT 10" - - # Execute query on Athena - result_location = self._execute_query(query) - - # Validate the result location is a proper S3 URL - _validate_s3_url(result_location) - - # Load results from S3 - df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{_escape_sql_string(result_location)}')").df() - - return json.loads(df.head(10).to_json(orient="records")) - - def ingest_data_from_query(self, query: str, name_as: str): - """Execute Athena query and ingest results into DuckDB.""" - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - name_as = sanitize_table_name(name_as) - - # Execute query on Athena - log.info(f"Executing Athena query for table '{name_as}'") - result_location = self._execute_query(query) - - # Validate the result location is a proper S3 URL - _validate_s3_url(result_location) - - # Load results from S3 into DuckDB - log.info(f"Loading query results from {result_location}") - self.duck_db_conn.execute(f""" - CREATE OR REPLACE TABLE main.{name_as} AS - SELECT * FROM read_csv_auto('{_escape_sql_string(result_location)}') - """) - - log.info(f"Successfully ingested data into table '{name_as}'") diff --git a/py-src/data_formulator/data_loader/azure_blob_data_loader.py b/py-src/data_formulator/data_loader/azure_blob_data_loader.py index 1206f4e0..3c8bdf2d 100644 --- a/py-src/data_formulator/data_loader/azure_blob_data_loader.py +++ b/py-src/data_formulator/data_loader/azure_blob_data_loader.py @@ -1,23 +1,22 @@ import json +import logging import pandas as pd -import duckdb -import os +import pyarrow as pa +import pyarrow.parquet as pq +import pyarrow.csv as pa_csv +from azure.storage.blob import BlobServiceClient +from azure.identity import DefaultAzureCredential +from pyarrow import fs as pa_fs from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name -from typing import Dict, Any, List -from data_formulator.security import validate_sql_query +from typing import Any -try: - from azure.storage.blob import BlobServiceClient, ContainerClient - from azure.identity import DefaultAzureCredential, AzureCliCredential, ManagedIdentityCredential, EnvironmentCredential, ChainedTokenCredential - AZURE_BLOB_AVAILABLE = True -except ImportError: - AZURE_BLOB_AVAILABLE = False +logger = logging.getLogger(__name__) class AzureBlobDataLoader(ExternalDataLoader): @staticmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> list[dict[str, Any]]: params_list = [ {"name": "account_name", "type": "string", "required": True, "default": "", "description": "Azure storage account name"}, {"name": "container_name", "type": "string", "required": True, "default": "", "description": "Azure blob container name"}, @@ -65,16 +64,9 @@ def auth_instructions() -> str: - JSON files (.json, .jsonl) """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): - if not AZURE_BLOB_AVAILABLE: - raise ImportError( - "Azure storage libraries are required for Azure Blob connections. 
" - "Install with: pip install azure-storage-blob azure-identity" - ) - + def __init__(self, params: dict[str, Any]): self.params = params - self.duck_db_conn = duck_db_conn - + # Extract parameters self.account_name = params.get("account_name", "") self.container_name = params.get("container_name", "") @@ -84,56 +76,93 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti self.sas_token = params.get("sas_token", "") self.endpoint = params.get("endpoint", "blob.core.windows.net") - # Install and load the azure extension - self.duck_db_conn.install_extension("azure") - self.duck_db_conn.load_extension("azure") + # Setup PyArrow Azure filesystem + if self.account_key: + self.azure_fs = pa_fs.AzureFileSystem( + account_name=self.account_name, + account_key=self.account_key + ) + elif self.connection_string: + self.azure_fs = pa_fs.AzureFileSystem.from_connection_string(self.connection_string) + else: + # Use default credential chain + self.azure_fs = pa_fs.AzureFileSystem(account_name=self.account_name) - # Set up Azure authentication using secrets (preferred method) - self._setup_azure_authentication() + logger.info(f"Initialized PyArrow Azure filesystem for account: {self.account_name}") - def _setup_azure_authentication(self): - """Set up Azure authentication using DuckDB secrets.""" - if self.connection_string: - # Use connection string authentication - self.duck_db_conn.execute(f""" - CREATE OR REPLACE SECRET azure_secret ( - TYPE AZURE, - CONNECTION_STRING '{self.connection_string}' - ) - """) - elif self.account_key: - # Use account key authentication - self.duck_db_conn.execute(f""" - CREATE OR REPLACE SECRET azure_secret ( - TYPE AZURE, - ACCOUNT_NAME '{self.account_name}', - ACCOUNT_KEY '{self.account_key}' - ) - """) - elif self.sas_token: - # Use SAS token authentication - self.duck_db_conn.execute(f""" - CREATE OR REPLACE SECRET azure_secret ( - TYPE AZURE, - ACCOUNT_NAME '{self.account_name}', - SAS_TOKEN '{self.sas_token}' - ) - """) + def _azure_path(self, azure_url: str) -> str: + """Convert Azure URL to path for PyArrow (container/blob).""" + if azure_url.startswith("az://"): + parts = azure_url[5:].split("/", 1) + return parts[1] if len(parts) > 1 else azure_url + return f"{self.container_name}/{azure_url}" + + def _read_sample(self, azure_url: str, limit: int) -> pd.DataFrame: + """Read sample rows from an Azure blob using PyArrow. 
Returns a pandas DataFrame.""" + azure_path = self._azure_path(azure_url) + if azure_url.lower().endswith('.parquet'): + table = pq.read_table(azure_path, filesystem=self.azure_fs) + elif azure_url.lower().endswith('.csv'): + with self.azure_fs.open_input_file(azure_path) as f: + table = pa_csv.read_csv(f) + elif azure_url.lower().endswith('.json') or azure_url.lower().endswith('.jsonl'): + import pyarrow.json as pa_json + with self.azure_fs.open_input_file(azure_path) as f: + table = pa_json.read_json(f) else: - # Use credential chain authentication (default) - self.duck_db_conn.execute(f""" - CREATE OR REPLACE SECRET azure_secret ( - TYPE AZURE, - PROVIDER credential_chain, - ACCOUNT_NAME '{self.account_name}', - CHAIN '{self.credential_chain}' - ) - """) + raise ValueError(f"Unsupported file type: {azure_url}") + if table.num_rows > limit: + table = table.slice(0, limit) + return table.to_pandas() - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: - # Use Azure SDK to list blobs in the container - from azure.storage.blob import BlobServiceClient + def fetch_data_as_arrow( + self, + source_table: str, + size: int = 1000000, + sort_columns: list[str] | None = None, + sort_order: str = 'asc' + ) -> pa.Table: + """ + Fetch data from Azure Blob as a PyArrow Table. + + For files (parquet, csv), reads directly using PyArrow's Azure filesystem. + """ + if not source_table: + raise ValueError("source_table (Azure blob URL) must be provided") + azure_url = source_table + azure_path = self._azure_path(azure_url) + + logger.info("Reading Azure blob via PyArrow: %s", azure_url) + + if azure_url.lower().endswith('.parquet'): + arrow_table = pq.read_table(azure_path, filesystem=self.azure_fs) + elif azure_url.lower().endswith('.csv'): + with self.azure_fs.open_input_file(azure_path) as f: + arrow_table = pa_csv.read_csv(f) + elif azure_url.lower().endswith('.json') or azure_url.lower().endswith('.jsonl'): + import pyarrow.json as pa_json + with self.azure_fs.open_input_file(azure_path) as f: + arrow_table = pa_json.read_json(f) + else: + raise ValueError(f"Unsupported file type: {azure_url}") + + # Apply sorting if specified + if sort_columns and len(sort_columns) > 0: + df = arrow_table.to_pandas() + ascending = sort_order != 'desc' + df = df.sort_values(by=sort_columns, ascending=ascending) + arrow_table = pa.Table.from_pandas(df, preserve_index=False) + + # Apply size limit + if arrow_table.num_rows > size: + arrow_table = arrow_table.slice(0, size) + + logger.info(f"Fetched {arrow_table.num_rows} rows from Azure Blob [Arrow-native]") + + return arrow_table + + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: # Create blob service client based on authentication method if self.connection_string: blob_service_client = BlobServiceClient.from_connection_string(self.connection_string) @@ -177,228 +206,95 @@ def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: azure_url = f"az://{self.account_name}.{self.endpoint}/{self.container_name}/{blob_name}" try: - # Choose the appropriate read function based on file extension - if azure_url.lower().endswith('.parquet'): - sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_parquet('{azure_url}') LIMIT 10").df() - elif azure_url.lower().endswith('.json') or azure_url.lower().endswith('.jsonl'): - sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_json_auto('{azure_url}') LIMIT 10").df() - elif azure_url.lower().endswith('.csv'): - sample_df = 
self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{azure_url}') LIMIT 10").df() - - # Get column information + sample_df = self._read_sample(azure_url, 10) + columns = [{ 'name': col, 'type': str(sample_df[col].dtype) } for col in sample_df.columns] - - # Get sample data + sample_rows = json.loads(sample_df.to_json(orient="records")) - - # Estimate row count row_count = self._estimate_row_count(azure_url, blob) - + table_metadata = { "row_count": row_count, "columns": columns, "sample_rows": sample_rows } - + results.append({ "name": azure_url, "metadata": table_metadata }) except Exception as e: - # Skip files that can't be read - print(f"Error reading {azure_url}: {e}") + logger.warning("Error reading %s: %s", azure_url, e) continue return results def _is_supported_file(self, blob_name: str) -> bool: - """Check if the file type is supported by DuckDB.""" + """Check if the file type is supported (PyArrow can read it).""" supported_extensions = ['.csv', '.parquet', '.json', '.jsonl'] return any(blob_name.lower().endswith(ext) for ext in supported_extensions) - + def _estimate_row_count(self, azure_url: str, blob_properties=None) -> int: - """Estimate the number of rows in a file using intelligent strategies.""" + """Estimate the number of rows in a file.""" try: file_extension = azure_url.lower().split('.')[-1] - - # For parquet files, use metadata to get exact count efficiently + if file_extension == 'parquet': try: - # Use DuckDB's parquet_file_metadata to get exact row count without full scan - metadata = self.duck_db_conn.execute( - f"SELECT num_rows FROM parquet_file_metadata('{azure_url}')" - ).fetchone() - if metadata and metadata[0] is not None: - return metadata[0] - except Exception as parquet_error: - print(f"Failed to get parquet metadata for {azure_url}: {parquet_error}") - # Fall back to counting (expensive but accurate) - try: - count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM read_parquet('{azure_url}')").fetchone()[0] - return count - except Exception: - pass - - # For CSV, JSON, and JSONL files, use intelligent sampling - elif file_extension in ['csv', 'json', 'jsonl']: + azure_path = self._azure_path(azure_url) + pf = pq.ParquetFile(azure_path, filesystem=self.azure_fs) + return pf.metadata.num_rows + except Exception as e: + logger.debug("Failed to get parquet row count for %s: %s", azure_url, e) + return 0 + + if file_extension in ['csv', 'json', 'jsonl']: return self._estimate_rows_by_sampling(azure_url, blob_properties, file_extension) - + return 0 - except Exception as e: - print(f"Error estimating row count for {azure_url}: {e}") + logger.warning("Error estimating row count for %s: %s", azure_url, e) return 0 def _estimate_rows_by_sampling(self, azure_url: str, blob_properties, file_extension: str) -> int: - """Estimate row count for text-based files using sampling and file size.""" + """Estimate row count for text-based files using PyArrow sampling.""" try: - # Get file size from blob properties if available file_size_bytes = None if blob_properties and hasattr(blob_properties, 'size'): file_size_bytes = blob_properties.size - - # If no file size available, try a different approach + if file_size_bytes is None: - # Sample first 10,000 rows and extrapolate if needed return self._estimate_by_row_sampling(azure_url, file_extension) - - # Sample approach: read first N rows and estimate based on size - sample_size = min(10000, file_size_bytes // 100) # Adaptive sample size - sample_size = max(1000, sample_size) # At least 1000 rows - + + sample_size = 
min(10000, max(1000, file_size_bytes // 100)) try: - if file_extension == 'csv': - sample_df = self.duck_db_conn.execute( - f"SELECT * FROM read_csv_auto('{azure_url}') LIMIT {sample_size}" - ).df() - elif file_extension in ['json', 'jsonl']: - sample_df = self.duck_db_conn.execute( - f"SELECT * FROM read_json_auto('{azure_url}') LIMIT {sample_size}" - ).df() - else: - return 0 - + sample_df = self._read_sample(azure_url, sample_size) sample_rows = len(sample_df) if sample_rows == 0: return 0 - - # If we got fewer rows than requested, that's probably all there is if sample_rows < sample_size: return sample_rows - - # Estimate bytes per row from sample - # For CSV: assume average line length based on file size - if file_extension == 'csv': - # Rough estimate: file_size / (sample_rows * estimated_line_overhead) - # CSV overhead includes delimiters, quotes, newlines - estimated_bytes_per_row = file_size_bytes / sample_rows * (sample_size / file_size_bytes) - estimated_total_rows = int(file_size_bytes / max(estimated_bytes_per_row, 50)) # Min 50 bytes per row - else: - # For JSON: more complex structure, use conservative estimate - # Assume JSON overhead is higher - estimated_bytes_per_row = file_size_bytes / sample_rows * (sample_size / file_size_bytes) - estimated_total_rows = int(file_size_bytes / max(estimated_bytes_per_row, 100)) # Min 100 bytes per row - - # Apply reasonable bounds - estimated_total_rows = max(sample_rows, estimated_total_rows) # At least as many as we sampled - estimated_total_rows = min(estimated_total_rows, file_size_bytes // 10) # Max based on very small rows - + + min_bytes_per_row = 50 if file_extension == 'csv' else 100 + estimated_total_rows = int(file_size_bytes / max(file_size_bytes / sample_rows, min_bytes_per_row)) + estimated_total_rows = max(sample_rows, min(estimated_total_rows, file_size_bytes // 10)) return estimated_total_rows - except Exception as e: - print(f"Error in size-based estimation for {azure_url}: {e}") + logger.debug("Size-based estimation failed for %s: %s", azure_url, e) return self._estimate_by_row_sampling(azure_url, file_extension) - except Exception as e: - print(f"Error in sampling estimation for {azure_url}: {e}") + logger.warning("Error in sampling estimation for %s: %s", azure_url, e) return 0 def _estimate_by_row_sampling(self, azure_url: str, file_extension: str) -> int: - """Fallback method: sample rows without file size info.""" + """Estimate row count by reading a capped sample with PyArrow.""" try: - # Try to read a reasonable sample and see if we get less than requested - # This indicates we've read the whole file test_limit = 50000 - - if file_extension == 'csv': - sample_df = self.duck_db_conn.execute( - f"SELECT * FROM read_csv_auto('{azure_url}') LIMIT {test_limit}" - ).df() - elif file_extension in ['json', 'jsonl']: - sample_df = self.duck_db_conn.execute( - f"SELECT * FROM read_json_auto('{azure_url}') LIMIT {test_limit}" - ).df() - else: - return 0 - - sample_rows = len(sample_df) - - # If we got fewer rows than the limit, that's likely the total - if sample_rows < test_limit: - return sample_rows - - # Otherwise, we can't estimate accurately without more information - # Return the sample size as a lower bound - return sample_rows - + sample_df = self._read_sample(azure_url, test_limit) + return len(sample_df) except Exception as e: - print(f"Error in row sampling for {azure_url}: {e}") - return 0 - - def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, sort_columns: List[str] = None, 
sort_order: str = 'asc'): - if name_as is None: - name_as = table_name.split('/')[-1].split('.')[0] - - name_as = sanitize_table_name(name_as) - - # Build ORDER BY clause if sort_columns are specified - order_by_clause = "" - if sort_columns and len(sort_columns) > 0: - order_direction = "DESC" if sort_order == 'desc' else "ASC" - sanitized_cols = [f'"{col}" {order_direction}' for col in sort_columns] - order_by_clause = f"ORDER BY {', '.join(sanitized_cols)}" - - # Determine file type and use appropriate DuckDB function - if table_name.lower().endswith('.csv'): - self.duck_db_conn.execute(f""" - CREATE OR REPLACE TABLE main.{name_as} AS - SELECT * FROM read_csv_auto('{table_name}') - {order_by_clause} - LIMIT {size} - """) - elif table_name.lower().endswith('.parquet'): - self.duck_db_conn.execute(f""" - CREATE OR REPLACE TABLE main.{name_as} AS - SELECT * FROM read_parquet('{table_name}') - {order_by_clause} - LIMIT {size} - """) - elif table_name.lower().endswith('.json') or table_name.lower().endswith('.jsonl'): - self.duck_db_conn.execute(f""" - CREATE OR REPLACE TABLE main.{name_as} AS - SELECT * FROM read_json_auto('{table_name}') - {order_by_clause} - LIMIT {size} - """) - else: - raise ValueError(f"Unsupported file type: {table_name}") - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - return json.loads(self.duck_db_conn.execute(query).df().head(10).to_json(orient="records")) - - def ingest_data_from_query(self, query: str, name_as: str): - # Execute the query and get results as a DataFrame - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - df = self.duck_db_conn.execute(query).df() - # Use the base class's method to ingest the DataFrame - self.ingest_df_to_duckdb(df, sanitize_table_name(name_as)) \ No newline at end of file + logger.debug("Row sampling failed for %s: %s", azure_url, e) + return 0 \ No newline at end of file diff --git a/py-src/data_formulator/data_loader/bigquery_data_loader.py b/py-src/data_formulator/data_loader/bigquery_data_loader.py index cd3c1cf6..e9c6807b 100644 --- a/py-src/data_formulator/data_loader/bigquery_data_loader.py +++ b/py-src/data_formulator/data_loader/bigquery_data_loader.py @@ -1,19 +1,12 @@ -import json import logging import re -from typing import Dict, Any, List, Optional -import pandas as pd -import duckdb +from typing import Any +import pyarrow as pa from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name -from data_formulator.security import validate_sql_query -try: - from google.cloud import bigquery - from google.oauth2 import service_account - BIGQUERY_AVAILABLE = True -except ImportError: - BIGQUERY_AVAILABLE = False +from google.cloud import bigquery +from google.oauth2 import service_account log = logging.getLogger(__name__) @@ -21,7 +14,7 @@ class BigQueryDataLoader(ExternalDataLoader): """BigQuery data loader implementation""" @staticmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> list[dict[str, Any]]: return [ {"name": "project_id", "type": "text", "required": True, "description": "Google Cloud Project ID", "default": ""}, {"name": "dataset_id", "type": "text", "required": False, "description": "Dataset ID(s) - leave empty for all, or specify one (e.g., 'billing') or multiple separated by commas (e.g., 'billing,enterprise_collected,ga_api')", "default": ""}, @@ -68,17 +61,10 @@ 
def auth_instructions() -> str: - Execute custom SQL queries """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): - if not BIGQUERY_AVAILABLE: - raise ImportError( - "google-cloud-bigquery is required for BigQuery connections. " - "Install with: pip install google-cloud-bigquery google-auth" - ) - + def __init__(self, params: dict[str, Any]): self.params = params - self.duck_db_conn = duck_db_conn self.project_id = params.get("project_id") - self.dataset_ids = [d.strip() for d in params.get("dataset_id", "").split(",") if d.strip()] # Support multiple datasets + self.dataset_ids = [d.strip() for d in params.get("dataset_id", "").split(",") if d.strip()] self.location = params.get("location", "US") # Initialize BigQuery client @@ -95,8 +81,10 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti project=self.project_id, location=self.location ) + + log.info(f"Successfully connected to BigQuery project: {self.project_id}") - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: """List tables from BigQuery datasets""" results = [] @@ -170,163 +158,78 @@ def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: log.info(f"Returning {len(results)} tables") return results - def _convert_bigquery_dtypes(self, df: pd.DataFrame) -> pd.DataFrame: - """Convert BigQuery-specific dtypes to standard pandas dtypes""" - - def safe_convert(x): - try: - if x is None or pd.isna(x): - return None - if isinstance(x, (dict, list)): - return json.dumps(x, default=str) - if hasattr(x, "__dict__"): - return json.dumps(x.__dict__, default=str) - s = str(x) - if "[object Object]" in s: - return json.dumps(x, default=str) - return s - except Exception: - return str(x) if x is not None else None - - for col in df.columns: - # Convert db_dtypes.DateDtype to standard datetime - if hasattr(df[col].dtype, "name") and "dbdate" in str(df[col].dtype).lower(): - df[col] = pd.to_datetime(df[col]) - # Convert other db_dtypes if needed - elif str(df[col].dtype).startswith("db_dtypes"): - try: - df[col] = df[col].astype(str) - except Exception as e: - logging.error(f"Failed to convert column '{col}' to string: {e}") - # Handle nested objects/JSON columns - elif df[col].dtype == "object": - df[col] = df[col].apply(safe_convert) - - return df - - def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): - """Ingest data from BigQuery table into DuckDB with stable, de-duplicated column aliases.""" - if name_as is None: - name_as = table_name.split('.')[-1] - - name_as = sanitize_table_name(name_as) - - - table_ref = self.client.get_table(table_name) - - select_parts: list[str] = [] - used_aliases: dict[str, str] = {} # alias -> field_path - - def build_alias(field_path: str) -> str: - """ - Build a human-readable, globally unique alias from a BigQuery field path. 
- - Examples: - 'geo.country' -> 'geo_country' - 'device.category' -> 'device_category' - 'event_params.value' -> 'event_params_value' - """ - # path "a.b.c" -> "a_b_c" - alias = field_path.replace('.', '_') - - # remove weird characters - alias = re.sub(r'[^0-9a-zA-Z_]', '_', alias) - alias = re.sub(r'_+', '_', alias).strip('_') or "col" - - # must start with letter or underscore - if not alias[0].isalpha() and alias[0] != '_': - alias = f"_{alias}" - - base_alias = alias - counter = 1 - while alias in used_aliases: - # same alias from another path – suffix and log once - alias = f"{base_alias}_{counter}" - counter += 1 - - used_aliases[alias] = field_path - return alias - - def add_field(field_path: str): - alias = build_alias(field_path) - select_parts.append(f"`{table_name}`.{field_path} AS `{alias}`") - - def process_field(field, parent_path: str = ""): - """ - Recursively process fields, flattening non-repeated RECORDs. - """ - current_path = f"{parent_path}.{field.name}" if parent_path else field.name - - # Flatten STRUCT / RECORD that is not REPEATED - if field.field_type == "RECORD" and field.mode != "REPEATED": - for subfield in field.fields: - process_field(subfield, current_path) - else: - # Regular field or REPEATED RECORD/array – select as a single column - add_field(current_path) - - # Process all top-level fields - for field in table_ref.schema: - process_field(field) - - if not select_parts: - raise ValueError(f"No fields found for table {table_name}") - - # Build ORDER BY clause if sort_columns are specified - order_by_clause = "" - if sort_columns and len(sort_columns) > 0: - # Use backticks for BigQuery column quoting - order_direction = "DESC" if sort_order == 'desc' else "ASC" - sanitized_cols = [f'`{col}` {order_direction}' for col in sort_columns] - order_by_clause = f"ORDER BY {', '.join(sanitized_cols)}" - - query = f"SELECT {', '.join(select_parts)} FROM `{table_name}` {order_by_clause} LIMIT {size}" - - df = self.client.query(query).to_dataframe() - - # Safety net: drop exact duplicate names if something slipped through - if df.columns.duplicated().any(): - dupes = df.columns[df.columns.duplicated()].tolist() - log.warning(f"Duplicate column names detected in DataFrame, dropping later ones: {dupes}") - df = df.loc[:, ~df.columns.duplicated()] - - - # Convert BigQuery-specific dtypes - df = self._convert_bigquery_dtypes(df) - - self.ingest_df_to_duckdb(df, name_as) - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - """Execute query and return sample results as a list of dictionaries""" - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) + def fetch_data_as_arrow( + self, + source_table: str, + size: int = 1000000, + sort_columns: list[str] | None = None, + sort_order: str = 'asc' + ) -> pa.Table: + """ + Fetch data from BigQuery as a PyArrow Table using native Arrow support. - # Add LIMIT if not present - if "LIMIT" not in query.upper(): - query += " LIMIT 10" + BigQuery's Python client provides .to_arrow() for efficient Arrow-native + data transfer, avoiding pandas conversion overhead. 
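+
+        Example (illustrative only; project, dataset, and table names are placeholders):
+
+            loader = BigQueryDataLoader({"project_id": "my-project", "location": "US"})
+            events = loader.fetch_data_as_arrow(
+                "my-project.analytics.events",  # hypothetical project.dataset.table
+                size=10_000,
+            )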
+ """ + if not source_table: + raise ValueError("source_table must be provided") - df = self.client.query(query).to_dataframe() - return json.loads(df.to_json(orient="records")) - - def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: - """Execute custom query and ingest results into DuckDB""" - name_as = sanitize_table_name(name_as) + # Get table schema to handle nested fields + table_ref = self.client.get_table(source_table) + select_parts = self._build_select_parts(table_ref, source_table) + base_query = f"SELECT {', '.join(select_parts)} FROM `{source_table}`" - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) + # Add ORDER BY if sort columns specified + order_by_clause = "" + if sort_columns and len(sort_columns) > 0: + order_direction = "DESC" if sort_order == 'desc' else "ASC" + sanitized_cols = [f'`{col}` {order_direction}' for col in sort_columns] + order_by_clause = f" ORDER BY {', '.join(sanitized_cols)}" - # Execute query and get DataFrame - df = self.client.query(query).to_dataframe() - - # Drop duplicate columns - df = df.loc[:, ~df.columns.duplicated()] - - # Convert BigQuery-specific dtypes - df = self._convert_bigquery_dtypes(df) - - # Use base class method to ingest DataFrame - self.ingest_df_to_duckdb(df, name_as) + query = f"{base_query}{order_by_clause} LIMIT {size}" + + log.info(f"Executing BigQuery query: {query[:200]}...") + + # Execute query and get Arrow table directly (no pandas conversion) + query_job = self.client.query(query) + arrow_table = query_job.to_arrow() - return df + log.info(f"Fetched {arrow_table.num_rows} rows from BigQuery [Arrow-native]") + + return arrow_table + + def _build_select_parts(self, table_ref, table_name: str) -> list[str]: + """Build SELECT parts handling nested BigQuery fields.""" + select_parts: list[str] = [] + used_aliases: dict[str, str] = {} + + def build_alias(field_path: str) -> str: + alias = field_path.replace('.', '_') + alias = re.sub(r'[^0-9a-zA-Z_]', '_', alias) + alias = re.sub(r'_+', '_', alias).strip('_') or "col" + if not alias[0].isalpha() and alias[0] != '_': + alias = f"_{alias}" + base_alias = alias + counter = 1 + while alias in used_aliases: + alias = f"{base_alias}_{counter}" + counter += 1 + used_aliases[alias] = field_path + return alias + + def add_field(field_path: str): + alias = build_alias(field_path) + select_parts.append(f"`{table_name}`.{field_path} AS `{alias}`") + + def process_field(field, parent_path: str = ""): + current_path = f"{parent_path}.{field.name}" if parent_path else field.name + if field.field_type == "RECORD" and field.mode != "REPEATED": + for subfield in field.fields: + process_field(subfield, current_path) + else: + add_field(current_path) + + for field in table_ref.schema: + process_field(field) + + return select_parts if select_parts else ["*"] diff --git a/py-src/data_formulator/data_loader/external_data_loader.py b/py-src/data_formulator/data_loader/external_data_loader.py index 41060d87..420cedd7 100644 --- a/py-src/data_formulator/data_loader/external_data_loader.py +++ b/py-src/data_formulator/data_loader/external_data_loader.py @@ -1,11 +1,19 @@ from abc import ABC, abstractmethod -from typing import Dict, Any, List +from typing import Any, TYPE_CHECKING import pandas as pd -import json -import duckdb -import random -import string +import pyarrow as pa import re +import logging + +if TYPE_CHECKING: + from data_formulator.datalake.workspace import Workspace + from data_formulator.datalake.metadata 
import TableMetadata + +logger = logging.getLogger(__name__) + +# Sensitive parameter names that should be excluded from stored metadata +SENSITIVE_PARAMS = {'password', 'api_key', 'secret', 'token', 'access_key', 'secret_key'} + def sanitize_table_name(name_as: str) -> str: if not name_as: @@ -42,74 +50,178 @@ def sanitize_table_name(name_as: str) -> str: return sanitized class ExternalDataLoader(ABC): + """ + Abstract base class for external data loaders. - def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str): - # Log DataFrame info before ingestion - import logging - logger = logging.getLogger(__name__) - logger.info(f"Ingesting DataFrame to DuckDB table '{table_name}'") - logger.info(f"DataFrame shape: {df.shape}") - logger.info(f"DataFrame dtypes: {dict(df.dtypes)}") - - # Log sample of datetime columns - for col in df.columns: - if pd.api.types.is_datetime64_any_dtype(df[col]): - sample_values = df[col].dropna().head(3) - logger.info(f"Datetime column '{col}' sample values: {list(sample_values)}") + Data loaders fetch data from external sources (databases, cloud storage, etc.) + and store data as parquet files in the workspace. DuckDB is not used for storage; + it is only the computation engine elsewhere in the application. + + Ingest flow: External Source → PyArrow Table → Parquet (workspace). - # Create or replace table (replaces existing table with same name) - random_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6)) - self.duck_db_conn.register(f'df_temp_{random_suffix}', df) + - `fetch_data_as_arrow()`: each loader must implement; fetches data as PyArrow Table. + - `ingest_to_workspace()`: fetches via Arrow and writes parquet to the given workspace. + """ + + def get_safe_params(self) -> dict[str, Any]: + """ + Get connection parameters with sensitive values removed. - # Log table schema after registration - try: - schema_info = self.duck_db_conn.execute(f"DESCRIBE df_temp_{random_suffix}").fetchall() - logger.info(f"DuckDB table schema: {schema_info}") - except Exception as e: - logger.warning(f"Could not get schema info: {e}") + Returns: + Dictionary of parameters safe to store in metadata + """ + if not hasattr(self, 'params'): + return {} - self.duck_db_conn.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM df_temp_{random_suffix}") - self.duck_db_conn.execute(f"DROP VIEW df_temp_{random_suffix}") # Drop the temporary view after creating the table + return { + k: v for k, v in self.params.items() + if k.lower() not in SENSITIVE_PARAMS + } + + @abstractmethod + def fetch_data_as_arrow( + self, + source_table: str, + size: int = 1000000, + sort_columns: list[str] | None = None, + sort_order: str = 'asc' + ) -> pa.Table: + """ + Fetch data from the external source as a PyArrow Table. + + This is the primary method for data fetching. Each loader must implement + this method to fetch data directly as Arrow format for optimal performance. + Only source_table is supported (no raw query strings) to avoid security + and dialect diversity issues across loaders. 
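+
+        The form of source_table depends on the loader: "database.table" for Athena,
+        "project.dataset.table" for BigQuery, an Azure blob URL for Azure Blob Storage,
+        or a collection name for MongoDB.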
- logger.info(f"Successfully created/replaced DuckDB table '{table_name}'") + Args: + source_table: Full table name (or table identifier) to fetch from + size: Maximum number of rows to fetch + sort_columns: Columns to sort by before limiting + sort_order: Sort direction ('asc' or 'desc') + + Returns: + PyArrow Table with the fetched data + + Raises: + ValueError: If source_table is not provided + NotImplementedError: If the loader doesn't support this method yet + """ + pass + def fetch_data_as_dataframe( + self, + source_table: str, + size: int = 1000000, + sort_columns: list[str] | None = None, + sort_order: str = 'asc' + ) -> pd.DataFrame: + """ + Fetch data from the external source as a pandas DataFrame. + + This method converts the Arrow table to pandas. For better performance, + prefer using `fetch_data_as_arrow()` directly when possible. + + Args: + source_table: Full table name to fetch from + size: Maximum number of rows to fetch + sort_columns: Columns to sort by before limiting + sort_order: Sort direction ('asc' or 'desc') + + Returns: + pandas DataFrame with the fetched data + """ + arrow_table = self.fetch_data_as_arrow( + source_table=source_table, + size=size, + sort_columns=sort_columns, + sort_order=sort_order, + ) + return arrow_table.to_pandas() + def ingest_to_workspace( + self, + workspace: "Workspace", + table_name: str, + source_table: str, + size: int = 1000000, + sort_columns: list[str] | None = None, + sort_order: str = 'asc' + ) -> "TableMetadata": + """ + Fetch data from external source and store as parquet in workspace. + + Uses PyArrow for efficient data transfer: External Source → Arrow → Parquet. + This avoids pandas conversion overhead entirely. + + Args: + workspace: The workspace to store data in + table_name: Name for the table in the workspace + source_table: Full table name to fetch from + size: Maximum number of rows to fetch + sort_columns: Columns to sort by before limiting + sort_order: Sort direction ('asc' or 'desc') + + Returns: + TableMetadata for the created parquet file + """ + # Fetch data as Arrow table (efficient, no pandas conversion) + arrow_table = self.fetch_data_as_arrow( + source_table=source_table, + size=size, + sort_columns=sort_columns, + sort_order=sort_order, + ) + + # Prepare loader metadata + loader_metadata = { + "loader_type": self.__class__.__name__, + "loader_params": self.get_safe_params(), + "source_table": source_table, + } + + # Write Arrow table directly to parquet (no pandas conversion) + table_metadata = workspace.write_parquet_from_arrow( + table=arrow_table, + table_name=table_name, + loader_metadata=loader_metadata, + ) + + logger.info( + f"Ingested {arrow_table.num_rows} rows from {self.__class__.__name__} " + f"to workspace as {table_name}.parquet" + ) + + return table_metadata + @staticmethod @abstractmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> list[dict[str, Any]]: + """Return list of parameters needed to configure this data loader.""" pass @staticmethod @abstractmethod - def auth_instructions() -> str: pass - - @abstractmethod - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + def auth_instructions() -> str: + """Return human-readable authentication instructions.""" pass @abstractmethod - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: - # should include: table_name, column_names, column_types, sample_data - pass + def __init__(self, params: dict[str, Any]): + """ + Initialize the data loader. 
- @abstractmethod - def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): - """Ingest data from a table into DuckDB. - Args: - table_name: The source table name - name_as: Optional name for the destination table - size: Maximum number of rows to import (row limit) - sort_columns: Optional list of columns to sort by before applying the limit - sort_order: Sort direction, 'asc' for ascending or 'desc' for descending + params: Configuration parameters for the loader (e.g. host, credentials). """ pass @abstractmethod - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - pass + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: + """ + List available tables (or files) from the data source. - @abstractmethod - def ingest_data_from_query(self, query: str, name_as: str): + Returns: + List of dicts with: name (table/file identifier), metadata (row_count, columns, sample_rows). + """ pass - diff --git a/py-src/data_formulator/data_loader/kusto_data_loader.py b/py-src/data_formulator/data_loader/kusto_data_loader.py index 6ed6d602..ae893f68 100644 --- a/py-src/data_formulator/data_loader/kusto_data_loader.py +++ b/py-src/data_formulator/data_loader/kusto_data_loader.py @@ -1,29 +1,20 @@ +import json import logging -import sys -from typing import Dict, Any, List +from typing import Any import pandas as pd -import json -import duckdb -import random -import string -from datetime import datetime +import pyarrow as pa from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name -try: - from azure.kusto.data import KustoClient, KustoConnectionStringBuilder - from azure.kusto.data.helpers import dataframe_from_result_table - KUSTO_AVAILABLE = True -except ImportError: - KUSTO_AVAILABLE = False +from azure.kusto.data import KustoClient, KustoConnectionStringBuilder +from azure.kusto.data.helpers import dataframe_from_result_table -# Get logger for this module (logging config done in app.py) logger = logging.getLogger(__name__) class KustoDataLoader(ExternalDataLoader): @staticmethod - def list_params() -> bool: + def list_params() -> list[dict[str, Any]]: params_list = [ {"name": "kusto_cluster", "type": "string", "required": True, "description": ""}, {"name": "kusto_database", "type": "string", "required": True, "description": ""}, @@ -60,35 +51,30 @@ def auth_instructions() -> str: - kusto_database: Name of the database you want to access """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): - if not KUSTO_AVAILABLE: - raise ImportError( - "azure-kusto-data is required for Kusto/Azure Data Explorer connections. " - "Install with: pip install azure-kusto-data" - ) - + def __init__(self, params: dict[str, Any]): + self.params = params self.kusto_cluster = params.get("kusto_cluster", None) self.kusto_database = params.get("kusto_database", None) - + self.client_id = params.get("client_id", None) self.client_secret = params.get("client_secret", None) self.tenant_id = params.get("tenant_id", None) try: if self.client_id and self.client_secret and self.tenant_id: - # This function provides an interface to Kusto. It uses AAD application key authentication. self.client = KustoClient(KustoConnectionStringBuilder.with_aad_application_key_authentication( self.kusto_cluster, self.client_id, self.client_secret, self.tenant_id)) else: - # This function provides an interface to Kusto. 
It uses Azure CLI auth, but you can also use other auth types. cluster_url = KustoConnectionStringBuilder.with_az_cli_authentication(self.kusto_cluster) logger.info(f"Connecting to Kusto cluster: {self.kusto_cluster}") self.client = KustoClient(cluster_url) - logger.info("Using Azure CLI authentication for Kusto client. Ensure you have run `az login` in your terminal.") + logger.info("Using Azure CLI authentication for Kusto client.") except Exception as e: logger.error(f"Error creating Kusto client: {e}") - raise Exception(f"Error creating Kusto client: {e}, please authenticate with Azure CLI when starting the app.") - self.duck_db_conn = duck_db_conn + raise RuntimeError( + f"Error creating Kusto client: {e}. " + "Please authenticate with Azure CLI (az login) when starting the app." + ) from e def _convert_kusto_datetime_columns(self, df: pd.DataFrame) -> pd.DataFrame: """Convert Kusto datetime columns to proper pandas datetime format""" @@ -156,7 +142,52 @@ def query(self, kql: str) -> pd.DataFrame: return df - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: + def fetch_data_as_arrow( + self, + source_table: str, + size: int = 1000000, + sort_columns: list[str] | None = None, + sort_order: str = 'asc' + ) -> pa.Table: + """ + Fetch data from Kusto/Azure Data Explorer as a PyArrow Table. + + Kusto SDK returns pandas, so we convert to Arrow format. + + Args: + source_table: Kusto table name + size: Maximum number of rows to fetch + sort_columns: Columns to sort by + sort_order: Sort direction + """ + if not source_table: + raise ValueError("source_table must be provided") + + base_query = f"['{source_table}']" + + # Add sort if specified (KQL syntax) + sort_clause = "" + if sort_columns and len(sort_columns) > 0: + order_direction = "desc" if sort_order == 'desc' else "asc" + sort_cols_with_order = [f"{col} {order_direction}" for col in sort_columns] + sort_clause = f" | sort by {', '.join(sort_cols_with_order)}" + + # Add take limit + kql_query = f"{base_query}{sort_clause} | take {size}" + + logger.info(f"Executing Kusto query: {kql_query[:200]}...") + + # Execute query + df = self.query(kql_query) + + # Convert to Arrow + arrow_table = pa.Table.from_pandas(df, preserve_index=False) + + logger.info(f"Fetched {arrow_table.num_rows} rows from Kusto") + + return arrow_table + + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: query = ".show tables" tables_df = self.query(query) @@ -195,71 +226,4 @@ def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: "metadata": table_metadata }) - return tables - - def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000, sort_columns: List[str] = None, sort_order: str = 'asc') -> pd.DataFrame: - if name_as is None: - name_as = table_name - - # Build sort clause for Kusto (KQL syntax) - sort_clause = "" - if sort_columns and len(sort_columns) > 0: - # Kusto uses | sort by col1 asc/desc syntax - order_direction = "desc" if sort_order == 'desc' else "asc" - sort_cols_with_order = [f"{col} {order_direction}" for col in sort_columns] - sort_clause = f" | sort by {', '.join(sort_cols_with_order)}" - - # Create a subquery that applies random ordering once with a fixed seed - total_rows_ingested = 0 - first_chunk = True - chunk_size = 100000 - - size_estimate_query = f"['{table_name}'] | take {10000} | summarize Total=sum(estimate_data_size(*))" - size_estimate_result = self.query(size_estimate_query) - size_estimate = size_estimate_result['Total'].values[0] - 
print(f"size_estimate: {size_estimate}") - - chunk_size = min(64 * 1024 * 1024 / size_estimate * 0.9 * 10000, 5000000) - print(f"estimated_chunk_size: {chunk_size}") - - while total_rows_ingested < size: - try: - # Apply sort if specified, then apply row numbering for pagination - query = f"['{table_name}']{sort_clause} | serialize | extend rn=row_number() | where rn >= {total_rows_ingested} and rn < {total_rows_ingested + chunk_size} | project-away rn" - chunk_df = self.query(query) - except Exception as e: - chunk_size = int(chunk_size * 0.8) - continue - - print(f"total_rows_ingested: {total_rows_ingested}") - print(chunk_df.head()) - - # Stop if no more data - if chunk_df.empty: - break - - # Sanitize the table name for SQL compatibility - name_as = sanitize_table_name(name_as) - - # For first chunk, create new table; for subsequent chunks, append - if first_chunk: - self.ingest_df_to_duckdb(chunk_df, name_as) - first_chunk = False - else: - # Append to existing table - random_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6)) - self.duck_db_conn.register(f'df_temp_{random_suffix}', chunk_df) - self.duck_db_conn.execute(f"INSERT INTO {name_as} SELECT * FROM df_temp_{random_suffix}") - self.duck_db_conn.execute(f"DROP VIEW df_temp_{random_suffix}") - - total_rows_ingested += len(chunk_df) - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - df = self.query(query).head(10) - return json.loads(df.to_json(orient="records", date_format='iso')) - - def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: - # Sanitize the table name for SQL compatibility - name_as = sanitize_table_name(name_as) - df = self.query(query) - self.ingest_df_to_duckdb(df, name_as) \ No newline at end of file + return tables \ No newline at end of file diff --git a/py-src/data_formulator/data_loader/mongodb_data_loader.py b/py-src/data_formulator/data_loader/mongodb_data_loader.py index 6b460354..cf8d84e6 100644 --- a/py-src/data_formulator/data_loader/mongodb_data_loader.py +++ b/py-src/data_formulator/data_loader/mongodb_data_loader.py @@ -1,23 +1,22 @@ import json -import string -import random as rand +import logging +from datetime import datetime import pandas as pd -import duckdb +import pyarrow as pa import pymongo from bson import ObjectId -from datetime import datetime from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name +from typing import Any -from data_formulator.security import validate_sql_query -from typing import Dict, Any, Optional, List +logger = logging.getLogger(__name__) class MongoDBDataLoader(ExternalDataLoader): @staticmethod - def list_params() -> bool: + def list_params() -> list[dict[str, Any]]: params_list = [ {"name": "host", "type": "string", "required": True, "default": "localhost", "description": ""}, {"name": "port", "type": "int", "required": False, "default": 27017, "description": "MongoDB server port (default 27017)"}, @@ -56,48 +55,46 @@ def auth_instructions() -> str: - Test connection: `mongosh --host [host] --port [port]` """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + def __init__(self, params: dict[str, Any]): self.params = params - self.duck_db_conn = duck_db_conn - + + self.host = self.params.get("host", "localhost") + self.port = int(self.params.get("port", 27017)) + self.username = self.params.get("username", "") + self.password = self.params.get("password", "") + self.database_name = self.params.get("database", "") + 
self.collection_name = self.params.get("collection", "") + auth_source = self.params.get("authSource", "") or self.database_name + try: - # Create MongoDB client - host = self.params.get("host", "localhost") - port = int(self.params.get("port", 27017)) - username = self.params.get("username", "") - password = self.params.get("password", "") - database = self.params.get("database", "") - collection = self.params.get("collection", "") - auth_source = self.params.get("authSource", "") or database # Default to target database - - if username and password: - # Use authSource to specify which database contains user credentials + if self.username and self.password: self.mongo_client = pymongo.MongoClient( - host=host, - port=port, - username=username, - password=password, + host=self.host, + port=self.port, + username=self.username, + password=self.password, authSource=auth_source ) else: - self.mongo_client = pymongo.MongoClient(host=host, port=port) - - self.db = self.mongo_client[database] - self.database_name = database - - self.collection = self.db[collection] if collection else None - + self.mongo_client = pymongo.MongoClient(host=self.host, port=self.port) + + self.db = self.mongo_client[self.database_name] + self.collection = self.db[self.collection_name] if self.collection_name else None + + logger.info(f"Successfully connected to MongoDB: {self.host}:{self.port}/{self.database_name}") + except Exception as e: - raise Exception(f"Failed to connect to MongoDB: {e}") + logger.error(f"Failed to connect to MongoDB: {e}") + raise RuntimeError(f"Failed to connect to MongoDB: {e}") from e def close(self): - """Close the MongoDB connection""" + """Close the MongoDB connection.""" if hasattr(self, 'mongo_client') and self.mongo_client is not None: try: self.mongo_client.close() self.mongo_client = None except Exception as e: - print(f"Warning: Failed to close MongoDB connection: {e}") + logger.warning(f"Failed to close MongoDB connection: {e}") def __enter__(self): """Context manager entry""" @@ -113,7 +110,7 @@ def __del__(self): self.close() @staticmethod - def _flatten_document(doc: Dict[str, Any], parent_key: str = '', sep: str = '_') -> Dict[str, Any]: + def _flatten_document(doc: dict[str, Any], parent_key: str = '', sep: str = '_') -> dict[str, Any]: """ Use recursion to flatten nested MongoDB documents """ @@ -139,7 +136,7 @@ def _flatten_document(doc: Dict[str, Any], parent_key: str = '', sep: str = '_') return dict(items) @staticmethod - def _convert_special_types(doc: Dict[str, Any]) -> Dict[str, Any]: + def _convert_special_types(doc: dict[str, Any]) -> dict[str, Any]: """ Convert MongoDB special types (ObjectId, datetime, etc.) to serializable types """ @@ -165,7 +162,7 @@ def _convert_special_types(doc: Dict[str, Any]) -> Dict[str, Any]: result[key] = value return result - def _process_documents(self, documents: List[Dict[str, Any]]) -> pd.DataFrame: + def _process_documents(self, documents: list[dict[str, Any]]) -> pd.DataFrame: """ Process MongoDB documents list, flatten and convert to DataFrame """ @@ -180,8 +177,64 @@ def _process_documents(self, documents: List[Dict[str, Any]]) -> pd.DataFrame: df = pd.DataFrame(processed_docs) return df + + def fetch_data_as_arrow( + self, + source_table: str, + size: int = 1000000, + sort_columns: list[str] | None = None, + sort_order: str = 'asc' + ) -> pa.Table: + """ + Fetch data from MongoDB as a PyArrow Table. + + MongoDB doesn't have native Arrow support, so we fetch documents, + process them, and convert to Arrow format. 
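+
+        Example (illustrative only; connection values and names are placeholders):
+
+            loader = MongoDBDataLoader({"host": "localhost", "port": 27017,
+                                        "database": "shop", "collection": "orders"})
+            orders = loader.fetch_data_as_arrow("orders", size=50_000)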
+ + Args: + source_table: Collection name to fetch from + size: Maximum number of documents to fetch + sort_columns: Columns to sort by + sort_order: Sort direction ('asc' or 'desc') + """ + if not source_table: + raise ValueError("source_table (collection name) must be provided") + + # Get collection + collection_name = source_table + # Handle full table names like "database.collection" + if '.' in collection_name: + parts = collection_name.split('.') + collection_name = parts[-1] + + collection = self.db[collection_name] + + logger.info(f"Fetching from MongoDB collection: {collection_name}") + + # Build cursor with optional sorting + data_cursor = collection.find() + if sort_columns and len(sort_columns) > 0: + sort_direction = -1 if sort_order == 'desc' else 1 + sort_spec = [(col, sort_direction) for col in sort_columns] + data_cursor = data_cursor.sort(sort_spec) + data_cursor = data_cursor.limit(size) + + # Fetch and process documents + data_list = list(data_cursor) + if not data_list: + logger.warning(f"No data found in MongoDB collection '{collection_name}'") + return pa.table({}) + + df = self._process_documents(data_list) + + # Convert to Arrow + arrow_table = pa.Table.from_pandas(df, preserve_index=False) + + logger.info(f"Fetched {arrow_table.num_rows} rows from MongoDB collection '{collection_name}'") - def list_tables(self, table_filter: str = None): + return arrow_table + + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: """ List all collections """ @@ -236,192 +289,7 @@ def list_tables(self, table_filter: str = None): "metadata": table_metadata }) except Exception as e: + logger.debug(f"Error listing collection {collection_name}: {e}") continue - - return results - - def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int = 100000, sort_columns: List[str] = None, sort_order: str = 'asc'): - """ - Import MongoDB collection data into DuckDB - """ - # Extract collection name from full table name - parts = table_name.split('.') - if len(parts) >= 3: - collection_name = parts[-1] - else: - collection_name = table_name - - if name_as is None: - name_as = collection_name - - # Get and process data from MongoDB (limit rows) - collection = self.db[collection_name] - - # Build cursor with optional sorting - data_cursor = collection.find() - if sort_columns and len(sort_columns) > 0: - # MongoDB sort format: 1 for ascending, -1 for descending - sort_direction = -1 if sort_order == 'desc' else 1 - sort_spec = [(col, sort_direction) for col in sort_columns] - data_cursor = data_cursor.sort(sort_spec) - data_cursor = data_cursor.limit(size) - - data_list = list(data_cursor) - if not data_list: - raise Exception(f"No data found in MongoDB collection '{collection_name}'.") - df = self._process_documents(data_list) - - name_as = sanitize_table_name(name_as) - - self._load_dataframe_to_duckdb(df, name_as, size) - return - - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - - self._existed_collections_in_duckdb() - self._difference_collections() - self._preload_all_collections(self.collection.name if self.collection else "") - - result, error_message = validate_sql_query(query) - if not result: - print(error_message) - raise ValueError(error_message) - - result_query = json.loads(self.duck_db_conn.execute(query).df().head(10).to_json(orient="records")) - - self._drop_all_loaded_tables() - - for collection_name, df in self.existed_collections.items(): - self._load_dataframe_to_duckdb(df, collection_name) - - return 
result_query - - def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: - """ - Create a new table from query results - """ - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - name_as = sanitize_table_name(name_as) - - self._existed_collections_in_duckdb() - self._difference_collections() - self._preload_all_collections(self.collection.name if self.collection else "") - - query_result_df = self.duck_db_conn.execute(query).df() - - self._drop_all_loaded_tables() - - for collection_name, existing_df in self.existed_collections.items(): - self._load_dataframe_to_duckdb(existing_df, collection_name) - - self._load_dataframe_to_duckdb(query_result_df, name_as) - - return query_result_df - - @staticmethod - def _quote_identifier(name: str) -> str: - """ - Safely quote a SQL identifier to prevent SQL injection. - Double quotes are escaped by doubling them. - """ - # Escape any double quotes in the identifier by doubling them - escaped = name.replace('"', '""') - return f'"{escaped}"' - - def _existed_collections_in_duckdb(self): - """ - Return the names and contents of tables already loaded into DuckDB - """ - self.existed_collections = {} - duckdb_tables = self.duck_db_conn.execute("SHOW TABLES").df() - for _, row in duckdb_tables.iterrows(): - collection_name = row['name'] - quoted_name = self._quote_identifier(collection_name) - df = self.duck_db_conn.execute(f"SELECT * FROM {quoted_name}").df() - self.existed_collections[collection_name] = df - - - def _difference_collections(self): - """ - Return the difference between all collections and loaded collections - """ - self.diff_collections = [] - all_collections = set(self.db.list_collection_names()) - loaded_collections = set(self.existed_collections) - diff_collections = all_collections - loaded_collections - self.diff_collections = list(diff_collections) - print(f'Difference collections: {self.diff_collections}') - - def _drop_all_loaded_tables(self): - """ - Drop all tables loaded into DuckDB - """ - for table_name in self.loaded_tables.values(): - try: - quoted_name = self._quote_identifier(table_name) - self.duck_db_conn.execute(f"DROP TABLE IF EXISTS main.{quoted_name}") - print(f"Dropped loaded table: {table_name}") - except Exception as e: - print(f"Warning: Failed to drop table '{table_name}': {e}") - - def _preload_all_collections(self, specified_collection: str = "", size: int = 100000): - """ - Preload all MongoDB collections into DuckDB memory - """ - # Get the list of collections to load - if specified_collection: - collection_names = [specified_collection] - else: - collection_names = self.db.list_collection_names() - - # Record loaded tables - self.loaded_tables = {} - - for collection_name in collection_names: - try: - collection = self.db[collection_name] - - # Get data - data_cursor = collection.find().limit(size) - data_list = list(data_cursor) - - if not data_list: - print(f"Skipping empty collection: {collection_name}") - continue - - df = self._process_documents(data_list) - - # Generate table name - table_name = sanitize_table_name(collection_name) - - # Load into DuckDB - self._load_dataframe_to_duckdb(df, table_name) - - # Record mapping - self.loaded_tables[collection_name] = table_name - print(f"Preloaded collection '{collection_name}' as table '{table_name}' ({len(data_list)} rows)") - - except Exception as e: - print(f"Warning: Failed to preload collection '{collection_name}': {e}") - - def _load_dataframe_to_duckdb(self, df: pd.DataFrame, 
table_name: str, size: int = 1000000): - """ - Load DataFrame into DuckDB - """ - # Create table using a temporary view - random_suffix = ''.join(rand.choices(string.ascii_letters + string.digits, k=6)) - temp_view_name = f'df_temp_{random_suffix}' - self.duck_db_conn.register(temp_view_name, df) - # Use CREATE OR REPLACE to directly replace existing table - # Quote identifiers to prevent SQL injection - quoted_table_name = self._quote_identifier(table_name) - quoted_temp_view = self._quote_identifier(temp_view_name) - # Ensure size is an integer to prevent injection via size parameter - safe_size = int(size) - self.duck_db_conn.execute(f"CREATE OR REPLACE TABLE main.{quoted_table_name} AS SELECT * FROM {quoted_temp_view} LIMIT {safe_size}") - self.duck_db_conn.execute(f"DROP VIEW {quoted_temp_view}") \ No newline at end of file + return results \ No newline at end of file diff --git a/py-src/data_formulator/data_loader/mssql_data_loader.py b/py-src/data_formulator/data_loader/mssql_data_loader.py index 1f18a794..c73e93a8 100644 --- a/py-src/data_formulator/data_loader/mssql_data_loader.py +++ b/py-src/data_formulator/data_loader/mssql_data_loader.py @@ -1,25 +1,19 @@ import json import logging -from typing import Dict, Any, Optional, List +from typing import Any -import duckdb import pandas as pd - -try: - import pyodbc - PYODBC_AVAILABLE = True -except ImportError: - PYODBC_AVAILABLE = False +import pyarrow as pa +import connectorx as cx from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name -from data_formulator.security import validate_sql_query log = logging.getLogger(__name__) class MSSQLDataLoader(ExternalDataLoader): @staticmethod - def list_params() -> bool: + def list_params() -> list[dict[str, Any]]: params_list = [ { "name": "server", @@ -93,14 +87,14 @@ def auth_instructions() -> str: SQL Server Connection Instructions: 1. Prerequisites: - - Install pyodbc dependencies: + - Install connectorx: pip install connectorx (used for fast Arrow-native data access) + - Install ODBC stack for connectorx: * macOS: brew install unixodbc * Linux: sudo apt-get install unixodbc-dev (Ubuntu/Debian) or sudo yum install unixODBC-devel (CentOS/RHEL) - * Windows: Usually included with pyodbc installation - - Install pyodbc: pip install pyodbc + * Windows: Usually included with ODBC driver installation - Install Microsoft ODBC Driver for SQL Server: * Windows: Usually pre-installed with SQL Server - * macOS: Download from Microsoft's official site or use: brew tap microsoft/mssql-release && brew install msodbcsql17 + * macOS: brew tap microsoft/mssql-release && brew install msodbcsql17 * Linux: Install via package manager (msodbcsql17 or msodbcsql18) 2. Local SQL Server Setup: @@ -128,120 +122,99 @@ def auth_instructions() -> str: - Custom port: server='localhost,1434' (note the comma, not colon) 6. 
Common Issues & Troubleshooting: - - If pyodbc import fails: Install unixodbc first (macOS/Linux) + - If connectorx fails: Install unixodbc first (macOS/Linux) - Ensure SQL Server service is running - Check SQL Server Browser service for named instances - Verify TCP/IP protocol is enabled in SQL Server Configuration Manager - Check Windows Firewall settings for SQL Server port - - Test connection: `sqlcmd -S server -d database -U username -P password` + - Test connection: sqlcmd -S server -d database -U username -P password - For named instances, ensure SQL Server Browser service is running - - Check ODBC drivers: `odbcinst -q -d` (on Unix/Linux) + - Check ODBC drivers: odbcinst -q -d (on Unix/Linux) 7. Driver Installation: - - macOS: `brew install msodbcsql17` or download from Microsoft - - Ubuntu/Debian: `sudo apt-get install msodbcsql17` - - CentOS/RHEL: `sudo yum install msodbcsql17` + - macOS: brew install msodbcsql17 or download from Microsoft + - Ubuntu/Debian: sudo apt-get install msodbcsql17 + - CentOS/RHEL: sudo yum install msodbcsql17 - Windows: Install SQL Server or download ODBC driver separately """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): - log.info("Initializing MSSQL DataLoader with parameters: %s", params) - - if not PYODBC_AVAILABLE: - raise ImportError( - "pyodbc is required for MSSQL connections. " - "Install with: pip install pyodbc\n" - "Note for macOS: You may also need to run 'brew install unixodbc' first.\n" - "For other platforms, see: https://github.com/mkleehammer/pyodbc/wiki" - ) + def __init__(self, params: dict[str, Any]): + log.info(f"Initializing MSSQL DataLoader with parameters: {params}") self.params = params - self.duck_db_conn = duck_db_conn - - # Build connection string for pyodbc - self.connection_string = self._build_connection_string() - log.info("SQL Server connection string built") - - # Test the connection - self._test_connection() - - def _build_connection_string(self) -> str: - """Build ODBC connection string from parameters""" - conn_parts = [] - - # Driver - driver = self.params.get("driver", "ODBC Driver 17 for SQL Server") - conn_parts.append(f"DRIVER={{{driver}}}") - - # Server (handle different server formats) - server = self.params.get("server", "localhost") - port = self.params.get("port", "1433") - - # Handle different server formats - if "\\" in server: - # Named instance format: server\instance - conn_parts.append(f"SERVER={server}") - elif "," in server: - # Port already specified in server: server,port - conn_parts.append(f"SERVER={server}") - else: - # Standard format: add port if not default - if port and port != "1433": - conn_parts.append(f"SERVER={server},{port}") - else: - conn_parts.append(f"SERVER={server}") - - # Database - database = self.params.get("database", "master") - conn_parts.append(f"DATABASE={database}") - - # Authentication - user = self.params.get("user", "").strip() - password = self.params.get("password", "").strip() - - if user: - conn_parts.append(f"UID={user}") - conn_parts.append(f"PWD={password}") - else: - # Use Windows Authentication - conn_parts.append("Trusted_Connection=yes") - # Connection settings - encrypt = self.params.get("encrypt", "yes") - trust_cert = self.params.get("trust_server_certificate", "no") - timeout = self.params.get("connection_timeout", "30") - - conn_parts.append(f"Encrypt={encrypt}") - conn_parts.append(f"TrustServerCertificate={trust_cert}") - conn_parts.append(f"Connection Timeout={timeout}") - - return ";".join(conn_parts) + 
self.server = params.get("server", "localhost") + self.database = params.get("database", "master") + self.user = params.get("user", "").strip() + self.password = params.get("password", "").strip() + self.port = params.get("port", "1433") + + # Build connection URL for connectorx: mssql://user:password@host:port/database + # - Use explicit empty password (user:@host) when user is set but password is blank. + # - Use 127.0.0.1 when server is localhost to force IPv4 TCP and avoid IPv6 ::1 connection issues. + server_for_url = "127.0.0.1" if (self.server or "").strip().lower() == "localhost" else self.server + if self.user: + self.connection_url = f"mssql://{self.user}:{self.password}@{server_for_url}:{self.port}/{self.database}?TrustServerCertificate=true" + else: + self.connection_url = f"mssql://{server_for_url}:{self.port}/{self.database}?TrustServerCertificate=true&IntegratedSecurity=true" - def _test_connection(self): - """Test the SQL Server connection""" try: - with pyodbc.connect(self.connection_string, timeout=10) as conn: - cursor = conn.cursor() - cursor.execute("SELECT @@VERSION") - version = cursor.fetchone()[0] - log.info(f"SQL Server connection successful. Version: {version[:50]}...") + cx.read_sql(self.connection_url, "SELECT 1", return_type="arrow") + log.info(f"Successfully connected to SQL Server: {self.server}/{self.database}") except Exception as e: - log.error(f"SQL Server connection test failed: {e}") - raise ConnectionError(f"Failed to connect to SQL Server: {e}") + log.error(f"Failed to connect to SQL Server: {e}") + raise ValueError(f"Failed to connect to SQL Server '{self.server}': {e}") from e - def _execute_query(self, query: str) -> pd.DataFrame: - """Execute a query and return results as DataFrame""" + def _execute_query(self, query: str) -> pa.Table: + """Execute a query and return results as a PyArrow Table (via connectorx).""" try: - with pyodbc.connect(self.connection_string) as conn: - return pd.read_sql(query, conn) + return cx.read_sql(self.connection_url, query, return_type="arrow") except Exception as e: log.error(f"Failed to execute query: {e}") raise - def list_tables(self): - """List all tables from SQL Server database""" + def fetch_data_as_arrow( + self, + source_table: str, + size: int = 1000000, + sort_columns: list[str] | None = None, + sort_order: str = 'asc' + ) -> pa.Table: + """ + Fetch data from SQL Server as a PyArrow Table using connectorx. + """ + if not source_table: + raise ValueError("source_table must be provided") + + # Parse table name + if "." 
in source_table: + schema, table = source_table.split(".", 1) + else: + schema = "dbo" + table = source_table + + base_query = f"SELECT * FROM [{schema}].[{table}]" + + # Add ORDER BY if sort columns specified + order_by_clause = "" + if sort_columns and len(sort_columns) > 0: + order_direction = "DESC" if sort_order == 'desc' else "ASC" + sanitized_cols = [f'[{col}] {order_direction}' for col in sort_columns] + order_by_clause = f" ORDER BY {', '.join(sanitized_cols)}" + + # SQL Server uses TOP instead of LIMIT + query = f"SELECT TOP {size} * FROM ({base_query}{order_by_clause}) AS limited" + + log.info(f"Executing SQL Server query: {query[:200]}...") + + arrow_table = cx.read_sql(self.connection_url, query, return_type="arrow") + log.info(f"Fetched {arrow_table.num_rows} rows from SQL Server [Arrow-native]") + + return arrow_table + + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: + """List all tables from SQL Server database.""" try: - # Query SQL Server system tables to get table information tables_query = """ SELECT TABLE_SCHEMA, @@ -253,7 +226,7 @@ def list_tables(self): ORDER BY TABLE_SCHEMA, TABLE_NAME """ - tables_df = self._execute_query(tables_query) + tables_df = self._execute_query(tables_query).to_pandas() results = [] for _, row in tables_df.iterrows(): @@ -262,6 +235,9 @@ def list_tables(self): table_type = row.get("TABLE_TYPE", "BASE TABLE") full_table_name = f"{schema}.{table_name}" + if table_filter and table_filter.lower() not in full_table_name.lower(): + continue + try: # Get column information columns_query = f""" @@ -277,7 +253,7 @@ def list_tables(self): WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table_name}' ORDER BY ORDINAL_POSITION """ - columns_df = self._execute_query(columns_query) + columns_df = self._execute_query(columns_query).to_pandas() columns = [] for _, col_row in columns_df.iterrows(): @@ -320,7 +296,7 @@ def list_tables(self): # Get sample data (first 10 rows) sample_query = f"SELECT TOP 10 * FROM [{schema}].[{table_name}]" - sample_df = self._execute_query(sample_query) + sample_df = self._execute_query(sample_query).to_pandas() # Handle NaN values in sample data for JSON serialization try: @@ -339,7 +315,7 @@ def list_tables(self): # Get row count count_query = f"SELECT COUNT(*) as row_count FROM [{schema}].[{table_name}]" - count_df = self._execute_query(count_query) + count_df = self._execute_query(count_query).to_pandas() # Handle NaN values in row count raw_count = count_df.iloc[0]["row_count"] @@ -386,80 +362,3 @@ def list_tables(self): results = [] return results - - def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): - """Ingest data from SQL Server table into DuckDB""" - # Parse table name (assuming format: schema.table) - if "." 
in table_name: - schema, table = table_name.split(".", 1) - else: - schema = "dbo" # Default schema - table = table_name - - if name_as is None: - name_as = table - - name_as = sanitize_table_name(name_as) - - try: - # Build ORDER BY clause if sort_columns are specified - order_by_clause = "" - if sort_columns and len(sort_columns) > 0: - # Use square brackets for SQL Server column quoting - order_direction = "DESC" if sort_order == 'desc' else "ASC" - sanitized_cols = [f'[{col}] {order_direction}' for col in sort_columns] - order_by_clause = f"ORDER BY {', '.join(sanitized_cols)}" - - # Query data from SQL Server with limit - query = f"SELECT TOP {size} * FROM [{schema}].[{table}] {order_by_clause}" - df = self._execute_query(query) - - # Use the base class method to ingest DataFrame into DuckDB - self.ingest_df_to_duckdb(df, name_as) - log.info(f"Successfully ingested {len(df)} rows from {schema}.{table} to {name_as}") - except Exception as e: - log.error(f"Failed to ingest data from {table_name}: {e}") - raise - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - """Execute a custom query and return sample results""" - try: - # Add TOP 10 if not already present for SELECT queries - modified_query = query.strip() - if ( - modified_query.upper().startswith("SELECT") - and not modified_query.upper().startswith("SELECT TOP") - and "TOP " not in modified_query.upper()[:50] - ): # Check first 50 chars - modified_query = modified_query.replace("SELECT", "SELECT TOP 10", 1) - - result, error_message = validate_sql_query(modified_query) - if not result: - raise ValueError(error_message) - - df = self._execute_query(modified_query) - - # Handle NaN values for JSON serialization - df_clean = df.fillna(value=None) - return json.loads( - df_clean.head(10).to_json(orient="records", date_format="iso", default_handler=str) - ) - except Exception as e: - log.error(f"Failed to execute query sample: {e}") - raise - - def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: - """Execute a custom query and ingest results into DuckDB""" - try: - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - df = self._execute_query(query) - # Use the base class's method to ingest the DataFrame - self.ingest_df_to_duckdb(df, sanitize_table_name(name_as)) - log.info(f"Successfully ingested {len(df)} rows from custom query to {name_as}") - return df - except Exception as e: - log.error(f"Failed to execute and ingest custom query: {e}") - raise diff --git a/py-src/data_formulator/data_loader/mysql_data_loader.py b/py-src/data_formulator/data_loader/mysql_data_loader.py index 0430a57a..f10180c9 100644 --- a/py-src/data_formulator/data_loader/mysql_data_loader.py +++ b/py-src/data_formulator/data_loader/mysql_data_loader.py @@ -1,19 +1,12 @@ import json import logging +from typing import Any import pandas as pd -import duckdb +import pyarrow as pa +import connectorx as cx -from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name - -from data_formulator.security import validate_sql_query -from typing import Dict, Any, Optional, List - -try: - import pymysql - PYMYSQL_AVAILABLE = True -except ImportError: - PYMYSQL_AVAILABLE = False +from data_formulator.data_loader.external_data_loader import ExternalDataLoader logger = logging.getLogger(__name__) @@ -21,7 +14,7 @@ class MySQLDataLoader(ExternalDataLoader): @staticmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> 
list[dict[str, Any]]: params_list = [ {"name": "user", "type": "string", "required": True, "default": "root", "description": ""}, {"name": "password", "type": "string", "required": False, "default": "", "description": "leave blank for no password"}, @@ -58,254 +51,159 @@ def auth_instructions() -> str: - Test connection: `mysql -u [username] -p -h [host] -P [port] [database]` """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): - if not PYMYSQL_AVAILABLE: - raise ImportError( - "pymysql is required for MySQL connections. " - "Install with: pip install pymysql" - ) - + def __init__(self, params: dict[str, Any]): self.params = params - self.duck_db_conn = duck_db_conn - - # Get params as-is from frontend - host = self.params.get('host', '') - user = self.params.get('user', '') - password = self.params.get('password', '') - database = self.params.get('database', '') - - # Validate required params - if not host: + + self.host = self.params.get("host", "") + self.user = self.params.get("user", "") + self.password = self.params.get("password", "") + self.database = self.params.get("database", "") + + if not self.host: raise ValueError("MySQL host is required") - if not user: + if not self.user: raise ValueError("MySQL user is required") - if not database: + if not self.database: raise ValueError("MySQL database is required") - - # Handle port (only field with sensible default) - port = self.params.get('port', '') + + port = self.params.get("port", "") if isinstance(port, str): - port = int(port) if port else 3306 + self.port = int(port) if port else 3306 elif not port: - port = 3306 + self.port = 3306 + else: + self.port = int(port) - try: - self.mysql_conn = pymysql.connect( - host=host, - user=user, - password=password, - database=database, - port=port, - cursorclass=pymysql.cursors.DictCursor, - charset='utf8mb4' - ) - self.database = database - logger.info(f"Successfully connected to MySQL database: {self.database}") - except Exception as e: - logger.error(f"Failed to connect to MySQL: {e}") - raise - - def _execute_query(self, query: str, params: tuple = None) -> pd.DataFrame: - """Execute a query using native MySQL connection and return a DataFrame. + # Build connection URL for connectorx + # Format: mysql://user:password@host:port/database + # - Use explicit empty password (user:@host) so the URL parser sees user vs password correctly. + # - Use 127.0.0.1 when host is localhost to force IPv4 TCP and avoid IPv6 ::1 connection issues. + host_for_url = "127.0.0.1" if (self.host or "").strip().lower() == "localhost" else self.host + if self.password: + self.connection_url = f"mysql://{self.user}:{self.password}@{host_for_url}:{self.port}/{self.database}" + else: + self.connection_url = f"mysql://{self.user}:@{host_for_url}:{self.port}/{self.database}" - Args: - query: SQL query string. Use %s for parameterized queries. - params: Optional tuple of parameters for parameterized queries. 
- """ - try: - with self.mysql_conn.cursor() as cursor: - cursor.execute(query, params) - rows = cursor.fetchall() - if rows: - return pd.DataFrame(rows) - else: - # Return empty DataFrame with column names - return pd.DataFrame() - except Exception as e: - logger.error(f"Error executing MySQL query: {e}") - # Try to reconnect if connection was lost - self._reconnect_if_needed() - raise - - def _reconnect_if_needed(self): - """Attempt to reconnect to MySQL if the connection was lost.""" + self._sanitized_url = f"mysql://{self.user}:***@{self.host}:{self.port}/{self.database}" + + # Test connection try: - self.mysql_conn.ping(reconnect=True) + cx.read_sql(self.connection_url, "SELECT 1", return_type="arrow") except Exception as e: - logger.warning(f"Reconnection attempt failed: {e}") - # Try to create a new connection using stored params - host = self.params.get('host', '') - user = self.params.get('user', '') - password = self.params.get('password', '') - - port = self.params.get('port', '') - if isinstance(port, str): - port = int(port) if port else 3306 - elif not port: - port = 3306 - - self.mysql_conn = pymysql.connect( - host=host, - user=user, - password=password, - database=self.database, - port=port, - cursorclass=pymysql.cursors.DictCursor, - charset='utf8mb4' - ) - - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: - # Get list of tables from the connected database - # Filter by the specific database we're connected to for better performance - tables_query = """ - SELECT TABLE_SCHEMA, TABLE_NAME - FROM information_schema.tables - WHERE TABLE_SCHEMA = %s - AND TABLE_TYPE = 'BASE TABLE' + logger.error(f"Failed to connect to MySQL (mysql://{self.user}:***@{self.host}:{self.port}/{self.database}): {e}") + raise ValueError(f"Failed to connect to MySQL database '{self.database}' on host '{self.host}': {e}") from e + logger.info(f"Successfully connected to MySQL: mysql://{self.user}:***@{self.host}:{self.port}/{self.database}") + + def fetch_data_as_arrow( + self, + source_table: str, + size: int = 1000000, + sort_columns: list[str] | None = None, + sort_order: str = 'asc' + ) -> pa.Table: """ - tables_df = self._execute_query(tables_query, (self.database,)) + Fetch data from MySQL as a PyArrow Table using connectorx. - if tables_df.empty: - return [] - - results = [] + connectorx provides extremely fast Arrow-native database access. 
+ """ + if not source_table: + raise ValueError("source_table must be provided") - for _, row in tables_df.iterrows(): - schema = row['TABLE_SCHEMA'] - table_name = row['TABLE_NAME'] - - # Apply table filter if provided - if table_filter and table_filter.lower() not in table_name.lower(): - continue - - full_table_name = f"{schema}.{table_name}" - - try: - # Get column information from MySQL - columns_query = ( - "SELECT COLUMN_NAME, DATA_TYPE " - "FROM information_schema.columns " - "WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s " - "ORDER BY ORDINAL_POSITION" - ) - columns_df = self._execute_query(columns_query, (schema, table_name)) - columns = [{ - 'name': col_row['COLUMN_NAME'], - 'type': col_row['DATA_TYPE'] - } for _, col_row in columns_df.iterrows()] - - # Get sample data - sample_query = "SELECT * FROM `{}`.`{}` LIMIT 10".format(schema, table_name) - sample_df = self._execute_query(sample_query) - sample_rows = json.loads(sample_df.to_json(orient="records", date_format='iso')) - - # Get row count - count_query = "SELECT COUNT(*) as cnt FROM `{}`.`{}`".format(schema, table_name) - count_df = self._execute_query(count_query) - row_count = int(count_df['cnt'].iloc[0]) if not count_df.empty else 0 - - table_metadata = { - "row_count": row_count, - "columns": columns, - "sample_rows": sample_rows - } - - results.append({ - "name": full_table_name, - "metadata": table_metadata - }) - except Exception as e: - logger.warning(f"Error processing table {full_table_name}: {e}") - continue - - return results - - def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): - """Fetch data from MySQL and ingest into DuckDB.""" - if name_as is None: - name_as = table_name.split('.')[-1] - - name_as = sanitize_table_name(name_as) - - # Validate and sanitize table name components - sanitized_size = None - try: - sanitized_size = int(size) - if sanitized_size <= 0: - raise ValueError("Size must be a positive integer.") - except Exception: - raise ValueError("Size parameter must be a positive integer.") - - # Build ORDER BY clause if sort_columns are specified + # Handle table names + if '.' in source_table: + base_query = f"SELECT * FROM {source_table}" + else: + base_query = f"SELECT * FROM `{source_table}`" + + # Add ORDER BY if sort columns specified order_by_clause = "" if sort_columns and len(sort_columns) > 0: - # Use backticks for MySQL column quoting order_direction = "DESC" if sort_order == 'desc' else "ASC" sanitized_cols = [f'`{col}` {order_direction}' for col in sort_columns] - order_by_clause = f"ORDER BY {', '.join(sanitized_cols)}" - - if '.' 
in table_name: - parts = table_name.split('.') - schema = sanitize_table_name(parts[0]) - tbl = sanitize_table_name(parts[1]) - query = f"SELECT * FROM `{schema}`.`{tbl}` {order_by_clause} LIMIT {sanitized_size}" - else: - sanitized_table_name = sanitize_table_name(table_name) - query = f"SELECT * FROM `{sanitized_table_name}` {order_by_clause} LIMIT {sanitized_size}" - - # Fetch data from MySQL - df = self._execute_query(query) + order_by_clause = f" ORDER BY {', '.join(sanitized_cols)}" - if df.empty: - logger.warning(f"No data fetched from table {table_name}") - return + query = f"{base_query}{order_by_clause} LIMIT {size}" - # Ingest into DuckDB using the base class method - self.ingest_df_to_duckdb(df, name_as) - logger.info(f"Successfully ingested {len(df)} rows from {table_name} into DuckDB table {name_as}") - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) + logger.info(f"Executing MySQL query via connectorx: {query[:200]}...") - # Execute query via native MySQL connection - df = self._execute_query(query) - return json.loads(df.head(10).to_json(orient="records", date_format='iso')) - - def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: - """Execute custom query and ingest results into DuckDB.""" - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) + arrow_table = cx.read_sql(self.connection_url, query, return_type="arrow") - # Execute query via native MySQL connection - df = self._execute_query(query) + logger.info(f"Fetched {arrow_table.num_rows} rows from MySQL [Arrow-native]") - # Ingest into DuckDB using the base class method - self.ingest_df_to_duckdb(df, sanitize_table_name(name_as)) - return df - - def close(self): - """Explicitly close the MySQL connection.""" - if hasattr(self, 'mysql_conn') and self.mysql_conn: - try: - self.mysql_conn.close() - except Exception as e: - logger.warning(f"Error closing MySQL connection: {e}") - - def __enter__(self): - """Support context manager entry.""" - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - """Support context manager exit and cleanup.""" - self.close() - - def __del__(self): - """Clean up MySQL connection when the loader is destroyed.""" + return arrow_table + + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: + """List available tables from MySQL database.""" + return self._list_tables_connectorx(table_filter) + + def _list_tables_connectorx(self, table_filter: str | None = None) -> list[dict[str, Any]]: + """List tables using connectorx.""" try: - self.close() - except Exception: - # Ignore errors during destruction to prevent exceptions in garbage collection - pass \ No newline at end of file + tables_query = f""" + SELECT TABLE_SCHEMA, TABLE_NAME + FROM information_schema.tables + WHERE TABLE_SCHEMA = '{self.database}' + AND TABLE_TYPE = 'BASE TABLE' + """ + tables_arrow = cx.read_sql(self.connection_url, tables_query, return_type="arrow") + tables_df = tables_arrow.to_pandas() + + if tables_df.empty: + return [] + + results = [] + + for _, row in tables_df.iterrows(): + schema = row['TABLE_SCHEMA'] + table_name = row['TABLE_NAME'] + + if table_filter and table_filter.lower() not in table_name.lower(): + continue + + full_table_name = f"{schema}.{table_name}" + + try: + # Get column information + columns_query = f""" + SELECT COLUMN_NAME, DATA_TYPE + FROM information_schema.columns 
+ WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table_name}' + ORDER BY ORDINAL_POSITION + """ + columns_arrow = cx.read_sql(self.connection_url, columns_query, return_type="arrow") + columns_df = columns_arrow.to_pandas() + columns = [{ + 'name': col_row['COLUMN_NAME'], + 'type': col_row['DATA_TYPE'] + } for _, col_row in columns_df.iterrows()] + + # Get sample data + sample_query = f"SELECT * FROM `{schema}`.`{table_name}` LIMIT 10" + sample_arrow = cx.read_sql(self.connection_url, sample_query, return_type="arrow") + sample_df = sample_arrow.to_pandas() + sample_rows = json.loads(sample_df.to_json(orient="records", date_format='iso')) + + # Get row count + count_query = f"SELECT COUNT(*) as cnt FROM `{schema}`.`{table_name}`" + count_arrow = cx.read_sql(self.connection_url, count_query, return_type="arrow") + row_count = int(count_arrow.to_pandas()['cnt'].iloc[0]) + + table_metadata = { + "row_count": row_count, + "columns": columns, + "sample_rows": sample_rows + } + + results.append({ + "name": full_table_name, + "metadata": table_metadata + }) + except Exception as e: + logger.warning(f"Error processing table {full_table_name}: {e}") + continue + + return results + + except Exception as e: + logger.error(f"Error listing tables: {e}") + return [] \ No newline at end of file diff --git a/py-src/data_formulator/data_loader/postgresql_data_loader.py b/py-src/data_formulator/data_loader/postgresql_data_loader.py index b327a737..779aaf84 100644 --- a/py-src/data_formulator/data_loader/postgresql_data_loader.py +++ b/py-src/data_formulator/data_loader/postgresql_data_loader.py @@ -1,171 +1,183 @@ -import json - -import pandas as pd -import duckdb - -from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name - -from typing import Dict, Any, List, Optional -from data_formulator.security import validate_sql_query - -class PostgreSQLDataLoader(ExternalDataLoader): - - @staticmethod - def list_params() -> List[Dict[str, Any]]: - params_list = [ - {"name": "user", "type": "string", "required": True, "default": "postgres", "description": "PostgreSQL username"}, - {"name": "password", "type": "string", "required": False, "default": "", "description": "leave blank for no password"}, - {"name": "host", "type": "string", "required": True, "default": "localhost", "description": "PostgreSQL host"}, - {"name": "port", "type": "string", "required": False, "default": "5432", "description": "PostgreSQL port"}, - {"name": "database", "type": "string", "required": True, "default": "postgres", "description": "PostgreSQL database name"} - ] - return params_list - - @staticmethod - def auth_instructions() -> str: - return "Provide your PostgreSQL connection details. The user must have SELECT permissions on the tables you want to access." 
- - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): - self.params = params - self.duck_db_conn = duck_db_conn - - # Get params as-is from frontend - host = self.params.get('host', '') - port = self.params.get('port', '') or '5432' # Only port has a sensible default - user = self.params.get('user', '') - database = self.params.get('database', '') - password = self.params.get('password', '') - - # Validate required params - if not host: - raise ValueError("PostgreSQL host is required") - if not user: - raise ValueError("PostgreSQL user is required") - if not database: - raise ValueError("PostgreSQL database is required") - - # Create a sanitized version for logging (excludes password) - sanitized_attach_string = f"host={host} port={port} user={user} dbname={database}" - - try: - # Install and load the Postgres extension - self.duck_db_conn.install_extension("postgres") - self.duck_db_conn.load_extension("postgres") - - # Prepare the connection string for Postgres - # Note: attach_string contains sensitive credentials - do not log it - password_part = f" password={password}" if password else "" - attach_string = f"host={host} port={port} user={user}{password_part} dbname={database}" - - # Detach existing postgres connection if it exists - try: - self.duck_db_conn.execute("DETACH mypostgresdb;") - except: - pass # Ignore if connection doesn't exist - - # Register Postgres connection - self.duck_db_conn.execute(f"ATTACH '{attach_string}' AS mypostgresdb (TYPE postgres);") - print(f"Successfully connected to PostgreSQL database: {database}") - - except Exception as e: - # Log error with sanitized connection string to avoid exposing password - error_type = type(e).__name__ - print(f"Failed to connect to PostgreSQL ({sanitized_attach_string}): {error_type}") - raise ValueError(f"Failed to connect to PostgreSQL database '{database}' on host '{host}': {error_type}") - - def list_tables(self): - try: - # Query tables through DuckDB's attached PostgreSQL connection - tables_df = self.duck_db_conn.execute(""" - SELECT table_schema as schemaname, table_name as tablename - FROM mypostgresdb.information_schema.tables - WHERE table_schema NOT IN ('information_schema', 'pg_catalog', 'pg_toast') - AND table_schema NOT LIKE '%_intern%' - AND table_schema NOT LIKE '%timescaledb%' - AND table_name NOT LIKE '%/%' - AND table_type = 'BASE TABLE' - ORDER BY table_schema, table_name - """).fetch_df() - - print(f"Found tables: {tables_df}") - - results = [] - - for schema, table_name in tables_df.values: - full_table_name = f"mypostgresdb.{schema}.{table_name}" - - try: - # Get column information using DuckDB's DESCRIBE - columns_df = self.duck_db_conn.execute(f"DESCRIBE {full_table_name}").df() - columns = [{ - 'name': row['column_name'], - 'type': row['column_type'] - } for _, row in columns_df.iterrows()] - - # Get sample data - sample_df = self.duck_db_conn.execute(f"SELECT * FROM {full_table_name} LIMIT 10").df() - sample_rows = json.loads(sample_df.to_json(orient="records")) - - # Get row count - row_count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM {full_table_name}").fetchone()[0] - - table_metadata = { - "row_count": row_count, - "columns": columns, - "sample_rows": sample_rows - } - - results.append({ - "name": full_table_name, - "metadata": table_metadata - }) - - except Exception as e: - print(f"Error processing table {full_table_name}: {e}") - continue - - return results - - except Exception as e: - print(f"Error listing tables: {e}") - return [] - - def 
ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): - # Create table in the main DuckDB database from Postgres data - if name_as is None: - name_as = table_name.split('.')[-1] - - name_as = sanitize_table_name(name_as) - - # Build ORDER BY clause if sort_columns are specified - order_by_clause = "" - if sort_columns and len(sort_columns) > 0: - # Sanitize column names to prevent SQL injection - order_direction = "DESC" if sort_order == 'desc' else "ASC" - sanitized_cols = [f'"{col}" {order_direction}' for col in sort_columns] - order_by_clause = f"ORDER BY {', '.join(sanitized_cols)}" - - self.duck_db_conn.execute(f""" - CREATE OR REPLACE TABLE main.{name_as} AS - SELECT * FROM {table_name} - {order_by_clause} - LIMIT {size} - """) - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - return json.loads(self.duck_db_conn.execute(query).df().head(10).to_json(orient="records")) - - def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: - # Execute the query and get results as a DataFrame - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - df = self.duck_db_conn.execute(query).df() - # Use the base class's method to ingest the DataFrame - self.ingest_df_to_duckdb(df, sanitize_table_name(name_as)) - return df +import json +import logging +from typing import Any + +import pandas as pd +import pyarrow as pa +import connectorx as cx + +from data_formulator.data_loader.external_data_loader import ExternalDataLoader + +logger = logging.getLogger(__name__) + + +class PostgreSQLDataLoader(ExternalDataLoader): + + @staticmethod + def list_params() -> list[dict[str, Any]]: + params_list = [ + {"name": "user", "type": "string", "required": True, "default": "postgres", "description": "PostgreSQL username"}, + {"name": "password", "type": "string", "required": False, "default": "", "description": "leave blank for no password"}, + {"name": "host", "type": "string", "required": True, "default": "localhost", "description": "PostgreSQL host"}, + {"name": "port", "type": "string", "required": False, "default": "5432", "description": "PostgreSQL port"}, + {"name": "database", "type": "string", "required": True, "default": "postgres", "description": "PostgreSQL database name"} + ] + return params_list + + @staticmethod + def auth_instructions() -> str: + return "Provide your PostgreSQL connection details. The user must have SELECT permissions on the tables you want to access. Uses connectorx for fast Arrow-native data access." + + def __init__(self, params: dict[str, Any]): + self.params = params + + self.host = self.params.get("host", "") + self.port = self.params.get("port", "") or "5432" + self.user = self.params.get("user", "") + self.database = self.params.get("database", "") + self.password = self.params.get("password", "") + + if not self.host: + raise ValueError("PostgreSQL host is required") + if not self.user: + raise ValueError("PostgreSQL user is required") + if not self.database: + raise ValueError("PostgreSQL database is required") + + # Build connection URL for connectorx: postgresql://user:password@host:port/database + # - Use explicit empty password (user:@host) so the URL parser sees user vs password correctly. + # - Use 127.0.0.1 when host is localhost to force IPv4 TCP and avoid IPv6 ::1 connection issues. 
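+        # For example (hypothetical values): user="postgres", database="sales", port="5432",
+        # host="localhost" and a blank password produce the URL
+        #   postgresql://postgres:@127.0.0.1:5432/sales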
+ host_for_url = "127.0.0.1" if (self.host or "").strip().lower() == "localhost" else self.host + if self.password: + self.connection_url = f"postgresql://{self.user}:{self.password}@{host_for_url}:{self.port}/{self.database}" + else: + self.connection_url = f"postgresql://{self.user}:@{host_for_url}:{self.port}/{self.database}" + + try: + cx.read_sql(self.connection_url, "SELECT 1", return_type="arrow") + except Exception as e: + logger.error(f"Failed to connect to PostgreSQL (postgresql://{self.user}:***@{self.host}:{self.port}/{self.database}): {e}") + raise ValueError(f"Failed to connect to PostgreSQL database '{self.database}' on host '{self.host}': {e}") from e + logger.info(f"Successfully connected to PostgreSQL: postgresql://{self.user}:***@{self.host}:{self.port}/{self.database}") + + def fetch_data_as_arrow( + self, + source_table: str, + size: int = 1000000, + sort_columns: list[str] | None = None, + sort_order: str = 'asc' + ) -> pa.Table: + """ + Fetch data from PostgreSQL as a PyArrow Table using connectorx. + + connectorx provides extremely fast Arrow-native database access, + typically 2-10x faster than pandas-based approaches. + """ + if not source_table: + raise ValueError("source_table must be provided") + + # Handle table names like "mypostgresdb.schema.table" -> "schema.table" + table_ref = source_table + if source_table.startswith("mypostgresdb."): + table_ref = source_table[len("mypostgresdb."):] + base_query = f"SELECT * FROM {table_ref}" + + # Add ORDER BY if sort columns specified + order_by_clause = "" + if sort_columns and len(sort_columns) > 0: + order_direction = "DESC" if sort_order == 'desc' else "ASC" + sanitized_cols = [f'"{col}" {order_direction}' for col in sort_columns] + order_by_clause = f" ORDER BY {', '.join(sanitized_cols)}" + + # Build full query with limit + query = f"{base_query}{order_by_clause} LIMIT {size}" + + logger.info(f"Executing PostgreSQL query via connectorx: {query[:200]}...") + + # Execute with connectorx - returns Arrow table directly + arrow_table = cx.read_sql(self.connection_url, query, return_type="arrow") + + logger.info(f"Fetched {arrow_table.num_rows} rows from PostgreSQL [Arrow-native]") + + return arrow_table + + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: + """List available tables from PostgreSQL.""" + return self._list_tables_connectorx(table_filter) + + def _list_tables_connectorx(self, table_filter: str | None = None) -> list[dict[str, Any]]: + """List tables using connectorx.""" + try: + # Query tables from information_schema + query = """ + SELECT table_schema as schemaname, table_name as tablename + FROM information_schema.tables + WHERE table_schema NOT IN ('information_schema', 'pg_catalog', 'pg_toast') + AND table_schema NOT LIKE '%_intern%' + AND table_schema NOT LIKE '%timescaledb%' + AND table_name NOT LIKE '%/%' + AND table_type = 'BASE TABLE' + ORDER BY table_schema, table_name + """ + tables_arrow = cx.read_sql(self.connection_url, query, return_type="arrow") + tables_df = tables_arrow.to_pandas() + + logger.info(f"Found {len(tables_df)} tables") + + results = [] + + for _, row in tables_df.iterrows(): + schema = row['schemaname'] + table_name = row['tablename'] + full_table_name = f"{schema}.{table_name}" + + # Apply filter if provided + if table_filter and table_filter.lower() not in full_table_name.lower(): + continue + + try: + # Get column information + columns_query = f""" + SELECT column_name, data_type + FROM information_schema.columns + WHERE table_schema = 
'{schema}' AND table_name = '{table_name}' + ORDER BY ordinal_position + """ + columns_arrow = cx.read_sql(self.connection_url, columns_query, return_type="arrow") + columns_df = columns_arrow.to_pandas() + columns = [{ + 'name': col_row['column_name'], + 'type': col_row['data_type'] + } for _, col_row in columns_df.iterrows()] + + # Get sample data + sample_query = f'SELECT * FROM "{schema}"."{table_name}" LIMIT 10' + sample_arrow = cx.read_sql(self.connection_url, sample_query, return_type="arrow") + sample_df = sample_arrow.to_pandas() + sample_rows = json.loads(sample_df.to_json(orient="records")) + + # Get row count + count_query = f'SELECT COUNT(*) as cnt FROM "{schema}"."{table_name}"' + count_arrow = cx.read_sql(self.connection_url, count_query, return_type="arrow") + row_count = count_arrow.to_pandas()['cnt'].iloc[0] + + table_metadata = { + "row_count": int(row_count), + "columns": columns, + "sample_rows": sample_rows + } + + results.append({ + "name": full_table_name, + "metadata": table_metadata + }) + + except Exception as e: + logger.warning(f"Error processing table {full_table_name}: {e}") + continue + + return results + + except Exception as e: + logger.error(f"Error listing tables: {e}") + return [] diff --git a/py-src/data_formulator/data_loader/s3_data_loader.py b/py-src/data_formulator/data_loader/s3_data_loader.py index d92b7c41..7e703be1 100644 --- a/py-src/data_formulator/data_loader/s3_data_loader.py +++ b/py-src/data_formulator/data_loader/s3_data_loader.py @@ -1,22 +1,23 @@ import json +import logging +from typing import Any + +import boto3 import pandas as pd -import duckdb -import os +import pyarrow as pa +import pyarrow.csv as pa_csv +import pyarrow.parquet as pq +from pyarrow import fs as pa_fs + +from data_formulator.data_loader.external_data_loader import ExternalDataLoader -from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name -from typing import Dict, Any, List -from data_formulator.security import validate_sql_query +logger = logging.getLogger(__name__) -try: - import boto3 - BOTO3_AVAILABLE = True -except ImportError: - BOTO3_AVAILABLE = False class S3DataLoader(ExternalDataLoader): @staticmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> list[dict[str, Any]]: params_list = [ {"name": "aws_access_key_id", "type": "string", "required": True, "default": "", "description": "AWS access key ID"}, {"name": "aws_secret_access_key", "type": "string", "required": True, "default": "", "description": "AWS secret access key"}, @@ -63,38 +64,78 @@ def auth_instructions() -> str: **Security:** Never share secret keys, rotate regularly, use least privilege permissions. """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): - if not BOTO3_AVAILABLE: - raise ImportError( - "boto3 is required for S3 connections. 
" - "Install with: pip install boto3" - ) - + def __init__(self, params: dict[str, Any]): self.params = params - self.duck_db_conn = duck_db_conn - - # Extract parameters + self.aws_access_key_id = params.get("aws_access_key_id", "") self.aws_secret_access_key = params.get("aws_secret_access_key", "") self.aws_session_token = params.get("aws_session_token", "") self.region_name = params.get("region_name", "us-east-1") self.bucket = params.get("bucket", "") + + self.s3_fs = pa_fs.S3FileSystem( + access_key=self.aws_access_key_id, + secret_key=self.aws_secret_access_key, + session_token=self.aws_session_token if self.aws_session_token else None, + region=self.region_name, + ) + logger.info(f"Initialized PyArrow S3 filesystem for bucket: {self.bucket}") + + def fetch_data_as_arrow( + self, + source_table: str, + size: int = 1000000, + sort_columns: list[str] | None = None, + sort_order: str = 'asc' + ) -> pa.Table: + """ + Fetch data from S3 as a PyArrow Table using PyArrow's native S3 filesystem. + + For files (parquet, csv), reads directly using PyArrow. + """ + if not source_table: + raise ValueError("source_table (S3 URL) must be provided") + + s3_url = source_table + + # Parse S3 URL: s3://bucket/key -> bucket/key for PyArrow + if s3_url.startswith("s3://"): + s3_path = s3_url[5:] # Remove "s3://" + else: + s3_path = f"{self.bucket}/{s3_url}" + + logger.info(f"Reading S3 file via PyArrow: {s3_url}") + + # Read based on file extension + if s3_url.lower().endswith('.parquet'): + arrow_table = pq.read_table(s3_path, filesystem=self.s3_fs) + elif s3_url.lower().endswith('.csv'): + with self.s3_fs.open_input_file(s3_path) as f: + arrow_table = pa_csv.read_csv(f) + elif s3_url.lower().endswith('.json') or s3_url.lower().endswith('.jsonl'): + import pyarrow.json as pa_json + with self.s3_fs.open_input_file(s3_path) as f: + arrow_table = pa_json.read_json(f) + else: + raise ValueError(f"Unsupported file type: {s3_url}") + + # Apply sorting if specified + if sort_columns and len(sort_columns) > 0: + df = arrow_table.to_pandas() + ascending = sort_order != 'desc' + df = df.sort_values(by=sort_columns, ascending=ascending) + arrow_table = pa.Table.from_pandas(df, preserve_index=False) - # Install and load the httpfs extension for S3 access - self.duck_db_conn.install_extension("httpfs") - self.duck_db_conn.load_extension("httpfs") + # Apply size limit + if arrow_table.num_rows > size: + arrow_table = arrow_table.slice(0, size) - # Set AWS credentials for DuckDB - self.duck_db_conn.execute(f"SET s3_region='{self.region_name}'") - self.duck_db_conn.execute(f"SET s3_access_key_id='{self.aws_access_key_id}'") - self.duck_db_conn.execute(f"SET s3_secret_access_key='{self.aws_secret_access_key}'") - if self.aws_session_token: # Add this block - self.duck_db_conn.execute(f"SET s3_session_token='{self.aws_session_token}'") - - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: - # Use boto3 to list objects in the bucket - import boto3 + logger.info(f"Fetched {arrow_table.num_rows} rows from S3 [Arrow-native]") + return arrow_table + + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: + """List available files from S3 bucket.""" s3_client = boto3.client( 's3', aws_access_key_id=self.aws_access_key_id, @@ -103,7 +144,6 @@ def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: region_name=self.region_name ) - # List objects in the bucket response = s3_client.list_objects_v2(Bucket=self.bucket) results = [] @@ -112,36 +152,24 @@ def 
list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: for obj in response['Contents']: key = obj['Key'] - # Skip directories and non-data files if key.endswith('/') or not self._is_supported_file(key): continue - # Apply table filter if provided if table_filter and table_filter.lower() not in key.lower(): continue - # Create S3 URL s3_url = f"s3://{self.bucket}/{key}" try: - # Choose the appropriate read function based on file extension - if s3_url.lower().endswith('.parquet'): - sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_parquet('{s3_url}') LIMIT 10").df() - elif s3_url.lower().endswith('.json') or s3_url.lower().endswith('.jsonl'): - sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_json_auto('{s3_url}') LIMIT 10").df() - elif s3_url.lower().endswith('.csv'): # Default to CSV for other formats - sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{s3_url}') LIMIT 10").df() + sample_table = self._read_sample_arrow(s3_url, 10) + sample_df = sample_table.to_pandas() - # Get column information columns = [{ 'name': col, 'type': str(sample_df[col].dtype) } for col in sample_df.columns] - # Get sample data sample_rows = json.loads(sample_df.to_json(orient="records")) - - # Estimate row count (this is approximate for CSV files) row_count = self._estimate_row_count(s3_url) table_metadata = { @@ -155,83 +183,45 @@ def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: "metadata": table_metadata }) except Exception as e: - # Skip files that can't be read - print(f"Error reading {s3_url}: {e}") + logger.warning(f"Error reading {s3_url}: {e}") continue return results + def _read_sample_arrow(self, s3_url: str, limit: int) -> pa.Table: + """Read sample data using PyArrow S3 filesystem.""" + s3_path = s3_url[5:] if s3_url.startswith("s3://") else s3_url + + if s3_url.lower().endswith('.parquet'): + table = pq.read_table(s3_path, filesystem=self.s3_fs) + elif s3_url.lower().endswith('.csv'): + with self.s3_fs.open_input_file(s3_path) as f: + table = pa_csv.read_csv(f) + elif s3_url.lower().endswith('.json') or s3_url.lower().endswith('.jsonl'): + import pyarrow.json as pa_json + with self.s3_fs.open_input_file(s3_path) as f: + table = pa_json.read_json(f) + else: + raise ValueError(f"Unsupported file type: {s3_url}") + + return table.slice(0, limit) if table.num_rows > limit else table + def _is_supported_file(self, key: str) -> bool: - """Check if the file type is supported by DuckDB.""" - supported_extensions = ['.csv', '.parquet', '.json', '.jsonl'] + """Check if the file type is supported (CSV, Parquet, JSON).""" + supported_extensions = [".csv", ".parquet", ".json", ".jsonl"] return any(key.lower().endswith(ext) for ext in supported_extensions) def _estimate_row_count(self, s3_url: str) -> int: """Estimate the number of rows in a file.""" try: - # For parquet files, we can get the exact count + # For parquet files, use PyArrow metadata for exact count if s3_url.lower().endswith('.parquet'): - count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM read_parquet('{s3_url}')").fetchone()[0] - return count + s3_path = s3_url[5:] if s3_url.startswith("s3://") else s3_url + parquet_file = pq.ParquetFile(s3_path, filesystem=self.s3_fs) + return parquet_file.metadata.num_rows - # For CSV, JSON, and JSONL files, we'll skip row count - if s3_url.lower().endswith('.csv') or s3_url.lower().endswith('.json') or s3_url.lower().endswith('.jsonl'): - return 0 - except Exception as e: - print(f"Error estimating row count for {s3_url}: {e}") 
+ # For CSV, JSON, and JSONL files, skip row count for efficiency return 0 - - def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): - if name_as is None: - name_as = table_name.split('/')[-1].split('.')[0] - - name_as = sanitize_table_name(name_as) - - # Build ORDER BY clause if sort_columns are specified - order_by_clause = "" - if sort_columns and len(sort_columns) > 0: - order_direction = "DESC" if sort_order == 'desc' else "ASC" - sanitized_cols = [f'"{col}" {order_direction}' for col in sort_columns] - order_by_clause = f"ORDER BY {', '.join(sanitized_cols)}" - - # Determine file type and use appropriate DuckDB function - if table_name.lower().endswith('.csv'): - self.duck_db_conn.execute(f""" - CREATE OR REPLACE TABLE main.{name_as} AS - SELECT * FROM read_csv_auto('{table_name}') - {order_by_clause} - LIMIT {size} - """) - elif table_name.lower().endswith('.parquet'): - self.duck_db_conn.execute(f""" - CREATE OR REPLACE TABLE main.{name_as} AS - SELECT * FROM read_parquet('{table_name}') - {order_by_clause} - LIMIT {size} - """) - elif table_name.lower().endswith('.json') or table_name.lower().endswith('.jsonl'): - self.duck_db_conn.execute(f""" - CREATE OR REPLACE TABLE main.{name_as} AS - SELECT * FROM read_json_auto('{table_name}') - {order_by_clause} - LIMIT {size} - """) - else: - raise ValueError(f"Unsupported file type: {table_name}") - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - return json.loads(self.duck_db_conn.execute(query).df().head(10).to_json(orient="records")) - - def ingest_data_from_query(self, query: str, name_as: str): - # Execute the query and get results as a DataFrame - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - df = self.duck_db_conn.execute(query).df() - # Use the base class's method to ingest the DataFrame - self.ingest_df_to_duckdb(df, sanitize_table_name(name_as)) \ No newline at end of file + except Exception as e: + logger.warning(f"Error estimating row count for {s3_url}: {e}") + return 0 \ No newline at end of file diff --git a/py-src/data_formulator/datalake/__init__.py b/py-src/data_formulator/datalake/__init__.py new file mode 100644 index 00000000..c81f9f6c --- /dev/null +++ b/py-src/data_formulator/datalake/__init__.py @@ -0,0 +1,98 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Data Lake module for Data Formulator. 
+ +This module provides a unified data management layer that: +- Manages user workspaces with identity-based directories +- Stores user-uploaded files as-is (CSV, Excel, TXT, HTML, JSON, PDF) +- Stores data from external loaders as parquet via pyarrow +- Tracks all data sources in a workspace.yaml metadata file + +Example usage: + + from data_formulator.datalake import Workspace, save_uploaded_file, write_parquet + + # Get or create a workspace for a user + workspace = Workspace("user:123") + + # Save an uploaded file + with open("sales.csv", "rb") as f: + metadata = save_uploaded_file(workspace, f.read(), "sales.csv") + + # Write a DataFrame as parquet (typically from data loaders) + import pandas as pd + df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]}) + metadata = write_parquet(workspace, df, "customers") + + # List tables in workspace + tables = workspace.list_tables() + + # Read parquet back + df = read_parquet(workspace, "customers") +""" + +# Workspace management +from data_formulator.datalake.workspace import ( + Workspace, + WorkspaceWithTempData, + get_default_workspace_root, + DATALAKE_ROOT_ENV, +) + +# Metadata types and operations +from data_formulator.datalake.metadata import ( + TableMetadata, + ColumnInfo, + WorkspaceMetadata, + load_metadata, + save_metadata, + metadata_exists, + METADATA_VERSION, + METADATA_FILENAME, +) + +# File operations (for user uploads) +from data_formulator.datalake.file_manager import ( + save_uploaded_file, + save_uploaded_file_from_path, + is_supported_file, + get_file_type, + get_file_info, + SUPPORTED_EXTENSIONS, +) + +# Parquet utilities (pure helpers, no Workspace dependency) +from data_formulator.datalake.parquet_utils import ( + sanitize_table_name, + DEFAULT_COMPRESSION, +) + +__all__ = [ + # Workspace + "Workspace", + "WorkspaceWithTempData", + "get_default_workspace_root", + "DATALAKE_ROOT_ENV", + # Metadata + "TableMetadata", + "ColumnInfo", + "WorkspaceMetadata", + "load_metadata", + "save_metadata", + "metadata_exists", + "METADATA_VERSION", + "METADATA_FILENAME", + # File manager + "save_uploaded_file", + "save_uploaded_file_from_path", + "get_supported_extensions", + "is_supported_file", + "get_file_type", + "get_file_info", + "SUPPORTED_EXTENSIONS", + # Parquet utilities + "sanitize_table_name", + "DEFAULT_COMPRESSION", +] diff --git a/py-src/data_formulator/datalake/file_manager.py b/py-src/data_formulator/datalake/file_manager.py new file mode 100644 index 00000000..88dec0d8 --- /dev/null +++ b/py-src/data_formulator/datalake/file_manager.py @@ -0,0 +1,293 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +File manager for user-uploaded files in the Data Lake. + +This module handles storing user-uploaded files (CSV, Excel, TXT, HTML, JSON, PDF) +as-is in the workspace without conversion. 
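+
+A minimal usage sketch (assuming a Workspace instance and the uploaded bytes are already in hand):
+
+    meta = save_uploaded_file(workspace, csv_bytes, "sales.csv")
+    # meta.filename may differ from "sales.csv" if that name is already taken:
+    # a unique "_1", "_2", ... suffix is appended to the stem unless overwrite=True.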
+""" + +import hashlib +import logging +import os +from datetime import datetime, timezone +from pathlib import Path +from typing import BinaryIO, Union + +from data_formulator.datalake.metadata import TableMetadata +from data_formulator.datalake.workspace import Workspace + +logger = logging.getLogger(__name__) + +# Supported file extensions for upload +SUPPORTED_EXTENSIONS = { + '.csv': 'csv', + '.xlsx': 'excel', + '.xls': 'excel', + '.txt': 'txt', + '.html': 'html', + '.htm': 'html', + '.json': 'json', + '.pdf': 'pdf', +} + + +def is_supported_file(filename: str) -> bool: + ext = Path(filename).suffix.lower() + return ext in SUPPORTED_EXTENSIONS + + +def get_file_type(filename: str) -> str | None: + """ + Get the file type based on extension. + + Args: + filename: Name of the file + + Returns: + File type string (e.g., 'csv', 'excel') or None if unsupported + """ + ext = Path(filename).suffix.lower() + return SUPPORTED_EXTENSIONS.get(ext) + + +def compute_file_hash(content: bytes) -> str: + """ + Compute MD5 hash of file content. + + Args: + content: File content as bytes + + Returns: + MD5 hash as hex string + """ + return hashlib.md5(content).hexdigest() + + +def sanitize_table_name(name: str) -> str: + """ + Sanitize a string to be a valid table name. + + Args: + name: Original name + + Returns: + Sanitized name suitable for use as a table identifier + """ + # Remove extension if present + name = Path(name).stem + + # Replace invalid characters with underscores + sanitized = [] + for char in name: + if char.isalnum() or char == '_': + sanitized.append(char) + else: + sanitized.append('_') + + result = ''.join(sanitized) + + # Ensure it starts with a letter or underscore + if result and not (result[0].isalpha() or result[0] == '_'): + result = '_' + result + + # Ensure it's not empty + if not result: + result = '_unnamed' + + return result.lower() + + +def generate_unique_filename( + workspace: Workspace, + desired_filename: str, +) -> str: + """ + Generate a unique filename if the desired one already exists. + + Args: + workspace: The workspace to check + desired_filename: The desired filename + + Returns: + A unique filename (may be the original if it doesn't exist) + """ + if not workspace.file_exists(desired_filename): + return desired_filename + + # Split filename and extension + path = Path(desired_filename) + stem = path.stem + suffix = path.suffix + + # Try adding numbers until we find a unique name + counter = 1 + while True: + new_filename = f"{stem}_{counter}{suffix}" + if not workspace.file_exists(new_filename): + return new_filename + counter += 1 + if counter > 1000: # Safety limit + raise ValueError(f"Could not generate unique filename for {desired_filename}") + + +def save_uploaded_file( + workspace: Workspace, + file_content: Union[bytes, BinaryIO], + filename: str, + table_name: str | None = None, + overwrite: bool = False, +) -> TableMetadata: + """ + Save an uploaded file to the workspace. + + The file is stored as-is without conversion. Metadata is added to track + the file in the workspace. + + Args: + workspace: The workspace to save to + file_content: File content as bytes or file-like object + filename: Original filename (used for extension detection) + table_name: Name to use for the table. If None, derived from filename. + overwrite: If True, overwrite existing file. If False, generate unique name. 
+ + Returns: + TableMetadata for the saved file + + Raises: + ValueError: If file type is not supported + """ + # Validate file type + file_type = get_file_type(filename) + if file_type is None: + raise ValueError( + f"Unsupported file type: {filename}. " + f"Supported extensions: {', '.join(SUPPORTED_EXTENSIONS.keys())}" + ) + + # Read content if it's a file-like object + if hasattr(file_content, 'read'): + content = file_content.read() + else: + content = file_content + + # Determine the actual filename to use + if overwrite: + actual_filename = filename + else: + actual_filename = generate_unique_filename(workspace, filename) + + # Determine table name + if table_name is None: + table_name = sanitize_table_name(actual_filename) + + # Ensure table name is unique in metadata + metadata = workspace.get_metadata() + if table_name in metadata.tables and not overwrite: + # Generate unique table name + base_name = table_name + counter = 1 + while table_name in metadata.tables: + table_name = f"{base_name}_{counter}" + counter += 1 + + # Write the file + file_path = workspace.get_file_path(actual_filename) + with open(file_path, 'wb') as f: + f.write(content) + + # Compute hash and size + content_hash = compute_file_hash(content) + file_size = len(content) + + # Create metadata + table_metadata = TableMetadata( + name=table_name, + source_type="upload", + filename=actual_filename, + file_type=file_type, + created_at=datetime.now(timezone.utc), + content_hash=content_hash, + file_size=file_size, + ) + + # Save metadata + workspace.add_table_metadata(table_metadata) + + logger.info( + f"Saved uploaded file {actual_filename} as table {table_name} " + f"({file_size} bytes, hash={content_hash[:8]}...)" + ) + + return table_metadata + + +def save_uploaded_file_from_path( + workspace: Workspace, + source_path: Union[str, Path], + table_name: str | None = None, + overwrite: bool = False, +) -> TableMetadata: + """ + Save a file from a local path to the workspace. + + Args: + workspace: The workspace to save to + source_path: Path to the source file + table_name: Name to use for the table. If None, derived from filename. + overwrite: If True, overwrite existing file. + + Returns: + TableMetadata for the saved file + """ + source_path = Path(source_path) + + if not source_path.exists(): + raise FileNotFoundError(f"Source file not found: {source_path}") + + with open(source_path, 'rb') as f: + content = f.read() + + return save_uploaded_file( + workspace=workspace, + file_content=content, + filename=source_path.name, + table_name=table_name, + overwrite=overwrite, + ) + + +def get_file_info(workspace: Workspace, table_name: str) -> dict | None: + """ + Get information about an uploaded file. 
+ + Args: + workspace: The workspace + table_name: Name of the table + + Returns: + Dictionary with file information or None if not found + """ + table_meta = workspace.get_table_metadata(table_name) + if table_meta is None: + return None + + file_path = workspace.get_file_path(table_meta.filename) + + result = { + "table_name": table_name, + "filename": table_meta.filename, + "file_type": table_meta.file_type, + "file_size": table_meta.file_size, + "content_hash": table_meta.content_hash, + "created_at": table_meta.created_at.isoformat(), + "exists": file_path.exists(), + } + + if file_path.exists(): + stat = file_path.stat() + result["current_size"] = stat.st_size + result["modified_at"] = datetime.fromtimestamp(stat.st_mtime).isoformat() + + return result diff --git a/py-src/data_formulator/datalake/metadata.py b/py-src/data_formulator/datalake/metadata.py new file mode 100644 index 00000000..800ec46d --- /dev/null +++ b/py-src/data_formulator/datalake/metadata.py @@ -0,0 +1,294 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Metadata management for the Data Lake workspace. + +This module defines the schema and operations for workspace.yaml, +which tracks all data sources (uploaded files and data loader ingests). +""" + +from dataclasses import dataclass, field, asdict +from datetime import datetime, date, timezone +from decimal import Decimal +from pathlib import Path +from typing import Literal, Any +import yaml +import logging + +logger = logging.getLogger(__name__) + +METADATA_VERSION = "1.1" +METADATA_FILENAME = "workspace.yaml" + + +def make_json_safe(value: Any) -> Any: + """ + Convert a value (possibly containing numpy/pandas/pyarrow scalars) into + a JSON/YAML-safe primitive structure. + """ + if value is None or isinstance(value, (bool, int, float, str)): + return value + + if isinstance(value, (datetime, date)): + return value.isoformat() + + if isinstance(value, Decimal): + return str(value) + + if isinstance(value, Path): + return str(value) + + if isinstance(value, dict): + return {str(k): make_json_safe(v) for k, v in value.items()} + + if isinstance(value, (list, tuple)): + return [make_json_safe(v) for v in value] + + # numpy scalars, pandas scalars, etc. 
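+    # Duck-typed fallback: scalars that expose an ``.item()`` method
+    # (e.g. numpy.int64, numpy.float64) are unwrapped and re-checked;
+    # anything else degrades to ``str(value)`` below.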
+ item = getattr(value, "item", None) + if callable(item): + try: + return make_json_safe(item()) + except Exception: + pass + + return str(value) + + +@dataclass +class ColumnInfo: + """Information about a single column in a table.""" + name: str + dtype: str + + def to_dict(self) -> dict: + return {"name": self.name, "dtype": self.dtype} + + @classmethod + def from_dict(cls, data: dict) -> "ColumnInfo": + return cls(name=data["name"], dtype=data["dtype"]) + + +@dataclass +class TableMetadata: + """Metadata for a single table/file in the workspace.""" + name: str + source_type: Literal["upload", "data_loader"] + filename: str + file_type: str + created_at: datetime + content_hash: str | None = None + file_size: int | None = None + # For data_loader sources: + loader_type: str | None = None + loader_params: dict | None = None + source_table: str | None = None + source_query: str | None = None + last_synced: datetime | None = None + row_count: int | None = None + columns: list[ColumnInfo] | None = None + + def to_dict(self) -> dict: + """Convert to dictionary for YAML serialization.""" + result = { + "source_type": self.source_type, + "filename": self.filename, + "file_type": self.file_type, + "created_at": self.created_at.isoformat(), + } + + if self.content_hash is not None: + result["content_hash"] = self.content_hash + if self.file_size is not None: + result["file_size"] = self.file_size + if self.loader_type is not None: + result["loader_type"] = self.loader_type + if self.loader_params is not None: + result["loader_params"] = self.loader_params + if self.source_table is not None: + result["source_table"] = self.source_table + if self.source_query is not None: + result["source_query"] = self.source_query + if self.last_synced is not None: + result["last_synced"] = self.last_synced.isoformat() + if self.row_count is not None: + result["row_count"] = self.row_count + if self.columns is not None: + result["columns"] = [col.to_dict() for col in self.columns] + + return result + + @classmethod + def from_dict(cls, name: str, data: dict) -> "TableMetadata": + """Create from dictionary (YAML deserialization).""" + columns = None + if "columns" in data and data["columns"] is not None: + columns = [ColumnInfo.from_dict(col) for col in data["columns"]] + + created_at = data["created_at"] + if isinstance(created_at, str): + created_at = datetime.fromisoformat(created_at) + + last_synced = data.get("last_synced") + if isinstance(last_synced, str): + last_synced = datetime.fromisoformat(last_synced) + + return cls( + name=name, + source_type=data["source_type"], + filename=data["filename"], + file_type=data["file_type"], + created_at=created_at, + content_hash=data.get("content_hash"), + file_size=data.get("file_size"), + loader_type=data.get("loader_type"), + loader_params=data.get("loader_params"), + source_table=data.get("source_table"), + source_query=data.get("source_query"), + last_synced=last_synced, + row_count=data.get("row_count"), + columns=columns, + ) + + +@dataclass +class WorkspaceMetadata: + """Metadata for the entire workspace.""" + version: str + created_at: datetime + updated_at: datetime + tables: dict[str, TableMetadata] = field(default_factory=dict) + + def add_table(self, table: TableMetadata) -> None: + """Add or update a table in the metadata.""" + self.tables[table.name] = table + self.updated_at = datetime.now(timezone.utc) + + def remove_table(self, name: str) -> bool: + """Remove a table from the metadata. 
Returns True if removed.""" + if name in self.tables: + del self.tables[name] + self.updated_at = datetime.now(timezone.utc) + return True + return False + + def get_table(self, name: str) -> TableMetadata | None: + """Get metadata for a specific table.""" + return self.tables.get(name) + + def list_tables(self) -> list[str]: + """List all table names.""" + return list(self.tables.keys()) + + def to_dict(self) -> dict: + """Convert to dictionary for YAML serialization.""" + return { + "version": self.version, + "created_at": self.created_at.isoformat(), + "updated_at": self.updated_at.isoformat(), + "tables": { + name: table.to_dict() + for name, table in self.tables.items() + }, + } + + @classmethod + def from_dict(cls, data: dict) -> "WorkspaceMetadata": + """Create from dictionary (YAML deserialization).""" + created_at = data["created_at"] + if isinstance(created_at, str): + created_at = datetime.fromisoformat(created_at) + + updated_at = data["updated_at"] + if isinstance(updated_at, str): + updated_at = datetime.fromisoformat(updated_at) + + tables = {} + tables_data = data.get("tables", {}) + if tables_data: + for name, table_data in tables_data.items(): + tables[name] = TableMetadata.from_dict(name, table_data) + + return cls( + version=data["version"], + created_at=created_at, + updated_at=updated_at, + tables=tables, + ) + + @classmethod + def create_new(cls) -> "WorkspaceMetadata": + """Create a new empty workspace metadata.""" + now = datetime.now(timezone.utc) + return cls( + version=METADATA_VERSION, + created_at=now, + updated_at=now, + tables={}, + ) + + +def load_metadata(workspace_path: Path) -> WorkspaceMetadata: + """ + Load workspace metadata from YAML file. + + Args: + workspace_path: Path to the workspace directory + + Returns: + WorkspaceMetadata object + + Raises: + FileNotFoundError: If metadata file doesn't exist + ValueError: If metadata file is invalid + """ + metadata_file = workspace_path / METADATA_FILENAME + + if not metadata_file.exists(): + raise FileNotFoundError(f"Metadata file not found: {metadata_file}") + + try: + with open(metadata_file, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + + if data is None: + raise ValueError("Empty metadata file") + + return WorkspaceMetadata.from_dict(data) + + except yaml.YAMLError as e: + raise ValueError(f"Invalid YAML in metadata file: {e}") + + +def save_metadata(workspace_path: Path, metadata: WorkspaceMetadata) -> None: + """ + Save workspace metadata to YAML file. + + Args: + workspace_path: Path to the workspace directory + metadata: WorkspaceMetadata object to save + """ + metadata_file = workspace_path / METADATA_FILENAME + + # Update the updated_at timestamp + metadata.updated_at = datetime.now(timezone.utc) + + # Ensure directory exists + workspace_path.mkdir(parents=True, exist_ok=True) + + with open(metadata_file, "w", encoding="utf-8") as f: + yaml.dump( + metadata.to_dict(), + f, + default_flow_style=False, + allow_unicode=True, + sort_keys=False, + ) + + logger.debug(f"Saved metadata to {metadata_file}") + + +def metadata_exists(workspace_path: Path) -> bool: + """Check if workspace metadata file exists.""" + return (workspace_path / METADATA_FILENAME).exists() diff --git a/py-src/data_formulator/datalake/parquet_utils.py b/py-src/data_formulator/datalake/parquet_utils.py new file mode 100644 index 00000000..1e507e9e --- /dev/null +++ b/py-src/data_formulator/datalake/parquet_utils.py @@ -0,0 +1,178 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +""" +Parquet utility functions for the Data Lake. + +Pure helper functions for parquet I/O, hashing, column introspection, and +name sanitisation. These utilities have **no dependency on Workspace** and +are consumed by Workspace methods that handle metadata bookkeeping. +""" + +import hashlib +import logging +from typing import Any + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq + +from data_formulator.datalake.metadata import ColumnInfo, make_json_safe + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +# Default compression for parquet files +DEFAULT_COMPRESSION = "snappy" + +# Default number of rows to persist in metadata for preview +DEFAULT_METADATA_SAMPLE_ROWS = 50 + + +# --------------------------------------------------------------------------- +# Name helpers +# --------------------------------------------------------------------------- + +def sanitize_table_name(name: str) -> str: + """ + Sanitize a string to be a valid table/file name. + + Args: + name: Original name + + Returns: + Sanitized name + """ + sanitized = [] + for char in name: + if char.isalnum() or char == '_': + sanitized.append(char) + else: + sanitized.append('_') + + result = ''.join(sanitized) + + # Ensure it starts with a letter or underscore + if result and not (result[0].isalpha() or result[0] == '_'): + result = '_' + result + + # Ensure it's not empty + if not result: + result = '_unnamed' + + return result.lower() + + +# --------------------------------------------------------------------------- +# Arrow / DataFrame introspection +# --------------------------------------------------------------------------- + +def get_sample_rows_from_arrow( + table: pa.Table, limit: int = DEFAULT_METADATA_SAMPLE_ROWS +) -> list[dict[str, Any]]: + """Get a small sample of rows from an Arrow table as JSON/YAML-safe records.""" + if table.num_rows <= 0 or limit <= 0: + return [] + sample = table.slice(0, min(limit, table.num_rows)) + return make_json_safe(sample.to_pylist()) + + +def get_arrow_column_info(table: pa.Table) -> list[ColumnInfo]: + """Extract column information from a PyArrow Table.""" + return [ColumnInfo(name=field.name, dtype=str(field.type)) for field in table.schema] + + +def get_column_info(df: pd.DataFrame) -> list[ColumnInfo]: + """Extract column information from a pandas DataFrame.""" + return [ColumnInfo(name=str(col), dtype=str(df[col].dtype)) for col in df.columns] + + +# --------------------------------------------------------------------------- +# Hashing +# --------------------------------------------------------------------------- + +def compute_arrow_table_hash(table: pa.Table, sample_rows: int = 100) -> str: + """ + Compute an MD5 hash representing the Arrow Table content. + + Uses row count, column names, and sampled rows for efficiency. 
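+
+    This is a cheap content fingerprint for change detection (see
+    ``Workspace.refresh_parquet_from_arrow``), not a cryptographic or
+    full-content hash. A sketch of the intended use, where ``old_hash``
+    is assumed to come from previously stored metadata:
+
+        new_hash = compute_arrow_table_hash(arrow_table)
+        data_changed = (new_hash != old_hash)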
+ """ + hash_parts = [ + f"rows:{table.num_rows}", + f"cols:{','.join(table.column_names)}", + ] + + if table.num_rows > 0: + if table.num_rows <= sample_rows: + sample = table + else: + n = sample_rows // 3 + indices = ( + list(range(n)) + + list(range(table.num_rows // 4, table.num_rows // 4 + n)) + + list(range(table.num_rows - n, table.num_rows)) + ) + sample = table.take(indices) + hash_parts.append(f"data:{sample.to_string()}") + + content = '|'.join(hash_parts) + return hashlib.md5(content.encode()).hexdigest() + + +def sanitize_dataframe_for_arrow(df: pd.DataFrame) -> pd.DataFrame: + """ + Sanitize a DataFrame for conversion to PyArrow Table. + + Handles common issues that cause ArrowTypeError: + - Mixed types in object columns (e.g., strings and integers) + - Columns with all nulls that have ambiguous type + + For object dtype columns, converts all non-null values to strings + to ensure consistent typing. + + Returns: + A copy of the DataFrame with sanitized columns. + """ + df = df.copy() + + for col in df.columns: + # Handle object dtype columns (potential mixed types) + if df[col].dtype == 'object': + # Convert all non-null values to string + # This handles mixed int/string columns safely + df[col] = df[col].apply( + lambda x: str(x) if pd.notna(x) and x is not None else None + ) + + return df + + +def compute_dataframe_hash(df: pd.DataFrame, sample_rows: int = 100) -> str: + """ + Compute an MD5 hash representing the DataFrame content. + + Uses row count, column names, and sampled rows for efficiency. + """ + hash_parts = [ + f"rows:{len(df)}", + f"cols:{','.join(df.columns.tolist())}", + ] + + if len(df) > 0: + if len(df) <= sample_rows: + sample = df + else: + n = sample_rows // 3 + first = df.head(n) + last = df.tail(n) + middle = df.iloc[len(df) // 4 : len(df) * 3 // 4].sample( + min(n, len(df) // 2), random_state=42 + ) + sample = pd.concat([first, middle, last]) + hash_parts.append(f"data:{sample.to_string()}") + + content = '|'.join(hash_parts) + return hashlib.md5(content.encode()).hexdigest() diff --git a/py-src/data_formulator/datalake/workspace.py b/py-src/data_formulator/datalake/workspace.py new file mode 100644 index 00000000..506c136d --- /dev/null +++ b/py-src/data_formulator/datalake/workspace.py @@ -0,0 +1,558 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Workspace management for the Data Lake. + +Each user has a workspace directory identified by their identity_id. +The workspace contains all their data files (uploaded and ingested) +plus a workspace.yaml metadata file. +""" + +import os +import shutil +import tempfile +import logging +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Optional + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq + +from data_formulator.datalake.metadata import ( + WorkspaceMetadata, + TableMetadata, + load_metadata, + save_metadata, + metadata_exists, +) +from data_formulator.datalake.parquet_utils import ( + sanitize_table_name, + get_arrow_column_info, + compute_arrow_table_hash, + get_column_info, + compute_dataframe_hash, + sanitize_dataframe_for_arrow, + DEFAULT_COMPRESSION, +) + +logger = logging.getLogger(__name__) + +# Environment variable for configuring workspace root +DATALAKE_ROOT_ENV = "DATALAKE_ROOT" + +# Default subdirectory name under temp for workspaces +DEFAULT_WORKSPACE_SUBDIR = "data_formulator_workspaces" + + +def get_default_workspace_root() -> Path: + """ + Get the default workspace root directory. 
+ + Uses DATALAKE_ROOT env variable if set, otherwise uses system temp directory. + """ + env_root = os.getenv(DATALAKE_ROOT_ENV) + if env_root: + return Path(env_root) + return Path(tempfile.gettempdir()) / DEFAULT_WORKSPACE_SUBDIR + + +class Workspace: + """ + Manages a user's workspace directory in the Data Lake. + + The workspace contains: + - workspace.yaml: Metadata file tracking all data sources + - Data files: User uploaded files (CSV, Excel, etc.) and parquet files from data loaders + + All files are stored in a single flat directory per user. + """ + + def __init__(self, identity_id: str, root_dir: Optional[str | Path] = None): + """ + Initialize a workspace for a user. + + Args: + identity_id: Unique identifier for the user (e.g., "user:123" or "browser:abc") + root_dir: Root directory for all workspaces. If None, uses default. + """ + if not identity_id: + raise ValueError("identity_id cannot be empty") + + # Sanitize identity_id for filesystem safety + self._identity_id = identity_id + self._safe_id = self._sanitize_identity_id(identity_id) + + # Determine root directory + if root_dir is None: + self._root = get_default_workspace_root() + else: + self._root = Path(root_dir) + + # Workspace path is root / sanitized_identity_id + self._path = self._root / self._safe_id + + # Ensure workspace directory exists + self._path.mkdir(parents=True, exist_ok=True) + + # Initialize metadata if it doesn't exist + if not metadata_exists(self._path): + self._init_metadata() + + logger.debug(f"Initialized workspace at {self._path}") + + @staticmethod + def _sanitize_identity_id(identity_id: str) -> str: + """ + Sanitize identity_id for use as a directory name. + + Replaces potentially problematic characters with underscores. + """ + # Replace colons, slashes, and other special characters + safe_chars = [] + for char in identity_id: + if char.isalnum() or char in ('_', '-'): + safe_chars.append(char) + else: + safe_chars.append('_') + return ''.join(safe_chars) + + def _init_metadata(self) -> None: + """Initialize a new workspace with empty metadata.""" + metadata = WorkspaceMetadata.create_new() + save_metadata(self._path, metadata) + logger.info(f"Initialized new workspace metadata at {self._path}") + + def get_file_path(self, filename: str) -> Path: + """ + Get the full path for a file in the workspace. + + Args: + filename: Name of the file + + Returns: + Full path to the file + """ + # Prevent directory traversal attacks + safe_filename = Path(filename).name + return self._path / safe_filename + + def file_exists(self, filename: str) -> bool: + """ + Check if a file exists in the workspace. + + Args: + filename: Name of the file + + Returns: + True if file exists, False otherwise + """ + return self.get_file_path(filename).exists() + + + def delete_table(self, table_name: str) -> bool: + """ + Delete a table by name (removes both file and metadata). 
+ + Args: + table_name: Name of the table to delete + + Returns: + True if table was deleted, False if it didn't exist + """ + metadata = self.get_metadata() + table = metadata.get_table(table_name) + + if table is None: + return False + + # Delete the file + file_path = self.get_file_path(table.filename) + if file_path.exists(): + file_path.unlink() + + # Remove from metadata + metadata.remove_table(table_name) + self.save_metadata(metadata) + + logger.info(f"Deleted table {table_name} from workspace {self._safe_id}") + return True + + def get_metadata(self) -> WorkspaceMetadata: + return load_metadata(self._path) + + def save_metadata(self, metadata: WorkspaceMetadata) -> None: + save_metadata(self._path, metadata) + + def add_table_metadata(self, table: TableMetadata) -> None: + metadata = self.get_metadata() + metadata.add_table(table) + self.save_metadata(metadata) + + def get_table_metadata(self, table_name: str) -> Optional[TableMetadata]: + """Look up table metadata, falling back to sanitized name.""" + ws_metadata = self.get_metadata() + result = ws_metadata.get_table(table_name) + if result is None: + result = ws_metadata.get_table(sanitize_table_name(table_name)) + return result + + def list_tables(self) -> list[str]: + metadata = self.get_metadata() + return metadata.list_tables() + + def get_fresh_name(self, name: str) -> str: + """ + Generate a unique table name that doesn't conflict with existing tables. + + Sanitizes the input name, then checks if it already exists in the workspace. + If it does, appends an incrementing numeric suffix (_2, _3, ...) until + a unique name is found. + + Args: + name: Desired table name (will be sanitized) + + Returns: + A sanitized, unique table name safe for use in write_parquet etc. + """ + base = sanitize_table_name(name) + existing = set(self.list_tables()) + + if base not in existing: + return base + + # Try incrementing suffixes + counter = 2 + while f"{base}_{counter}" in existing: + counter += 1 + return f"{base}_{counter}" + + def cleanup(self) -> None: + """ Remove the entire workspace directory. """ + if self._path.exists(): + shutil.rmtree(self._path) + logger.info(f"Cleaned up workspace {self._safe_id}") + + def get_relative_data_file_path(self, table_name: str) -> str: + """ + Get the filename for a table, suitable for use in generated code. + + Since files are stored flat in the workspace directory and code runs + with the workspace as cwd, this returns just the filename + (e.g. "sales_data.parquet", "report.csv"). + + Falls back to sanitized table name if the original is not found. + + Args: + table_name: Name of the table in the workspace + + Returns: + Filename string that can be used in read_parquet() / read_csv() etc. + + Raises: + FileNotFoundError: If the table doesn't exist + """ + metadata = self.get_table_metadata(table_name) + if metadata is None: + raise FileNotFoundError(f"Table not found: {table_name}") + return metadata.filename + + def read_data_as_df(self, table_name: str) -> pd.DataFrame: + """ + Read a table from the workspace as a pandas DataFrame. + + Automatically selects the appropriate reader based on the file's type + (stored in metadata). Supports parquet, csv, excel, json, and txt. + Falls back to sanitized table name if the original name is not found. 
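+
+        A short sketch (``ws`` is an assumed Workspace instance that already
+        holds a table named "sales"):
+
+            df = ws.read_data_as_df("sales")   # reader picked from the stored file_type
+            print(df.shape)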
+ + Args: + table_name: Name of the table in the workspace + + Returns: + pandas DataFrame with the table data + + Raises: + FileNotFoundError: If the table or file doesn't exist + ValueError: If the file type is not supported for DataFrame reading + """ + metadata = self.get_table_metadata(table_name) + if metadata is None: + raise FileNotFoundError(f"Table not found: {table_name}") + + file_path = self.get_file_path(metadata.filename) + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + file_type = metadata.file_type + + if file_type == "parquet": + return pd.read_parquet(file_path) + elif file_type == "csv": + return pd.read_csv(file_path) + elif file_type == "excel": + return pd.read_excel(file_path) + elif file_type == "json": + return pd.read_json(file_path) + elif file_type == "txt": + return pd.read_csv(file_path, sep="\t") + else: + raise ValueError( + f"Unsupported file type '{file_type}' for table '{table_name}'. " + f"Supported types: parquet, csv, excel, json, txt." + ) + + # ------------------------------------------------------------------ + # Parquet management + # ------------------------------------------------------------------ + + def write_parquet_from_arrow( + self, + table: pa.Table, + table_name: str, + compression: str = DEFAULT_COMPRESSION, + loader_metadata: Optional[dict[str, Any]] = None, + ) -> TableMetadata: + """ + Write a PyArrow Table directly to parquet. + + This is the preferred path because it avoids pandas conversion. + """ + safe_name = sanitize_table_name(table_name) + filename = f"{safe_name}.parquet" + + # Overwrite existing file if present + metadata = self.get_metadata() + if safe_name in metadata.tables: + old_file = self.get_file_path(metadata.tables[safe_name].filename) + if old_file.exists(): + old_file.unlink() + + file_path = self.get_file_path(filename) + pq.write_table(table, file_path, compression=compression) + + now = datetime.now(timezone.utc) + table_metadata = TableMetadata( + name=safe_name, + source_type="data_loader", + filename=filename, + file_type="parquet", + created_at=now, + content_hash=compute_arrow_table_hash(table), + file_size=file_path.stat().st_size, + row_count=table.num_rows, + columns=get_arrow_column_info(table), + last_synced=now, + ) + + if loader_metadata: + table_metadata.loader_type = loader_metadata.get('loader_type') + table_metadata.loader_params = loader_metadata.get('loader_params') + table_metadata.source_table = loader_metadata.get('source_table') + table_metadata.source_query = loader_metadata.get('source_query') + + self.add_table_metadata(table_metadata) + logger.info( + f"Wrote parquet {filename}: {table.num_rows} rows, " + f"{table.num_columns} cols ({table_metadata.file_size} bytes) [Arrow]" + ) + return table_metadata + + def write_parquet( + self, + df: pd.DataFrame, + table_name: str, + compression: str = DEFAULT_COMPRESSION, + loader_metadata: Optional[dict[str, Any]] = None, + ) -> TableMetadata: + """Write a pandas DataFrame to parquet.""" + safe_name = sanitize_table_name(table_name) + filename = f"{safe_name}.parquet" + + metadata = self.get_metadata() + if safe_name in metadata.tables: + old_file = self.get_file_path(metadata.tables[safe_name].filename) + if old_file.exists(): + old_file.unlink() + + file_path = self.get_file_path(filename) + # Sanitize DataFrame to handle mixed types in object columns + sanitized_df = sanitize_dataframe_for_arrow(df) + arrow_table = pa.Table.from_pandas(sanitized_df) + pq.write_table(arrow_table, file_path, 
compression=compression) + + now = datetime.now(timezone.utc) + table_metadata = TableMetadata( + name=safe_name, + source_type="data_loader", + filename=filename, + file_type="parquet", + created_at=now, + content_hash=compute_dataframe_hash(df), + file_size=file_path.stat().st_size, + row_count=len(df), + columns=get_column_info(df), + last_synced=now, + ) + + if loader_metadata: + table_metadata.loader_type = loader_metadata.get('loader_type') + table_metadata.loader_params = loader_metadata.get('loader_params') + table_metadata.source_table = loader_metadata.get('source_table') + table_metadata.source_query = loader_metadata.get('source_query') + + self.add_table_metadata(table_metadata) + logger.info( + f"Wrote parquet {filename}: {len(df)} rows, " + f"{len(df.columns)} cols ({table_metadata.file_size} bytes)" + ) + return table_metadata + + def get_parquet_schema(self, table_name: str) -> dict: + """Get schema information for a parquet table without reading all data.""" + meta = self.get_table_metadata(table_name) + if meta is None: + raise FileNotFoundError(f"Table not found: {table_name}") + if meta.file_type != "parquet": + raise ValueError(f"Table {table_name} is not a parquet file") + path = self.get_file_path(meta.filename) + if not path.exists(): + raise FileNotFoundError(f"Parquet file not found: {path}") + + pf = pq.ParquetFile(path) + schema = pf.schema_arrow + return { + "table_name": table_name, + "filename": meta.filename, + "num_rows": pf.metadata.num_rows, + "num_columns": len(schema), + "columns": [ + {"name": f.name, "type": str(f.type), "nullable": f.nullable} + for f in schema + ], + "created_at": meta.created_at.isoformat(), + "last_synced": meta.last_synced.isoformat() if meta.last_synced else None, + } + + def get_parquet_path(self, table_name: str) -> Path: + """Return the resolved filesystem path of the parquet file for *table_name*.""" + meta = self.get_table_metadata(table_name) + if meta is None: + raise FileNotFoundError(f"Table not found: {table_name}") + if meta.file_type != "parquet": + raise ValueError(f"Table {table_name} is not a parquet file") + path = self.get_file_path(meta.filename) + if not path.exists(): + raise FileNotFoundError(f"Parquet file not found: {path}") + return path.resolve() + + def run_parquet_sql(self, table_name: str, sql: str) -> pd.DataFrame: + """ + Run a DuckDB SQL query against a parquet table. + + The *sql* string must contain a ``{parquet}`` placeholder which will + be replaced with ``read_parquet('')``. + Example: ``SELECT * FROM {parquet} AS t LIMIT 10`` + + This gives efficient column-pruned / row-group-skipped reads on + large parquet files without loading the full table into memory. + """ + import duckdb + + path = self.get_parquet_path(table_name) + path_escaped = str(path).replace("\\", "\\\\").replace("'", "''") + if "{parquet}" not in sql: + raise ValueError("SQL must contain {parquet} placeholder") + full_sql = sql.format(parquet=f"read_parquet('{path_escaped}')") + conn = duckdb.connect(":memory:") + try: + return conn.execute(full_sql).fetchdf() + finally: + conn.close() + + def refresh_parquet_from_arrow( + self, + table_name: str, + table: pa.Table, + compression: str = DEFAULT_COMPRESSION, + ) -> tuple[TableMetadata, bool]: + """ + Refresh a parquet table with new Arrow data. + + Returns ``(new_metadata, data_changed)``. 
+ """ + old_meta = self.get_table_metadata(table_name) + if old_meta is None: + raise FileNotFoundError(f"Table not found: {table_name}") + + new_hash = compute_arrow_table_hash(table) + if old_meta.content_hash == new_hash: + old_meta.last_synced = datetime.now(timezone.utc) + self.add_table_metadata(old_meta) + logger.info(f"Table {table_name} unchanged (hash: {new_hash[:8]}…)") + return old_meta, False + + loader_metadata = { + 'loader_type': old_meta.loader_type, + 'loader_params': old_meta.loader_params, + 'source_table': old_meta.source_table, + 'source_query': old_meta.source_query, + } + new_meta = self.write_parquet_from_arrow( + table=table, + table_name=table_name, + compression=compression, + loader_metadata=loader_metadata, + ) + logger.info(f"Refreshed {table_name}: {old_meta.row_count} → {new_meta.row_count} rows") + return new_meta, True + + def refresh_parquet( + self, + table_name: str, + df: pd.DataFrame, + compression: str = DEFAULT_COMPRESSION, + ) -> tuple[TableMetadata, bool]: + """Refresh a parquet table with new DataFrame data.""" + return self.refresh_parquet_from_arrow( + table_name, pa.Table.from_pandas(df), compression + ) + + def __repr__(self) -> str: + return f"Workspace(identity_id={self._identity_id!r}, path={self._path!r})" + + +class WorkspaceWithTempData: + """ + Context manager that temporarily adds temp data (list of {name, rows}) to a workspace + as parquet tables, yields the same workspace, and removes those tables on exit. + + Use when the client sends in-memory data (e.g. language == "python"): wrap the + workspace so temp tables are visible for the block and then cleaned up. + """ + + def __init__(self, workspace: Workspace, temp_data: Optional[list[dict[str, Any]]] = None): + self._workspace = workspace + self._temp_data = temp_data if temp_data else None + self._added_table_names: list[str] = [] + + def __enter__(self) -> Workspace: + if not self._temp_data: + return self._workspace + + for item in self._temp_data: + base_name = item.get("name", "table") + name = self._workspace.get_fresh_name(base_name) + rows = item.get("rows", []) + df = pd.DataFrame(rows) if rows else pd.DataFrame() + meta = self._workspace.write_parquet(df, name) + self._added_table_names.append(meta.name) + logger.debug(f"Added temp table {meta.name} to workspace for duration of context") + return self._workspace + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: + for name in self._added_table_names: + self._workspace.delete_table(name) + logger.debug(f"Removed temp table {name} from workspace") + self._added_table_names.clear() \ No newline at end of file diff --git a/py-src/data_formulator/db_manager.py b/py-src/data_formulator/db_manager.py deleted file mode 100644 index 66bf2c8c..00000000 --- a/py-src/data_formulator/db_manager.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- -import duckdb -import pandas as pd -from typing import Dict -import tempfile -import os -from contextlib import contextmanager -from dotenv import load_dotenv -import logging - -logger = logging.getLogger(__name__) - -class DuckDBManager: - def __init__(self, local_db_dir: str, disabled: bool = False): - # Store session db file paths - self._db_files: Dict[str, str] = {} - self._local_db_dir: str = local_db_dir - self._disabled: bool = disabled - - def is_disabled(self) -> bool: - """Check if the database manager is disabled""" - return self._disabled - - @contextmanager - def connection(self, session_id: str): - """Get a DuckDB connection as a context manager that will be closed when exiting the context""" - conn = None - try: - conn = self.get_connection(session_id) - yield conn - finally: - if conn: - conn.close() - - def get_connection(self, session_id: str) -> duckdb.DuckDBPyConnection: - """Internal method to get or create a DuckDB connection for a session""" - if self._disabled: - return duckdb.connect(database=":memory:") - - # Get or create the db file path for this session - if session_id not in self._db_files or self._db_files[session_id] is None: - db_dir = self._local_db_dir if self._local_db_dir else tempfile.gettempdir() - if not os.path.exists(db_dir): - db_dir = tempfile.gettempdir() - db_file = os.path.join(db_dir, f"df_{session_id}.duckdb") - logger.debug(f"=== Creating new db file: {db_file}") - self._db_files[session_id] = db_file - else: - logger.debug(f"=== Using existing db file: {self._db_files[session_id]}") - db_file = self._db_files[session_id] - - # Create a fresh connection to the database file - conn = duckdb.connect(database=db_file) - - return conn - -env = load_dotenv() - -# Initialize the DB manager -db_manager = DuckDBManager( - local_db_dir=os.getenv('LOCAL_DB_DIR'), - disabled=os.getenv('DISABLE_DATABASE', 'false').lower() == 'true' -) \ No newline at end of file diff --git a/py-src/data_formulator/demo_stream_routes.py b/py-src/data_formulator/demo_stream_routes.py index fd1229a4..42bb96e5 100644 --- a/py-src/data_formulator/demo_stream_routes.py +++ b/py-src/data_formulator/demo_stream_routes.py @@ -30,7 +30,7 @@ import math from datetime import datetime, timedelta from flask import Blueprint, Response, request, jsonify -from typing import List, Dict, Any, Optional +from typing import Any from collections import deque import threading @@ -107,9 +107,9 @@ def make_csv_response(rows: list, filename: str = "data.csv") -> Response: # Thread-safe storage for ISS position history _iss_track_lock = threading.Lock() _iss_track_history: deque = deque(maxlen=10000) # Keep last 10000 positions (~20000 min at 5s intervals) -_iss_last_fetch: Optional[datetime] = None +_iss_last_fetch: datetime | None = None -def _fetch_iss_position() -> Optional[Dict[str, Any]]: +def _fetch_iss_position() -> dict[str, Any] | None: """Fetch current ISS position from API""" try: response = requests.get("http://api.open-notify.org/iss-now.json", timeout=10) @@ -1074,7 +1074,7 @@ def get_yfinance_financials(): # Thread-safe storage for sales transaction history _sales_lock = threading.Lock() _sales_history: deque = deque(maxlen=1000) # Keep last 1000 transactions -_sales_last_update: Optional[datetime] = None +_sales_last_update: datetime | None = None # Products with realistic pricing and popularity _SALES_PRODUCTS = [ @@ -1097,7 +1097,7 @@ def get_yfinance_financials(): _SALES_CHANNEL_WEIGHTS = [0.40, 0.35, 0.15, 0.10] -def _generate_sale_transaction(timestamp: datetime) -> 
Dict[str, Any]: +def _generate_sale_transaction(timestamp: datetime) -> dict[str, Any]: """Generate a single sale transaction""" product = random.choices(_SALES_PRODUCTS, weights=[p["popularity"] for p in _SALES_PRODUCTS])[0] region = random.choices(_SALES_REGIONS, weights=_SALES_REGION_WEIGHTS)[0] diff --git a/py-src/data_formulator/sandbox/__init__.py b/py-src/data_formulator/sandbox/__init__.py new file mode 100644 index 00000000..338976ed --- /dev/null +++ b/py-src/data_formulator/sandbox/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from .py_sandbox import ( + run_in_subprocess, + run_in_main_process, +) diff --git a/py-src/data_formulator/py_sandbox.py b/py-src/data_formulator/sandbox/py_sandbox.py similarity index 69% rename from py-src/data_formulator/py_sandbox.py rename to py-src/data_formulator/sandbox/py_sandbox.py index 0bd01228..998e9ea5 100644 --- a/py-src/data_formulator/py_sandbox.py +++ b/py-src/data_formulator/sandbox/py_sandbox.py @@ -80,9 +80,10 @@ def run_in_main_process(code, allowed_objects): # List of allowed modules for import ALLOWED_MODULES = { - 'pandas', 'numpy', 'math', 'datetime', 'json', - 'statistics', 'random', 'collections', 're', - 'itertools', 'functools', 'operator', 'sklearn', 'time' + 'pandas', 'numpy', 'math', 'datetime', 'json', + 'statistics', 'random', 'collections', 're', + 'itertools', 'functools', 'operator', 'sklearn', 'scipy', 'time', + 'duckdb' # Added for unified Python+SQL execution } # Custom import function that only allows safe modules and their submodules @@ -112,60 +113,66 @@ def safe_import(name, *args, **kwargs): return {'status': 'ok', 'allowed_objects': {key: restricted_globals[key] for key in allowed_objects}} -def run_transform_in_sandbox2020(code, df_list, exec_python_in_subprocess=False): - - allowed_objects = { - 'df_list': df_list, - 'output_df': None - } +def run_unified_transform_in_sandbox( + code: str, + workspace_path: str, + output_variable: str, + exec_python_in_subprocess: bool = False +) -> dict: + """ + Execute Python script with DuckDB and pandas in workspace directory. + This is used by the unified agent that generates Python scripts combining SQL and pandas. 
- assemble_code = f''' -import pandas as pd -import json -{code} -output_df = transform_data(*df_list) -''' - - if exec_python_in_subprocess: - result = run_in_subprocess(assemble_code, allowed_objects) - else: - result = run_in_main_process(assemble_code, allowed_objects) - - if result['status'] == 'ok': - result_df = result['allowed_objects']['output_df'] - return { - 'status': 'ok', - 'content': result_df - } - else: - return { - 'status': 'error', - 'content': result['error_message'] - } + Args: + code: Python script to execute (not a function, just a script) + workspace_path: Path to workspace directory (script will run with this as cwd) + output_variable: Name of variable containing result DataFrame + exec_python_in_subprocess: Whether to use subprocess execution + Returns: + dict with status='ok'/'error' and content=DataFrame or error message + """ + import os -def run_derive_concept(code, output_field_name, table_rows, exec_python_in_subprocess=False): - """given a concept derivation function, execute the function on inputs to generate a new dataframe""" - - assemble_code = f''' -import pandas as pd -{code} -new_column = derive_new_column(df) -''' + # Save current directory + original_cwd = os.getcwd() - allowed_objects = { - 'df': pd.DataFrame.from_records(table_rows), - 'new_column': None # the return value of the derive_new_column function - } + try: + # Change to workspace directory so script can access files directly + os.chdir(workspace_path) + + allowed_objects = { + output_variable: None # Will be populated by script + } - if exec_python_in_subprocess: - result = run_in_subprocess(assemble_code, allowed_objects) - else: - result = run_in_main_process(assemble_code, allowed_objects) - - if result['status'] == 'ok': - result_df = result['allowed_objects']['df'] - result_df[output_field_name] = result['allowed_objects']['new_column'] - return { 'status': 'ok', 'content': result_df } - else: - return { 'status': 'error', 'content': result['error_message'] } \ No newline at end of file + # Execute the script directly (no function wrapper) + if exec_python_in_subprocess: + result = run_in_subprocess(code, allowed_objects) + else: + result = run_in_main_process(code, allowed_objects) + + if result['status'] == 'ok': + output_df = result['allowed_objects'][output_variable] + + # Validate output is a DataFrame + if not isinstance(output_df, pd.DataFrame): + return { + 'status': 'error', + 'content': f'Output variable "{output_variable}" is not a DataFrame (type: {type(output_df).__name__})' + } + + return { + 'status': 'ok', + 'content': output_df + } + else: + return result + + except Exception as e: + return { + 'status': 'error', + 'content': f"Error during execution setup: {type(e).__name__} - {str(e)}" + } + finally: + # Always restore original directory + os.chdir(original_cwd) diff --git a/py-src/data_formulator/security/__init__.py b/py-src/data_formulator/security/__init__.py deleted file mode 100644 index a536b8d0..00000000 --- a/py-src/data_formulator/security/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- -from .query_validator import validate_sql_query - -__all__ = [ 'validate_sql_query'] \ No newline at end of file diff --git a/py-src/data_formulator/security/query_validator.py b/py-src/data_formulator/security/query_validator.py deleted file mode 100644 index 8aa03db9..00000000 --- a/py-src/data_formulator/security/query_validator.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import re -import logging -from typing import Tuple, Dict, Any - -logger = logging.getLogger(__name__) - - -class QueryValidationError(Exception): - """Custom exception for query validation failures""" - pass - - -def normalize_query(query: str) -> str: - """ - Normalize query for case-insensitive matching - """ - query_normalized = re.sub(r'--.*$', '', query, flags=re.MULTILINE) # Single line comments - query_normalized = re.sub(r'/\*.*?\*/', '', query_normalized, flags=re.DOTALL) # Multi-line comments - return query_normalized.strip().lower() - -def validate_sql_query(query: str) -> Tuple[bool, str]: - """ - Simple regex-based SQL query validation for dangerous operations. - - Args: - query: SQL query string to validate - - Returns: - Tuple of (is_valid, error_message) - """ - try: - # Normalize query for case-insensitive matching - query_normalized = normalize_query(query) - - # Remove SQL comments - - - # Define dangerous patterns as regex patterns - dangerous_patterns = { - # File read operations - 'file_read_operations': [ - r'\bread_csv_auto\b', r'\bread_csv\b', r'\bread_json\b', r'\bread_parquet\b', - r'\bread_ndjson\b', r'\bread_delim\b', r'\bread_fwf\b', r'\bread_excel\b', - r'\bread_sql\b', r'\bread_table\b', r'\bread_html\b', r'\bread_xml\b', - r'\bread_feather\b', r'\bread_hdf\b', r'\bread_stata\b', r'\bread_sas\b', - r'\bread_spss\b', r'\bread_rdata\b', r'\bread_rds\b' - ], - - # File write operations - 'file_write_operations': [ - r'\bwrite_csv\b', r'\bwrite_json\b', r'\bwrite_parquet\b', r'\bwrite_excel\b', - r'\bwrite_sql\b', r'\bwrite_table\b', r'\bwrite_html\b', r'\bwrite_xml\b', - r'\bwrite_feather\b', r'\bwrite_hdf\b', r'\bwrite_stata\b', r'\bwrite_sas\b', - r'\bwrite_spss\b', r'\bwrite_rdata\b', r'\bwrite_rds\b' - ], - - # File system operations - 'file_system_operations': [ - r'\bglob\b', r'\bcopy\b', r'\bmove\b', r'\brename\b', r'\bdelete\b', - r'\bremove\b', r'\bunlink\b', r'\bmkdir\b', r'\bmakedirs\b', r'\brmdir\b', - r'\bremovedirs\b', r'\bchmod\b', r'\bchown\b', r'\bsymlink\b', r'\blink\b', - r'\btouch\b', r'\btruncate\b', r'\bwrite\b', r'\bappend\b' - ], - - # System operations - 'system_operations': [ - r'\bsystem\b', r'\bexec\b', r'\beval\b', r'\bcompile\b', r'\bexecfile\b', - r'\binput\b', r'\bos\.system\b', r'\bos\.popen\b', r'\bos\.spawn\b', - r'\bos\.fork\b', r'\bos\.kill\b', r'\bsubprocess\b', r'\bsubprocess\.call\b', - r'\bsubprocess\.run\b', r'\bsubprocess\.popen\b', r'\bsubprocess\.check_call\b', - r'\bsubprocess\.check_output\b' - ], - - # Network operations - 'network_operations': [ - r'\burllib\b', r'\brequests\b', r'\bhttp://\b', r'\bhttps://\b', r'\bftp://\b', - r'\bsmtp\b', r'\bpop3\b', r'\bsocket\b', r'\btelnet\b', r'\bssh\b', r'\bscp\b', - r'\bwget\b', r'\bcurl\b' - ], - - # Shell operations - 'shell_operations': [ - r'\bshell\b', r'\bcmd\b', r'\bbash\b', r'\bsh\b', r'\bpowershell\b', - r'\bcmd\.exe\b', r'\bcommand\b', r'\bexecute\b', r'\brun\b', r'\bcall\b', - r'\binvoke\b' - ], - - # DuckDB dangerous operations - 'duckdb_dangerous_operations': [ - r'\binstall\b', r'\bload\b', r'\bunload\b', 
r'\bexport\b', r'\bimport\b', - r'\bcopy_to\b' - ], - - # SQL injection patterns - 'sql_injection_patterns': [ - r';\s*--', # Comment after semicolon - r';\s*/\*', # Block comment after semicolon - r'\bunion\s+all\s+select\b', # UNION ALL SELECT - r'\bunion\s+select\b', # UNION SELECT - r'\bxp_cmdshell\b', # SQL Server command shell - r'\bsp_executesql\b', # SQL Server dynamic SQL - ], - - # Dangerous SQL keywords - 'dangerous_sql_keywords': [ - r'\binsert\b', r'\bupdate\b', r'\bdelete\b', r'\bdrop\b', r'\bcreate\b', - r'\balter\b', r'\btruncate\b', r'\bgrant\b', r'\brevoke\b', r'\bexecute\b', - r'\bexec\b', r'\bcall\b', r'\bbegin\b', r'\bcommit\b', r'\brollback\b' - ], - - # File path patterns - 'file_path_patterns': [ - r'file://', r'file:///', r'c:\\', r'd:\\', r'e:\\', - r'/etc/', r'/var/', r'/tmp/', r'/home/', r'/root/', - r'/usr/', r'/bin/', r'/sbin/', r'http://', r'https://', - r'ftp://', r'sftp://', r'ssh://' - ] - } - - # Check each category of dangerous patterns - for category, patterns in dangerous_patterns.items(): - for pattern in patterns: - if re.search(pattern, query_normalized, re.IGNORECASE): - return False, f"Dangerous {category.replace('_', ' ')} detected: {pattern}" - - # Check for file paths in string literals - string_literals = re.findall(r"'([^']*)'", query_normalized) + re.findall(r'"([^"]*)"', query_normalized) - for literal in string_literals: - for pattern in dangerous_patterns['file_path_patterns']: - if re.search(pattern, literal, re.IGNORECASE): - return False, f"Dangerous file path detected in string literal: {literal}" - - return True, "Query validation passed" - - except Exception as e: - logger.error(f"Error during query validation: {e}") - return False, f"Query validation error: {str(e)}" - - -def validate_sql_query_strict(query: str) -> Tuple[bool, str]: - """ - Strict validation that only allows SELECT queries and basic operations. 
- - Args: - query: SQL query string to validate - - Returns: - Tuple of (is_valid, error_message) - """ - try: - # Normalize query - query_normalized = normalize_query(query) - - # Check if it's a SELECT query - if not query_normalized.startswith('select'): - return False, "Only SELECT queries are allowed" - - # Perform regular validation - return validate_sql_query(query) - - except Exception as e: - return False, f"Strict validation error: {str(e)}" - diff --git a/py-src/data_formulator/tables_routes.py b/py-src/data_formulator/tables_routes.py index 80bdeba9..147abf89 100644 --- a/py-src/data_formulator/tables_routes.py +++ b/py-src/data_formulator/tables_routes.py @@ -9,660 +9,593 @@ mimetypes.add_type('application/javascript', '.mjs') import json import traceback -from flask import request, send_from_directory, session, jsonify, Blueprint +from flask import request, jsonify, Blueprint import pandas as pd -import random -import string from pathlib import Path -import uuid -from data_formulator.db_manager import db_manager from data_formulator.data_loader import DATA_LOADERS +from data_formulator.auth import get_identity_id +from data_formulator.datalake.workspace import Workspace +from data_formulator.datalake.parquet_utils import sanitize_table_name as parquet_sanitize_table_name +from data_formulator.datalake.file_manager import save_uploaded_file, is_supported_file +from data_formulator.datalake.metadata import TableMetadata as DatalakeTableMetadata import re -from typing import Tuple # Get logger for this module (logging config done in app.py) logger = logging.getLogger(__name__) import os -import tempfile tables_bp = Blueprint('tables', __name__, url_prefix='/api/tables') + +def _get_workspace(): + """Get workspace for the current identity.""" + return Workspace(get_identity_id()) + + +# Row-count threshold above which we use DuckDB for parquet tables +# (avoids loading the entire file into memory via pandas). +_LARGE_TABLE_THRESHOLD = 100_000 + + +def _should_use_duckdb(workspace, table_name: str) -> bool: + """Return True if the table is a large parquet file that benefits from DuckDB. + + Small parquet tables are faster to handle with pandas (avoids DuckDB + connection overhead and repeated YAML reads). + """ + meta = workspace.get_table_metadata(table_name) + if meta is None or meta.file_type != "parquet": + return False + row_count = meta.row_count or 0 + return row_count > _LARGE_TABLE_THRESHOLD + + +def _quote_duckdb(col: str) -> str: + """Quote identifier for DuckDB (double quotes, escape internal quotes).""" + return '"' + str(col).replace('"', '""') + '"' + + +def _dedup_dataframe_columns(df: pd.DataFrame) -> pd.DataFrame: + """Remove duplicate columns from a DataFrame, keeping the first occurrence.""" + if df.columns.duplicated().any(): + return df.loc[:, ~df.columns.duplicated()] + return df + + +def _dedup_list(items: list) -> list: + """Remove duplicates from a list while preserving order.""" + return list(dict.fromkeys(items)) + + +def _build_parquet_sample_sql( + columns: list[str], + aggregate_fields_and_functions: list, + select_fields: list, + method: str, + order_by_fields: list, + sample_size: int, +) -> tuple[str, str]: + """ + Build DuckDB SQL for sampling (and optional aggregation) over parquet. + Returns (main_sql, count_sql) where each contains {parquet} placeholder. 
+ """ + valid_agg = [(f, fn) for (f, fn) in aggregate_fields_and_functions if f is None or f in columns] + valid_select = _dedup_list([f for f in select_fields if f in columns]) + valid_order = [f for f in order_by_fields if f in columns] + + if valid_agg: + select_parts = [] + for field, function in valid_agg: + fn = function.lower() + if field is None and fn == "count": + select_parts.append("COUNT(*) AS _count") + elif field in columns: + q = _quote_duckdb(field) + if fn == "count": + select_parts.append(f"COUNT({q}) AS _count") + elif fn in ("avg", "average", "mean"): + select_parts.append(f"AVG({q}) AS {_quote_duckdb(field + '_' + function)}") + elif fn == "sum": + select_parts.append(f"SUM({q}) AS {_quote_duckdb(field + '_sum')}") + elif fn == "min": + select_parts.append(f"MIN({q}) AS {_quote_duckdb(field + '_min')}") + elif fn == "max": + select_parts.append(f"MAX({q}) AS {_quote_duckdb(field + '_max')}") + for f in valid_select: + select_parts.append(f"t.{_quote_duckdb(f)}") + group_cols = valid_select + group_by = f" GROUP BY {', '.join('t.' + _quote_duckdb(c) for c in group_cols)}" if group_cols else "" + inner = f"SELECT {', '.join(select_parts)} FROM {{parquet}} AS t{group_by}" + count_sql = f"SELECT COUNT(*) FROM ({inner}) AS sub" + if method == "random": + order_by = " ORDER BY RANDOM()" + elif method == "head" and valid_order: + order_by = " ORDER BY " + ", ".join(f"sub.{_quote_duckdb(c)} ASC" for c in valid_order) + elif method == "bottom" and valid_order: + order_by = " ORDER BY " + ", ".join(f"sub.{_quote_duckdb(c)} DESC" for c in valid_order) + else: + order_by = "" + main_sql = f"SELECT * FROM ({inner}) AS sub{order_by} LIMIT {sample_size}" + return main_sql, count_sql + + count_sql = "SELECT COUNT(*) FROM {parquet} AS t" + if method == "random": + order_by = " ORDER BY RANDOM()" + elif method == "head" and valid_order: + order_by = " ORDER BY " + ", ".join(f"t.{_quote_duckdb(c)} ASC" for c in valid_order) + elif method == "bottom" and valid_order: + order_by = " ORDER BY " + ", ".join(f"t.{_quote_duckdb(c)} DESC" for c in valid_order) + else: + order_by = "" + if valid_select: + select_list = ", ".join(f"t.{_quote_duckdb(c)}" for c in valid_select) + main_sql = f"SELECT {select_list} FROM {{parquet}} AS t{order_by} LIMIT {sample_size}" + else: + main_sql = f"SELECT * FROM {{parquet}} AS t{order_by} LIMIT {sample_size}" + return main_sql, count_sql + + +def _table_metadata_to_source_metadata(meta: DatalakeTableMetadata) -> dict | None: + """Convert workspace TableMetadata to API source_metadata dict (for refresh).""" + if meta.loader_type is None and meta.loader_params is None: + return None + return { + "table_name": meta.name, + "data_loader_type": meta.loader_type or "", + "data_loader_params": meta.loader_params or {}, + "source_table_name": meta.source_table, + "source_query": meta.source_query, + "last_refreshed": meta.last_synced.isoformat() if meta.last_synced else None, + "content_hash": meta.content_hash, + } + + @tables_bp.route('/list-tables', methods=['GET']) def list_tables(): - """List all tables in the current session""" + """List all tables in the current workspace (datalake).""" try: + workspace = _get_workspace() result = [] - with db_manager.connection(session['session_id']) as db: - table_metadata_list = db.execute(""" - SELECT database_name, schema_name, table_name, schema_name==current_schema() as is_current_schema, 'table' as object_type - FROM duckdb_tables() - WHERE internal=False AND database_name == current_database() - UNION ALL - SELECT 
database_name, schema_name, view_name as table_name, schema_name==current_schema() as is_current_schema, 'view' as object_type - FROM duckdb_views() - WHERE view_name NOT LIKE 'duckdb_%' AND view_name NOT LIKE 'sqlite_%' AND view_name NOT LIKE 'pragma_%' AND database_name == current_database() - """).fetchall() - - - for table_metadata in table_metadata_list: - [database_name, schema_name, table_name, is_current_schema, object_type] = table_metadata - table_name = table_name if is_current_schema else '.'.join([database_name, schema_name, table_name]) - - # Skip system databases and internal metadata tables - if database_name in ['system', 'temp']: - continue - if table_name.startswith('_df_'): # Internal Data Formulator metadata tables + for table_name in workspace.list_tables(): + try: + meta = workspace.get_table_metadata(table_name) + if meta is None: continue - - try: - # Get column information - columns = db.execute(f"DESCRIBE {table_name}").fetchall() - # Get row count - row_count = db.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0] - sample_rows = db.execute(f"SELECT * FROM {table_name} LIMIT 1000").fetchdf() if row_count > 0 else pd.DataFrame() - - # Check if this is a view or a table + columns = [{"name": c.name, "type": c.dtype} for c in (meta.columns or [])] + if not columns and meta.file_type == "parquet": try: - # Get both view existence and source in one query - view_info = db.execute(f"SELECT view_name, sql FROM duckdb_views() WHERE view_name = '{table_name}'").fetchone() - view_source = view_info[1] if view_info else None - except Exception as e: - # If the query fails, assume it's a regular table - view_source = None - - # Get source metadata if available (for refreshable tables) - source_metadata = None + schema_info = workspace.get_parquet_schema(table_name) + columns = [{"name": c["name"], "type": c["type"]} for c in schema_info.get("columns", [])] + except Exception: + pass + row_count = meta.row_count + if row_count is None and meta.file_type == "parquet": try: - source_metadata = get_table_metadata(db, table_name) + schema_info = workspace.get_parquet_schema(table_name) + row_count = schema_info.get("num_rows", 0) or 0 except Exception: - pass # Metadata table may not exist yet - - result.append({ - "name": table_name, - "columns": [{"name": col[0], "type": col[1]} for col in columns], - "row_count": row_count, - "sample_rows": json.loads(sample_rows.to_json(orient='records', date_format='iso')), - "view_source": view_source, - "source_metadata": source_metadata - }) - - except Exception as e: - logger.error(f"Error getting table metadata for {table_name}: {str(e)}") - continue - - return jsonify({ - "status": "success", - "tables": result - }) + row_count = 0 + if row_count is None: + row_count = 0 + sample_rows = [] + if row_count > 0: + try: + if _should_use_duckdb(workspace, table_name): + df = workspace.run_parquet_sql(table_name, "SELECT * FROM {parquet} AS t LIMIT 1000") + else: + df = workspace.read_data_as_df(table_name) + df = df.head(1000) + df = _dedup_dataframe_columns(df) + sample_rows = json.loads(df.to_json(orient='records', date_format='iso')) + except Exception: + pass + source_metadata = _table_metadata_to_source_metadata(meta) + result.append({ + "name": table_name, + "columns": columns, + "row_count": row_count, + "sample_rows": sample_rows, + "view_source": None, + "source_metadata": source_metadata, + }) + except Exception as e: + logger.error(f"Error getting table metadata for {table_name}: {str(e)}") + continue + return 
jsonify({"status": "success", "tables": result}) except Exception as e: safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code + return jsonify({"status": "error", "message": safe_msg}), status_code -def assemble_query(aggregate_fields_and_functions, group_fields, columns, table_name): +def _apply_aggregation_and_sample( + df: pd.DataFrame, + aggregate_fields_and_functions: list, + select_fields: list, + method: str, + order_by_fields: list, + sample_size: int, +) -> tuple[pd.DataFrame, int]: """ - Assembles a SELECT query string based on binning, aggregation, and grouping specifications. - - Args: - bin_fields (list): Fields to be binned into ranges - aggregate_fields_and_functions (list): List of tuples (field, function) for aggregation - group_fields (list): Fields to group by - columns (list): All available column names - - Returns: - str: The assembled SELECT query projection part + Apply aggregation (optional), then sample with ordering. + Returns (sampled_df, total_row_count_after_aggregation). """ - select_parts = [] - output_column_names = [] - - # Handle aggregate fields and functions - for field, function in aggregate_fields_and_functions: - if field is None: - # Handle count(*) case - if function.lower() == 'count': - select_parts.append('COUNT(*) as _count') - output_column_names.append('_count') - elif field in columns: - if function.lower() == 'count': - alias = f'_count' - select_parts.append(f'COUNT(*) as "{alias}"') - output_column_names.append(alias) - else: - # Sanitize function name and create alias - if function in ["avg", "average", "mean"]: - aggregate_function = "AVG" - else: - aggregate_function = function.upper() - - alias = f'{field}_{function}' - select_parts.append(f'{aggregate_function}("{field}") as "{alias}"') - output_column_names.append(alias) - - # Handle group fields - for field in group_fields: - if field in columns: - select_parts.append(f'"{field}"') - output_column_names.append(field) - # If no fields are specified, select all columns - if not select_parts: - select_parts = ["*"] - output_column_names = columns - - from_clause = f"FROM {table_name}" - group_by_clause = f"GROUP BY {', '.join(group_fields)}" if len(group_fields) > 0 and len(aggregate_fields_and_functions) > 0 else "" - - query = f"SELECT {', '.join(select_parts)} {from_clause} {group_by_clause}" - return query, output_column_names + columns = list(df.columns) + valid_agg = [ + (f, fn) for (f, fn) in aggregate_fields_and_functions + if f is None or f in columns + ] + valid_select = _dedup_list([f for f in select_fields if f in columns]) + valid_order = [f for f in order_by_fields if f in columns] + + if valid_agg: + group_cols = valid_select + agg_spec = {} + for field, function in valid_agg: + fn = function.lower() + if field is None and fn == "count": + agg_spec["_count"] = ("__size__", "size") + elif field in columns: + if fn == "count": + agg_spec["_count"] = (field, "count") + elif fn in ("avg", "average", "mean"): + agg_spec[f"{field}_{function}"] = (field, "mean") + elif fn == "sum": + agg_spec[f"{field}_sum"] = (field, "sum") + elif fn == "min": + agg_spec[f"{field}_min"] = (field, "min") + elif fn == "max": + agg_spec[f"{field}_max"] = (field, "max") + if "_count" in agg_spec and agg_spec["_count"] == ("__size__", "size"): + df = df.assign(__size__=1) + agg_spec["_count"] = ("__size__", "count") + if group_cols: + df_agg = df.groupby(group_cols, dropna=False).agg(**{k: (c, f) for k, (c, f) in 
agg_spec.items()}).reset_index() + else: + df_agg = pd.DataFrame([{k: df[c].agg(f) for k, (c, f) in agg_spec.items()}]) + total_row_count = len(df_agg) + work = df_agg + else: + total_row_count = len(df) + work = df[valid_select].copy() if valid_select else df.copy() + + if method == "random": + work = work.sample(n=min(sample_size, len(work)), random_state=None) + elif method == "head": + work = work.sort_values(by=valid_order, ascending=True).head(sample_size) if valid_order else work.head(sample_size) + elif method == "bottom": + work = work.sort_values(by=valid_order, ascending=False).head(sample_size) if valid_order else work.tail(sample_size).iloc[::-1].reset_index(drop=True) + else: + work = work.head(sample_size) + return work, total_row_count + @tables_bp.route('/sample-table', methods=['POST']) def sample_table(): - """Sample a table""" + """Sample a table from the workspace. Uses DuckDB for parquet (no full load).""" try: data = request.get_json() table_id = data.get('table') sample_size = data.get('size', 1000) - aggregate_fields_and_functions = data.get('aggregate_fields_and_functions', []) # each element is a tuple (field, function) - select_fields = data.get('select_fields', []) # if empty, we want to include all fields - method = data.get('method', 'random') # one of 'random', 'head', 'bottom' + aggregate_fields_and_functions = data.get('aggregate_fields_and_functions', []) + select_fields = data.get('select_fields', []) + method = data.get('method', 'random') order_by_fields = data.get('order_by_fields', []) - - total_row_count = 0 - # Validate field names against table columns to prevent SQL injection - with db_manager.connection(session['session_id']) as db: - # Get valid column names - columns = [col[0] for col in db.execute(f"DESCRIBE {table_id}").fetchall()] - - - # Filter order_by_fields to only include valid column names - valid_order_by_fields = [field for field in order_by_fields if field in columns] - valid_aggregate_fields_and_functions = [ - field_and_function for field_and_function in aggregate_fields_and_functions - if field_and_function[0] is None or field_and_function[0] in columns - ] - valid_select_fields = [field for field in select_fields if field in columns] - - query, output_column_names = assemble_query(valid_aggregate_fields_and_functions, valid_select_fields, columns, table_id) - - - # Modify the original query to include the count: - count_query = f"SELECT *, COUNT(*) OVER () as total_count FROM ({query}) as subq LIMIT 1" - result = db.execute(count_query).fetchone() - total_row_count = result[-1] if result else 0 - - - # Add ordering and limit to the main query - if method == 'random': - query += f" ORDER BY RANDOM() LIMIT {sample_size}" - elif method == 'head': - if valid_order_by_fields: - # Build ORDER BY clause with validated fields - order_by_clause = ", ".join([f'"{field}"' for field in valid_order_by_fields]) - query += f" ORDER BY {order_by_clause} LIMIT {sample_size}" - else: - query += f" LIMIT {sample_size}" - elif method == 'bottom': - if valid_order_by_fields: - # Build ORDER BY clause with validated fields in descending order - order_by_clause = ", ".join([f'"{field}" DESC' for field in valid_order_by_fields]) - query += f" ORDER BY {order_by_clause} LIMIT {sample_size}" - else: - query += f" ORDER BY ROWID DESC LIMIT {sample_size}" - - - result = db.execute(query).fetchdf() - - + workspace = _get_workspace() + if _should_use_duckdb(workspace, table_id): + schema_info = workspace.get_parquet_schema(table_id) + columns = [c["name"] 
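Editor's note: the pandas fallback above relies on named aggregation, `agg(alias=(column, function))`, followed by a capped sampling step. A minimal sketch of that pattern with made-up data:

```python
# Named aggregation + sampling, mirroring the non-parquet path of
# _apply_aggregation_and_sample. Data and column names are hypothetical.
import pandas as pd

df = pd.DataFrame({
    "region": ["east", "east", "west"],
    "price": [10.0, 20.0, 5.0],
})

agg = df.groupby(["region"], dropna=False).agg(
    price_avg=("price", "mean"),
    _count=("price", "count"),
).reset_index()

total_row_count = len(agg)
sample = agg.sample(n=min(1000, len(agg)))   # "random" method, capped at sample_size
print(total_row_count, sample)
```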
for c in schema_info.get("columns", [])] + main_sql, count_sql = _build_parquet_sample_sql( + columns, + aggregate_fields_and_functions, + select_fields, + method, + order_by_fields, + sample_size, + ) + total_row_count = int(workspace.run_parquet_sql(table_id, count_sql).iloc[0, 0]) + result_df = workspace.run_parquet_sql(table_id, main_sql) + else: + df = workspace.read_data_as_df(table_id) + result_df, total_row_count = _apply_aggregation_and_sample( + df, + aggregate_fields_and_functions, + select_fields, + method, + order_by_fields, + sample_size, + ) + result_df = _dedup_dataframe_columns(result_df) + rows_json = json.loads(result_df.to_json(orient='records', date_format='iso')) return jsonify({ "status": "success", - "rows": json.loads(result.to_json(orient='records', date_format='iso')), - "total_row_count": total_row_count + "rows": rows_json, + "total_row_count": total_row_count, }) except Exception as e: logger.error(f"Error sampling table: {str(e)}") safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code + return jsonify({"status": "error", "message": safe_msg}), status_code @tables_bp.route('/get-table', methods=['GET']) def get_table_data(): - """Get data from a specific table""" + """Get data from a specific table in the workspace. Uses DuckDB for parquet (LIMIT/OFFSET only).""" try: - with db_manager.connection(session['session_id']) as db: + table_name = request.args.get('table_name') + page = int(request.args.get('page', 1)) + page_size = int(request.args.get('page_size', 100)) + offset = (page - 1) * page_size - table_name = request.args.get('table_name') - # Get pagination parameters - page = int(request.args.get('page', 1)) - page_size = int(request.args.get('page_size', 100)) - offset = (page - 1) * page_size - - if not table_name: - return jsonify({ - "status": "error", - "message": "Table name is required" - }), 400 - - # Get total count - total_rows = db.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0] - - # Get paginated data - result = db.execute( - f"SELECT * FROM {table_name} LIMIT {page_size} OFFSET {offset}" - ).fetchall() - - # Get column names - columns = [col[0] for col in db.execute(f"DESCRIBE {table_name}").fetchall()] - - # Convert to list of dictionaries - rows = [dict(zip(columns, row)) for row in result] - - return jsonify({ - "status": "success", - "table_name": table_name, - "columns": columns, - "rows": rows, - "total_rows": total_rows, - "page": page, - "page_size": page_size - }) - + if not table_name: + return jsonify({"status": "error", "message": "Table name is required"}), 400 + + workspace = _get_workspace() + if _should_use_duckdb(workspace, table_name): + count_df = workspace.run_parquet_sql(table_name, "SELECT COUNT(*) FROM {parquet} AS t") + total_rows = int(count_df.iloc[0, 0]) + page_df = workspace.run_parquet_sql( + table_name, + f"SELECT * FROM {{parquet}} AS t LIMIT {page_size} OFFSET {offset}", + ) + page_df = _dedup_dataframe_columns(page_df) + columns = list(page_df.columns) + rows = json.loads(page_df.to_json(orient='records', date_format='iso')) + else: + df = workspace.read_data_as_df(table_name) + df = _dedup_dataframe_columns(df) + total_rows = len(df) + columns = list(df.columns) + page_df = df.iloc[offset : offset + page_size] + rows = json.loads(page_df.to_json(orient='records', date_format='iso')) + + return jsonify({ + "status": "success", + "table_name": table_name, + "columns": columns, + "rows": rows, + "total_rows": total_rows, 
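Editor's note: a hypothetical client call against the `/sample-table` handler above. The URL prefix, table name, and example field values are assumptions; the payload keys and response keys match the route.

```python
# Illustrative request to /sample-table (URL prefix and data are assumptions).
import requests

resp = requests.post(
    "http://localhost:5000/api/tables/sample-table",
    json={
        "table": "sales",
        "size": 500,
        "method": "head",                                    # random | head | bottom
        "select_fields": ["region"],
        "aggregate_fields_and_functions": [["price", "avg"], [None, "count"]],
        "order_by_fields": ["region"],
    },
)
payload = resp.json()
if payload["status"] == "success":
    print(payload["total_row_count"], len(payload["rows"]))
```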
+ "page": page, + "page_size": page_size, + }) except Exception as e: logger.error(f"Error getting table data: {str(e)}") safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code + return jsonify({"status": "error", "message": safe_msg}), status_code @tables_bp.route('/create-table', methods=['POST']) def create_table(): - """Create a new table from uploaded data""" + """Create a new table from uploaded file or raw data in the workspace.""" try: if 'file' not in request.files and 'raw_data' not in request.form: return jsonify({"status": "error", "message": "No file or raw data provided"}), 400 - + table_name = request.form.get('table_name') if not table_name: return jsonify({"status": "error", "message": "No table name provided"}), 400 - - df = None + + workspace = _get_workspace() + base_name = parquet_sanitize_table_name(table_name) + sanitized_table_name = base_name + counter = 1 + while sanitized_table_name in workspace.list_tables(): + sanitized_table_name = f"{base_name}_{counter}" + counter += 1 + if 'file' in request.files: file = request.files['file'] - # Read file based on extension - if file.filename.endswith('.csv'): - df = pd.read_csv(file) - elif file.filename.endswith(('.xlsx', '.xls')): - df = pd.read_excel(file) - elif file.filename.endswith('.json'): - df = pd.read_json(file) - else: + if not file.filename or not is_supported_file(file.filename): return jsonify({"status": "error", "message": "Unsupported file format"}), 400 + meta = save_uploaded_file( + workspace, + file.stream, + file.filename, + table_name=sanitized_table_name, + overwrite=False, + ) + sanitized_table_name = meta.name + row_count = meta.row_count + columns = [c.name for c in (meta.columns or [])] + if row_count is None or not columns: + df = workspace.read_data_as_df(sanitized_table_name) + row_count = len(df) + columns = list(df.columns) else: raw_data = request.form.get('raw_data') try: df = pd.DataFrame(json.loads(raw_data)) except Exception as e: - return jsonify({"status": "error", "message": f"Invalid JSON data: {str(e)}, it must be in the format of a list of dictionaries"}), 400 + return jsonify({"status": "error", "message": f"Invalid JSON data: {str(e)}, it must be a list of dictionaries"}), 400 + workspace.write_parquet(df, sanitized_table_name) + row_count = len(df) + columns = list(df.columns) - if df is None: - return jsonify({"status": "error", "message": "No data provided"}), 400 - - sanitized_table_name = sanitize_table_name(table_name) - - with db_manager.connection(session['session_id']) as db: - # Check if table exists and generate unique name if needed - base_name = sanitized_table_name - counter = 1 - while True: - # Check if table exists - exists = db.execute(f"SELECT COUNT(*) FROM duckdb_tables() WHERE table_name = '{sanitized_table_name}'").fetchone()[0] > 0 - if not exists: - break - # If exists, append counter to base name - sanitized_table_name = f"{base_name}_{counter}" - counter += 1 - - # Create table - db.register('df_temp', df) - db.execute(f"CREATE TABLE {sanitized_table_name} AS SELECT * FROM df_temp") - db.execute("DROP VIEW df_temp") # Drop the temporary view after creating the table - - return jsonify({ - "status": "success", - "table_name": sanitized_table_name, - "row_count": len(df), - "columns": list(df.columns), - "original_name": base_name, # Include the original name in response - "is_renamed": base_name != sanitized_table_name # Flag indicating if name was changed - }) - + return 
jsonify({ + "status": "success", + "table_name": sanitized_table_name, + "row_count": row_count, + "columns": columns, + "original_name": base_name, + "is_renamed": base_name != sanitized_table_name, + }) except Exception as e: logger.error(f"Error creating table: {str(e)}") safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code + return jsonify({"status": "error", "message": safe_msg}), status_code +@tables_bp.route('/sync-table-data', methods=['POST']) +def sync_table_data(): + """Update an existing workspace table's parquet with new row data. + + Used when the frontend has fresher data than the workspace (e.g., from stream refresh) + and needs to sync it so sandbox code reads the latest data. + """ + try: + data = request.get_json() + table_name = data.get('table_name') + rows = data.get('rows') + + if not table_name: + return jsonify({"status": "error", "message": "table_name is required"}), 400 + if rows is None: + return jsonify({"status": "error", "message": "rows is required"}), 400 + + workspace = _get_workspace() + + if table_name not in workspace.list_tables(): + return jsonify({"status": "error", "message": f"Table '{table_name}' not found in workspace"}), 404 + + df = pd.DataFrame(rows) if rows else pd.DataFrame() + workspace.write_parquet(df, table_name) + + return jsonify({ + "status": "success", + "table_name": table_name, + "row_count": len(df), + }) + except Exception as e: + logger.error(f"Error syncing table data: {str(e)}") + safe_msg, status_code = sanitize_db_error_message(e) + return jsonify({"status": "error", "message": safe_msg}), status_code + @tables_bp.route('/delete-table', methods=['POST']) def drop_table(): - """Drop a table or view""" + """Drop a table from the workspace.""" try: data = request.get_json() table_name = data.get('table_name') - if not table_name: return jsonify({"status": "error", "message": "No table name provided"}), 400 - - with db_manager.connection(session['session_id']) as db: - # First check if it exists as a view - view_exists = db.execute(f"SELECT view_name FROM duckdb_views() WHERE view_name = '{table_name}'").fetchone() is not None - if view_exists: - db.execute(f"DROP VIEW IF EXISTS {table_name}") - - # Then check if it exists as a table - table_exists = db.execute(f"SELECT table_name FROM duckdb_tables() WHERE table_name = '{table_name}'").fetchone() is not None - if table_exists: - db.execute(f"DROP TABLE IF EXISTS {table_name}") - - if not view_exists and not table_exists: - return jsonify({ - "status": "error", - "message": f"Table/view '{table_name}' does not exist" - }), 404 - - return jsonify({ - "status": "success", - "message": f"Table/view {table_name} dropped" - }) - + + workspace = _get_workspace() + if not workspace.delete_table(table_name): + return jsonify({"status": "error", "message": f"Table '{table_name}' does not exist"}), 404 + return jsonify({"status": "success", "message": f"Table {table_name} dropped"}) except Exception as e: logger.error(f"Error dropping table: {str(e)}") safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code + return jsonify({"status": "error", "message": safe_msg}), status_code @tables_bp.route('/upload-db-file', methods=['POST']) def upload_db_file(): - """Upload a db file""" - try: - if 'file' not in request.files: - return jsonify({"status": "error", "message": "No file provided"}), 400 - - file = request.files['file'] - if not 
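Editor's note: hypothetical client calls for the `/create-table` (multipart upload) and `/sync-table-data` handlers above. The base URL, file, and table names are placeholders; the form/JSON keys match the routes.

```python
# Illustrative: create a workspace table from a CSV upload, then push fresher rows.
import requests

BASE = "http://localhost:5000/api/tables"   # route prefix is an assumption

with open("sales.csv", "rb") as f:
    resp = requests.post(
        f"{BASE}/create-table",
        data={"table_name": "sales"},
        files={"file": ("sales.csv", f, "text/csv")},
    )
print(resp.json())   # table_name may be renamed if "sales" already exists

resp = requests.post(
    f"{BASE}/sync-table-data",
    json={"table_name": "sales", "rows": [{"region": "east", "price": 10.0}]},
)
print(resp.json())   # status, table_name, row_count
```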
file.filename.endswith('.db'): - return jsonify({"status": "error", "message": "Invalid file format. Only .db files are supported"}), 400 - - # Get the session ID - if 'session_id' not in session: - return jsonify({"status": "error", "message": "No session ID found"}), 400 - - session_id = session['session_id'] - - # Create temp directory if it doesn't exist - temp_dir = os.path.join(tempfile.gettempdir()) - os.makedirs(temp_dir, exist_ok=True) - - # Save the file temporarily to verify it - temp_db_path = os.path.join(temp_dir, f"temp_{session_id}.db") - file.save(temp_db_path) - - # Verify if it's a valid DuckDB file - try: - import duckdb - # Try to connect to the database - conn = duckdb.connect(temp_db_path, read_only=True) - # Try a simple query to verify it's a valid database - conn.execute("SELECT 1").fetchall() - conn.close() - - # If we get here, the file is valid - move it to final location - db_file_path = os.path.join(temp_dir, f"df_{session_id}.db") - os.replace(temp_db_path, db_file_path) - - # Update the db_manager's file mapping - db_manager._db_files[session_id] = db_file_path - - except Exception as db_error: - # Clean up temp file - logger.error(f"Error uploading db file: {str(db_error)}") - if os.path.exists(temp_db_path): - os.remove(temp_db_path) - return jsonify({ - "status": "error", - "message": f"Invalid DuckDB database file." - }), 400 - - return jsonify({ - "status": "success", - "message": "Database file uploaded successfully", - "session_id": session_id - }) - - except Exception as e: - logger.error(f"Error uploading db file: {str(e)}") - safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code + """No longer used: storage is workspace/datalake, not DuckDB. Kept for API compatibility.""" + return jsonify({ + "status": "error", + "message": "Database file upload is no longer supported. Data is stored in the workspace; use create-table with a file or data loaders to add data.", + }), 410 @tables_bp.route('/download-db-file', methods=['GET']) def download_db_file(): - """Download the db file for a session""" - try: - # Check if session exists - if 'session_id' not in session: - return jsonify({ - "status": "error", - "message": "No session ID found" - }), 400 - - session_id = session['session_id'] - - # Get the database file path from db_manager - if session_id not in db_manager._db_files: - return jsonify({ - "status": "error", - "message": "No database file found for this session" - }), 404 - - db_file_path = db_manager._db_files[session_id] - - # Check if file exists - if not os.path.exists(db_file_path): - return jsonify({ - "status": "error", - "message": "Database file not found" - }), 404 - - # Generate a filename for download - download_name = f"data_formulator_{session_id}.db" - - # Return the file as an attachment - return send_from_directory( - os.path.dirname(db_file_path), - os.path.basename(db_file_path), - as_attachment=True, - download_name=download_name, - mimetype='application/x-sqlite3' - ) - - except Exception as e: - logger.error(f"Error downloading db file: {str(e)}") - safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code + """No longer used: storage is workspace/datalake. Kept for API compatibility.""" + return jsonify({ + "status": "error", + "message": "Database file download is no longer supported. 
Data lives in the workspace.", + }), 410 @tables_bp.route('/reset-db-file', methods=['POST']) def reset_db_file(): - """Reset the db file for a session""" + """Reset the workspace for the current session (removes all tables and files).""" try: - if 'session_id' not in session: - return jsonify({ - "status": "error", - "message": "No session ID found" - }), 400 - - session_id = session['session_id'] - - logger.info(f"session_id: {session_id}") - - # First check if there's a reference in db_manager - if session_id in db_manager._db_files: - db_file_path = db_manager._db_files[session_id] - - # Remove the file if it exists - if db_file_path and os.path.exists(db_file_path): - os.remove(db_file_path) - - # Clear the reference - db_manager._db_files[session_id] = None - - # Also check for any temporary files - temp_db_path = os.path.join(tempfile.gettempdir(), f"temp_{session_id}.db") - if os.path.exists(temp_db_path): - os.remove(temp_db_path) - - # Check for the main db file - main_db_path = os.path.join(tempfile.gettempdir(), f"df_{session_id}.db") - if os.path.exists(main_db_path): - os.remove(main_db_path) - - return jsonify({ - "status": "success", - "message": "Database file reset successfully" - }) - + workspace = _get_workspace() + workspace.cleanup() + return jsonify({"status": "success", "message": "Workspace reset successfully"}) except Exception as e: - logger.error(f"Error resetting db file: {str(e)}") + logger.error(f"Error resetting workspace: {str(e)}") safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code + return jsonify({"status": "error", "message": safe_msg}), status_code + +def _is_numeric_duckdb_type(col_type: str) -> bool: + """Return True if DuckDB/parquet type is numeric for min/max/avg.""" + t = (col_type or "").upper() + return any( + t.startswith(k) for k in ("INT", "BIGINT", "SMALLINT", "TINYINT", "DOUBLE", "FLOAT", "REAL", "DECIMAL", "NUMERIC") + ) + -# Example of a more complex query endpoint @tables_bp.route('/analyze', methods=['POST']) def analyze_table(): - """Get basic statistics about a table""" + """Get basic statistics about a table in the workspace. 
Uses DuckDB for parquet (no full load).""" try: data = request.get_json() table_name = data.get('table_name') - if not table_name: return jsonify({"status": "error", "message": "No table name provided"}), 400 - - with db_manager.connection(session['session_id']) as db: - - # Get column information - columns = db.execute(f"DESCRIBE {table_name}").fetchall() - + + workspace = _get_workspace() + if _should_use_duckdb(workspace, table_name): + schema_info = workspace.get_parquet_schema(table_name) + col_infos = schema_info.get("columns", []) stats = [] - for col in columns: - col_name = col[0] - col_type = col[1] - - # Properly quote column names to avoid SQL keywords issues - quoted_col_name = f'"{col_name}"' - - # Basic stats query - stats_query = f""" - SELECT - COUNT(*) as count, - COUNT(DISTINCT {quoted_col_name}) as unique_count, - COUNT(*) - COUNT({quoted_col_name}) as null_count - FROM {table_name} - """ - - # Add numeric stats if applicable - if col_type in ['INTEGER', 'DOUBLE', 'DECIMAL']: - stats_query = f""" - SELECT - COUNT(*) as count, - COUNT(DISTINCT {quoted_col_name}) as unique_count, - COUNT(*) - COUNT({quoted_col_name}) as null_count, - MIN({quoted_col_name}) as min_value, - MAX({quoted_col_name}) as max_value, - AVG({quoted_col_name}) as avg_value - FROM {table_name} - """ - - col_stats = db.execute(stats_query).fetchone() - - # Create a dictionary with appropriate keys based on column type - if col_type in ['INTEGER', 'DOUBLE', 'DECIMAL']: - stats_dict = dict(zip( - ["count", "unique_count", "null_count", "min", "max", "avg"], - col_stats - )) + for col_info in col_infos: + col_name = col_info["name"] + col_type = col_info.get("type", "") + q = _quote_duckdb(col_name) + if _is_numeric_duckdb_type(col_type): + sql = ( + f"SELECT COUNT(*) AS count, COUNT(DISTINCT t.{q}) AS unique_count, " + f"COUNT(*) - COUNT(t.{q}) AS null_count, " + f"MIN(t.{q}) AS min_val, MAX(t.{q}) AS max_val, AVG(t.{q}) AS avg_val " + f"FROM {{parquet}} AS t" + ) + df = workspace.run_parquet_sql(table_name, sql) + row = df.iloc[0] + stats_dict = { + "count": int(row["count"]), + "unique_count": int(row["unique_count"]), + "null_count": int(row["null_count"]), + "min": float(row["min_val"]) if row["min_val"] is not None else None, + "max": float(row["max_val"]) if row["max_val"] is not None else None, + "avg": float(row["avg_val"]) if row["avg_val"] is not None else None, + } else: - stats_dict = dict(zip( - ["count", "unique_count", "null_count"], - col_stats - )) - - stats.append({ - "column": col_name, - "type": col_type, - "statistics": stats_dict - }) - - return jsonify({ - "status": "success", - "table_name": table_name, - "statistics": stats - }) - + sql = ( + f"SELECT COUNT(*) AS count, COUNT(DISTINCT t.{q}) AS unique_count, " + f"COUNT(*) - COUNT(t.{q}) AS null_count FROM {{parquet}} AS t" + ) + df = workspace.run_parquet_sql(table_name, sql) + row = df.iloc[0] + stats_dict = { + "count": int(row["count"]), + "unique_count": int(row["unique_count"]), + "null_count": int(row["null_count"]), + } + stats.append({"column": col_name, "type": col_type, "statistics": stats_dict}) + else: + df = workspace.read_data_as_df(table_name) + stats = [] + for col_name in df.columns: + s = df[col_name] + col_type = str(s.dtype) + stats_dict = { + "count": int(s.count()), + "unique_count": int(s.nunique()), + "null_count": int(s.isna().sum()), + } + if pd.api.types.is_numeric_dtype(s): + stats_dict["min"] = float(s.min()) if s.notna().any() else None + stats_dict["max"] = float(s.max()) if s.notna().any() else 
None + stats_dict["avg"] = float(s.mean()) if s.notna().any() else None + stats.append({"column": col_name, "type": col_type, "statistics": stats_dict}) + + return jsonify({"status": "success", "table_name": table_name, "statistics": stats}) except Exception as e: logger.error(f"Error analyzing table: {str(e)}") safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code + return jsonify({"status": "error", "message": safe_msg}), status_code + def sanitize_table_name(table_name: str) -> str: - """ - Sanitize a table name to be a valid DuckDB table name. - """ - # Sanitize table name: - # 1. Convert to lowercase - # 2. Replace hyphens with underscores - # 3. Replace spaces with underscores - # 4. Remove any other special characters - sanitized_table_name = table_name.lower() - sanitized_table_name = sanitized_table_name.replace('-', '_') - sanitized_table_name = sanitized_table_name.replace(' ', '_') - sanitized_table_name = ''.join(c for c in sanitized_table_name if c.isalnum() or c == '_') - - # Ensure table name starts with a letter - if not sanitized_table_name or not sanitized_table_name[0].isalpha(): - sanitized_table_name = 'table_' + sanitized_table_name - - # Verify we have a valid table name after sanitization - if not sanitized_table_name: - return f'table_{uuid.uuid4()}' - return sanitized_table_name + """Sanitize a table name for use in the workspace.""" + return parquet_sanitize_table_name(table_name) -def sanitize_db_error_message(error: Exception) -> Tuple[str, int]: +def sanitize_db_error_message(error: Exception) -> tuple[str, int]: """ Sanitize error messages before sending to client. Returns a tuple of (sanitized_message, status_code) @@ -687,7 +620,7 @@ def sanitize_db_error_message(error: Exception) -> Tuple[str, int]: # Data loader errors r"Entity ID": (error_msg, 500), - r"session_id": ("session_id not found, please refresh the page", 500), + r"identity": ("Identity not found, please refresh the page", 500), } # Check if error matches any safe pattern @@ -727,246 +660,70 @@ def data_loader_list_data_loaders(): @tables_bp.route('/data-loader/list-tables', methods=['POST']) def data_loader_list_tables(): - """List tables from a data loader""" - + """List tables from a data loader (no workspace needed).""" try: data = request.get_json() data_loader_type = data.get('data_loader_type') data_loader_params = data.get('data_loader_params') - table_filter = data.get('table_filter', None) # New filter parameter + table_filter = data.get('table_filter', None) if data_loader_type not in DATA_LOADERS: return jsonify({"status": "error", "message": f"Invalid data loader type. 
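Editor's note: a hypothetical call to the `/analyze` endpoint above; numeric columns additionally carry min/max/avg in the returned statistics. The URL prefix and table name are assumptions.

```python
# Illustrative per-column statistics request via /analyze.
import requests

resp = requests.post(
    "http://localhost:5000/api/tables/analyze",
    json={"table_name": "sales"},
)
for col in resp.json().get("statistics", []):
    s = col["statistics"]
    print(col["column"], col["type"], s["count"], s["unique_count"], s["null_count"],
          s.get("min"), s.get("max"), s.get("avg"))
```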
Must be one of: {', '.join(DATA_LOADERS.keys())}"}), 400 - with db_manager.connection(session['session_id']) as duck_db_conn: - data_loader = DATA_LOADERS[data_loader_type](data_loader_params, duck_db_conn) - - # Pass table_filter to list_tables if the data loader supports it - if hasattr(data_loader, 'list_tables') and 'table_filter' in data_loader.list_tables.__code__.co_varnames: - tables = data_loader.list_tables(table_filter=table_filter) - else: - tables = data_loader.list_tables() - - return jsonify({ - "status": "success", - "tables": tables - }) + data_loader = DATA_LOADERS[data_loader_type](data_loader_params) + if hasattr(data_loader, 'list_tables') and 'table_filter' in data_loader.list_tables.__code__.co_varnames: + tables = data_loader.list_tables(table_filter=table_filter) + else: + tables = data_loader.list_tables() + return jsonify({"status": "success", "tables": tables}) except Exception as e: logger.error(f"Error listing tables from data loader: {str(e)}") safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code - - -def ensure_table_metadata_table(db_conn): - """ - Ensure the _df_table_source_metadata table exists for storing table source information. - This stores connection info so backend can refresh tables - frontend manages timing/toggle. - """ - db_conn.execute(""" - CREATE TABLE IF NOT EXISTS _df_table_source_metadata ( - table_name VARCHAR PRIMARY KEY, - data_loader_type VARCHAR, - data_loader_params JSON, - source_table_name VARCHAR, - source_query VARCHAR, - last_refreshed TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - content_hash VARCHAR - ) - """) - - # Add content_hash column if it doesn't exist (for existing databases) - try: - db_conn.execute(""" - ALTER TABLE _df_table_source_metadata ADD COLUMN content_hash VARCHAR - """) - except Exception: - # Column already exists - pass - - -def compute_table_content_hash(db_conn, table_name: str) -> str: - """ - Compute a content hash for a table using DuckDB's built-in hash function. 
- Uses a sampling strategy for efficiency with large tables: - - Row count - - Column names - - First 50 rows, last 50 rows, and 50 sampled rows from middle - """ - import hashlib - - # Get row count - row_count = db_conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0] - - # Get column names - columns = db_conn.execute(f"DESCRIBE {table_name}").fetchall() - column_names = [col[0] for col in columns] - - # Build hash components - hash_parts = [ - f"count:{row_count}", - f"cols:{','.join(column_names)}" - ] - - if row_count > 0: - # Sample rows for hashing - # First 50 rows - first_rows = db_conn.execute(f""" - SELECT * FROM {table_name} LIMIT 50 - """).fetchall() - - # Last 50 rows (using row number) - last_rows = db_conn.execute(f""" - SELECT * FROM ( - SELECT *, ROW_NUMBER() OVER () as _rn FROM {table_name} - ) WHERE _rn > {max(0, row_count - 50)} - """).fetchall() - - # Middle sample (every Nth row to get ~50 rows) - if row_count > 100: - step = max(1, (row_count - 100) // 50) - middle_rows = db_conn.execute(f""" - SELECT * FROM ( - SELECT *, ROW_NUMBER() OVER () as _rn FROM {table_name} - ) WHERE _rn > 50 AND _rn <= {row_count - 50} AND (_rn - 50) % {step} = 0 - LIMIT 50 - """).fetchall() - else: - middle_rows = [] - - # Convert rows to strings for hashing - all_sample_rows = first_rows + middle_rows + last_rows - row_strs = [str(row) for row in all_sample_rows] - hash_parts.append(f"rows:{';'.join(row_strs)}") - - # Compute hash - content_str = '|'.join(hash_parts) - return hashlib.md5(content_str.encode()).hexdigest() - - -def save_table_metadata(db_conn, table_name: str, data_loader_type: str, data_loader_params: dict, - source_table_name: str = None, source_query: str = None, content_hash: str = None): - """Save or update table source metadata""" - ensure_table_metadata_table(db_conn) - - # Remove sensitive fields from params before storing - safe_params = {k: v for k, v in data_loader_params.items() if k not in ['password', 'api_key', 'secret']} - - # Compute content hash if not provided - if content_hash is None: - try: - content_hash = compute_table_content_hash(db_conn, table_name) - except Exception as e: - logger.warning(f"Failed to compute content hash for {table_name}: {e}") - content_hash = None - - db_conn.execute(""" - INSERT OR REPLACE INTO _df_table_source_metadata - (table_name, data_loader_type, data_loader_params, source_table_name, source_query, last_refreshed, content_hash) - VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP, ?) - """, [table_name, data_loader_type, json.dumps(safe_params), source_table_name, source_query, content_hash]) - - -def get_table_metadata(db_conn, table_name: str) -> dict: - """Get metadata for a specific table (connection info for refresh)""" - ensure_table_metadata_table(db_conn) - - result = db_conn.execute(""" - SELECT table_name, data_loader_type, data_loader_params, source_table_name, source_query, last_refreshed, content_hash - FROM _df_table_source_metadata - WHERE table_name = ? 
- """, [table_name]).fetchone() - - if result: - return { - "table_name": result[0], - "data_loader_type": result[1], - "data_loader_params": json.loads(result[2]) if result[2] else {}, - "source_table_name": result[3], - "source_query": result[4], - "last_refreshed": str(result[5]) if result[5] else None, - "content_hash": result[6] - } - return None - - -def get_all_table_metadata(db_conn) -> list: - """Get metadata for all tables""" - ensure_table_metadata_table(db_conn) - - results = db_conn.execute(""" - SELECT table_name, data_loader_type, data_loader_params, source_table_name, source_query, last_refreshed, content_hash - FROM _df_table_source_metadata - """).fetchall() - - return [{ - "table_name": r[0], - "data_loader_type": r[1], - "data_loader_params": json.loads(r[2]) if r[2] else {}, - "source_table_name": r[3], - "source_query": r[4], - "last_refreshed": str(r[5]) if r[5] else None, - "content_hash": r[6] - } for r in results] + return jsonify({"status": "error", "message": safe_msg}), status_code @tables_bp.route('/data-loader/ingest-data', methods=['POST']) def data_loader_ingest_data(): - """Ingest data from a data loader""" - + """Ingest data from a data loader into the workspace as parquet.""" try: data = request.get_json() data_loader_type = data.get('data_loader_type') data_loader_params = data.get('data_loader_params') table_name = data.get('table_name') - import_options = data.get('import_options', {}) - - # Extract import options - row_limit = import_options.get('row_limit', 1000000) if import_options else 1000000 - sort_columns = import_options.get('sort_columns', None) if import_options else None - sort_order = import_options.get('sort_order', 'asc') if import_options else 'asc' + import_options = data.get('import_options', {}) or {} + row_limit = import_options.get('row_limit', 1000000) + sort_columns = import_options.get('sort_columns') + sort_order = import_options.get('sort_order', 'asc') if data_loader_type not in DATA_LOADERS: return jsonify({"status": "error", "message": f"Invalid data loader type. Must be one of: {', '.join(DATA_LOADERS.keys())}"}), 400 - with db_manager.connection(session['session_id']) as duck_db_conn: - data_loader = DATA_LOADERS[data_loader_type](data_loader_params, duck_db_conn) - data_loader.ingest_data(table_name, size=row_limit, sort_columns=sort_columns, sort_order=sort_order) - - # Get the actual table name that was created (may be sanitized) - sanitized_name = table_name.split('.')[-1] # Base name - - # Store metadata for refresh capability (include import options for future refresh) - save_table_metadata( - duck_db_conn, - sanitized_name, - data_loader_type, - data_loader_params, - source_table_name=table_name - ) - - return jsonify({ - "status": "success", - "message": "Successfully ingested data from data loader", - "table_name": sanitized_name - }) - + workspace = _get_workspace() + data_loader = DATA_LOADERS[data_loader_type](data_loader_params) + safe_name = parquet_sanitize_table_name(table_name.split('.')[-1] if '.' 
in table_name else table_name) + meta = data_loader.ingest_to_workspace( + workspace, + safe_name, + source_table=table_name, + size=row_limit, + sort_columns=sort_columns, + sort_order=sort_order, + ) + return jsonify({ + "status": "success", + "message": "Successfully ingested data from data loader", + "table_name": meta.name, + }) except Exception as e: logger.error(f"Error ingesting data from data loader: {str(e)}") safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code - + return jsonify({"status": "error", "message": safe_msg}), status_code + @tables_bp.route('/data-loader/view-query-sample', methods=['POST']) def data_loader_view_query_sample(): - """View a sample of data from a query""" - + """View a sample of data from a query (fetches from external source, no workspace).""" try: data = request.get_json() data_loader_type = data.get('data_loader_type') @@ -975,203 +732,171 @@ def data_loader_view_query_sample(): if data_loader_type not in DATA_LOADERS: return jsonify({"status": "error", "message": f"Invalid data loader type. Must be one of: {', '.join(DATA_LOADERS.keys())}"}), 400 - - with db_manager.connection(session['session_id']) as duck_db_conn: - data_loader = DATA_LOADERS[data_loader_type](data_loader_params, duck_db_conn) - sample = data_loader.view_query_sample(query) + data_loader = DATA_LOADERS[data_loader_type](data_loader_params) + if hasattr(data_loader, 'view_query_sample') and callable(getattr(data_loader, 'view_query_sample')): + sample = data_loader.view_query_sample(query) + else: return jsonify({ - "status": "success", - "sample": sample, - "message": "Successfully retrieved query sample" - }) + "status": "error", + "message": "Query sample is only supported for loaders that implement view_query_sample. Use a source table to fetch data.", + }), 400 + return jsonify({"status": "success", "sample": sample, "message": "Successfully retrieved query sample"}) except Exception as e: logger.error(f"Error viewing query sample: {str(e)}") safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "sample": [], - "message": safe_msg - }), status_code - + return jsonify({"status": "error", "sample": [], "message": safe_msg}), status_code -@tables_bp.route('/data-loader/ingest-data-from-query', methods=['POST']) -def data_loader_ingest_data_from_query(): - """Ingest data from a data loader""" +@tables_bp.route('/data-loader/fetch-data', methods=['POST']) +def data_loader_fetch_data(): + """Fetch data from an external data loader and return as JSON rows WITHOUT saving to workspace. + + This is used when storeOnServer=false (local-only / incognito mode). + The data is returned directly to the frontend without being persisted as parquet. + """ try: data = request.get_json() data_loader_type = data.get('data_loader_type') data_loader_params = data.get('data_loader_params') - query = data.get('query') - name_as = data.get('name_as') + table_name = data.get('table_name') + row_limit = data.get('row_limit', 10000) + sort_columns = data.get('sort_columns') + sort_order = data.get('sort_order', 'asc') + + if not data_loader_type or not table_name: + return jsonify({"status": "error", "message": "data_loader_type and table_name are required"}), 400 if data_loader_type not in DATA_LOADERS: return jsonify({"status": "error", "message": f"Invalid data loader type. 
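Editor's note: a hypothetical ingest request against `/data-loader/ingest-data` above. The loader type, connection parameters, and URL prefix are assumptions; the payload keys match the handler.

```python
# Illustrative ingest of an external table into the workspace as parquet.
import requests

resp = requests.post(
    "http://localhost:5000/api/tables/data-loader/ingest-data",
    json={
        "data_loader_type": "mysql",                       # must be a key of DATA_LOADERS
        "data_loader_params": {"host": "db.example.com", "user": "reader"},
        "table_name": "analytics.sales",                   # last component becomes the workspace name
        "import_options": {
            "row_limit": 100_000,
            "sort_columns": ["order_date"],
            "sort_order": "desc",
        },
    },
)
print(resp.json())   # status, message, table_name (sanitized workspace name)
```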
Must be one of: {', '.join(DATA_LOADERS.keys())}"}), 400 - with db_manager.connection(session['session_id']) as duck_db_conn: - data_loader = DATA_LOADERS[data_loader_type](data_loader_params, duck_db_conn) - data_loader.ingest_data_from_query(query, name_as) - - # Store metadata for refresh capability - save_table_metadata( - duck_db_conn, - name_as, - data_loader_type, - data_loader_params, - source_query=query - ) - - return jsonify({ - "status": "success", - "message": "Successfully ingested data from data loader", - "table_name": name_as - }) - + data_loader = DATA_LOADERS[data_loader_type](data_loader_params) + + # Fetch data as DataFrame (not Arrow, since we need JSON output not parquet) + df = data_loader.fetch_data_as_dataframe( + source_table=table_name, + size=row_limit, + sort_columns=sort_columns, + sort_order=sort_order, + ) + + total_row_count = len(df) + # Apply row limit + if len(df) > row_limit: + df = df.head(row_limit) + + df = _dedup_dataframe_columns(df) + rows = json.loads(df.to_json(orient='records', date_format='iso')) + columns = [{"name": col, "type": str(df[col].dtype)} for col in df.columns] + + return jsonify({ + "status": "success", + "rows": rows, + "columns": columns, + "total_row_count": total_row_count, + "row_limit_applied": row_limit, + }) except Exception as e: - logger.error(f"Error ingesting data from data loader: {str(e)}") + logger.error(f"Error fetching data from data loader: {str(e)}") + logger.error(traceback.format_exc()) safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code + return jsonify({"status": "error", "message": safe_msg}), status_code + + +@tables_bp.route('/data-loader/ingest-data-from-query', methods=['POST']) +def data_loader_ingest_data_from_query(): + """Ingest data from a query into the workspace as parquet.""" + return jsonify({ + "status": "error", + "message": "Ingestion from custom query is not supported. Please select a source table to ingest.", + }), 400 @tables_bp.route('/data-loader/refresh-table', methods=['POST']) def data_loader_refresh_table(): - """ - Refresh a table by re-importing data from its original source. - Requires the table to have been imported via a data loader with stored metadata. - Returns content_hash and data_changed flag so frontend can skip resampling if data unchanged. - """ + """Refresh a table by re-fetching from its source and updating parquet in the workspace.""" try: data = request.get_json() table_name = data.get('table_name') - # Allow passing updated connection params (e.g., for password that wasn't stored) updated_params = data.get('data_loader_params', {}) if not table_name: return jsonify({"status": "error", "message": "table_name is required"}), 400 - with db_manager.connection(session['session_id']) as duck_db_conn: - # Get stored metadata - metadata = get_table_metadata(duck_db_conn, table_name) - - if not metadata: - return jsonify({ - "status": "error", - "message": f"No source metadata found for table '{table_name}'. Cannot refresh." 
- }), 400 - - # Get old content hash before refresh - old_content_hash = metadata.get('content_hash') - - data_loader_type = metadata['data_loader_type'] - data_loader_params = {**metadata['data_loader_params'], **updated_params} - - if data_loader_type not in DATA_LOADERS: - return jsonify({ - "status": "error", - "message": f"Unknown data loader type: {data_loader_type}" - }), 400 - - # Create data loader and refresh - data_loader = DATA_LOADERS[data_loader_type](data_loader_params, duck_db_conn) - - if metadata['source_query']: - # Refresh from query - data_loader.ingest_data_from_query(metadata['source_query'], table_name) - elif metadata['source_table_name']: - # Refresh from table - data_loader.ingest_data(metadata['source_table_name'], name_as=table_name) - else: - return jsonify({ - "status": "error", - "message": "No source table or query found in metadata" - }), 400 - - # Compute new content hash after refresh - new_content_hash = compute_table_content_hash(duck_db_conn, table_name) - data_changed = old_content_hash != new_content_hash - - # Update last_refreshed timestamp and content_hash - duck_db_conn.execute(""" - UPDATE _df_table_source_metadata - SET last_refreshed = CURRENT_TIMESTAMP, content_hash = ? - WHERE table_name = ? - """, [new_content_hash, table_name]) - - # Get updated row count - row_count = duck_db_conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0] + workspace = _get_workspace() + meta = workspace.get_table_metadata(table_name) + if meta is None: + return jsonify({"status": "error", "message": f"No table '{table_name}' found. Cannot refresh."}), 400 + if not meta.loader_type: + return jsonify({"status": "error", "message": f"No source metadata for table '{table_name}'. Cannot refresh."}), 400 + + old_content_hash = meta.content_hash + data_loader_type = meta.loader_type + data_loader_params = {**(meta.loader_params or {}), **updated_params} + if data_loader_type not in DATA_LOADERS: + return jsonify({"status": "error", "message": f"Unknown data loader type: {data_loader_type}"}), 400 + + data_loader = DATA_LOADERS[data_loader_type](data_loader_params) + if meta.source_table: + arrow_table = data_loader.fetch_data_as_arrow(source_table=meta.source_table) + else: return jsonify({ - "status": "success", - "message": f"Successfully refreshed table '{table_name}'", - "row_count": row_count, - "content_hash": new_content_hash, - "data_changed": data_changed - }) + "status": "error", + "message": "Refresh is not supported for tables ingested from a query. 
Only table-based sources can be refreshed.", + }), 400 + new_meta, data_changed = workspace.refresh_parquet_from_arrow(table_name, arrow_table) + return jsonify({ + "status": "success", + "message": f"Successfully refreshed table '{table_name}'", + "row_count": new_meta.row_count, + "content_hash": new_meta.content_hash, + "data_changed": data_changed, + }) except Exception as e: logger.error(f"Error refreshing table: {str(e)}") logger.error(traceback.format_exc()) safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code + return jsonify({"status": "error", "message": safe_msg}), status_code @tables_bp.route('/data-loader/get-table-metadata', methods=['POST']) def data_loader_get_table_metadata(): - """Get source metadata for a specific table""" + """Get source metadata for a specific table from workspace.""" try: data = request.get_json() table_name = data.get('table_name') - if not table_name: return jsonify({"status": "error", "message": "table_name is required"}), 400 - with db_manager.connection(session['session_id']) as duck_db_conn: - metadata = get_table_metadata(duck_db_conn, table_name) - - if metadata: - return jsonify({ - "status": "success", - "metadata": metadata - }) - else: - return jsonify({ - "status": "success", - "metadata": None, - "message": f"No metadata found for table '{table_name}'" - }) - + workspace = _get_workspace() + meta = workspace.get_table_metadata(table_name) + metadata = _table_metadata_to_source_metadata(meta) if meta else None + return jsonify({ + "status": "success", + "metadata": metadata, + "message": f"No metadata found for table '{table_name}'" if metadata is None else None, + }) except Exception as e: logger.error(f"Error getting table metadata: {str(e)}") safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code + return jsonify({"status": "error", "message": safe_msg}), status_code @tables_bp.route('/data-loader/list-table-metadata', methods=['GET']) def data_loader_list_table_metadata(): - """Get source metadata for all tables""" + """Get source metadata for all tables in the workspace.""" try: - with db_manager.connection(session['session_id']) as duck_db_conn: - metadata_list = get_all_table_metadata(duck_db_conn) - - return jsonify({ - "status": "success", - "metadata": metadata_list - }) - + workspace = _get_workspace() + metadata_list = [] + for name in workspace.list_tables(): + meta = workspace.get_table_metadata(name) + m = _table_metadata_to_source_metadata(meta) if meta else None + if m: + metadata_list.append(m) + return jsonify({"status": "success", "metadata": metadata_list}) except Exception as e: logger.error(f"Error listing table metadata: {str(e)}") safe_msg, status_code = sanitize_db_error_message(e) - return jsonify({ - "status": "error", - "message": safe_msg - }), status_code \ No newline at end of file + return jsonify({"status": "error", "message": safe_msg}), status_code \ No newline at end of file diff --git a/py-src/data_formulator/workflows/create_vl_plots.py b/py-src/data_formulator/workflows/create_vl_plots.py index 41776fec..be335fad 100644 --- a/py-src/data_formulator/workflows/create_vl_plots.py +++ b/py-src/data_formulator/workflows/create_vl_plots.py @@ -1,6 +1,6 @@ import pandas as pd import numpy as np -from typing import Dict, List, Any, Optional +from typing import Any import vl_convert as vlc import base64 @@ -68,7 +68,7 @@ def detect_field_type(series: 
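Editor's note: a hypothetical refresh call using the `content_hash` / `data_changed` fields returned by `/data-loader/refresh-table` above, so the client can skip resampling when nothing changed. The URL, table name, and passed-in secret are placeholders.

```python
# Illustrative refresh; data_changed tells the frontend whether to resample.
import requests

resp = requests.post(
    "http://localhost:5000/api/tables/data-loader/refresh-table",
    json={"table_name": "sales", "data_loader_params": {"password": "..."}},
)
payload = resp.json()
if payload["status"] == "success" and payload["data_changed"]:
    print("refreshed:", payload["row_count"], payload["content_hash"])
```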
pd.Series) -> str: ] -def get_chart_template(chart_type: str) -> Optional[Dict]: +def get_chart_template(chart_type: str) -> dict | None: """ Find a chart template by chart type name. """ @@ -77,7 +77,7 @@ def get_chart_template(chart_type: str) -> Optional[Dict]: return template return None -def create_chart_spec(df: pd.DataFrame, fields: List[str], chart_type: str) -> Dict[str, Dict[str, str]]: +def create_chart_spec(df: pd.DataFrame, fields: list[str], chart_type: str) -> dict[str, dict[str, str]]: """ Assign fields to appropriate visualization channels based on their data types and chart type. """ @@ -85,7 +85,7 @@ def create_chart_spec(df: pd.DataFrame, fields: List[str], chart_type: str) -> D return assemble_vegailte_chart(df, chart_type, encodings) -def fields_to_encodings(df, chart_type: str, fields: List[str]) -> Dict[str, Dict[str, str]]: +def fields_to_encodings(df, chart_type: str, fields: list[str]) -> dict[str, dict[str, str]]: """ Assign fields to appropriate visualization channels based on their data types and chart type. @@ -389,9 +389,9 @@ def assign_faceting_channels(): def assemble_vegailte_chart( df: pd.DataFrame, chart_type: str, - encodings: Dict[str, Dict[str, str]], + encodings: dict[str, dict[str, str]], max_nominal_values: int = 68 -) -> Dict: +) -> dict: """ Assemble a Vega-Lite chart specification from a dataframe, chart type, and encodings. @@ -418,6 +418,9 @@ def assemble_vegailte_chart( "encoding": {} } + # Remove duplicate columns before converting to records + if df.columns.duplicated().any(): + df = df.loc[:, ~df.columns.duplicated()] # Add data to the spec (inline data from dataframe) table_data = df.to_dict('records') @@ -574,7 +577,7 @@ def _get_top_values(df: pd.DataFrame, field_name: str, unique_values: list, return unique_values[:max_values] -def vl_spec_to_png(spec: Dict, output_path: str = None, scale: float = 1.0) -> bytes: +def vl_spec_to_png(spec: dict, output_path: str | None = None, scale: float = 1.0) -> bytes: """ Convert a Vega-Lite specification to a PNG image. @@ -600,7 +603,7 @@ def vl_spec_to_png(spec: Dict, output_path: str = None, scale: float = 1.0) -> b return png_data -def spec_to_base64(spec: Dict, scale: float = 1.0) -> str: +def spec_to_base64(spec: dict, scale: float = 1.0) -> str: """ Convert a Vega-Lite specification to a base64 encoded PNG string. diff --git a/py-src/data_formulator/workflows/exploration_flow.py b/py-src/data_formulator/workflows/exploration_flow.py index dc241a8b..70292a87 100644 --- a/py-src/data_formulator/workflows/exploration_flow.py +++ b/py-src/data_formulator/workflows/exploration_flow.py @@ -1,26 +1,22 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
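Editor's note: the duplicate-column guard added to assemble_vegailte_chart keeps only the first occurrence of each column name before converting the dataframe to records, since plain dicts cannot carry duplicate keys. A tiny self-contained illustration:

```python
# Keep the first occurrence of each duplicated column name before to_dict('records').
import pandas as pd

df = pd.DataFrame([[1, 2, 3]], columns=["a", "a", "b"])   # hypothetical duplicate columns
if df.columns.duplicated().any():
    df = df.loc[:, ~df.columns.duplicated()]
print(df.to_dict("records"))   # [{'a': 1, 'b': 3}]
```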
-import json import logging -from this import d import pandas as pd -from typing import Dict, List, Any, Optional, Tuple, Generator +from typing import Any, Generator from data_formulator.agents.agent_exploration import ExplorationAgent -from data_formulator.agents.agent_py_data_rec import PythonDataRecAgent -from data_formulator.agents.agent_sql_data_rec import SQLDataRecAgent +from data_formulator.agents.agent_data_rec import DataRecAgent from data_formulator.agents.client_utils import Client -from data_formulator.db_manager import db_manager +from data_formulator.datalake.workspace import WorkspaceWithTempData, Workspace from data_formulator.workflows.create_vl_plots import assemble_vegailte_chart, spec_to_base64, detect_field_type -from data_formulator.agents.agent_utils import extract_json_objects logger = logging.getLogger(__name__) def create_chart_spec_from_data( - transformed_data: Dict[str, Any], + transformed_data: dict[str, Any], chart_type: str, - chart_encodings: Dict[str, str] + chart_encodings: dict[str, str] ) -> str: """ Create a chart from transformed data using Vega-Lite. @@ -59,17 +55,16 @@ def create_chart_spec_from_data( return None def run_exploration_flow_streaming( - model_config: Dict[str, str], - input_tables: List[Dict[str, Any]], - initial_plan: List[str], - language: str = "python", - session_id: Optional[str] = None, + model_config: dict[str, str], + input_tables: list[dict[str, Any]], + initial_plan: list[str], + session_id: str | None = None, exec_python_in_subprocess: bool = False, max_iterations: int = 5, max_repair_attempts: int = 1, agent_exploration_rules: str = "", agent_coding_rules: str = "" -) -> Generator[Dict[str, Any], None, None]: +) -> Generator[dict[str, Any], None, None]: """ Run the complete exploration flow from high-level question to final insights as a streaming generator. 
@@ -77,7 +72,6 @@ def run_exploration_flow_streaming( model_config: Dictionary with endpoint, model, api_key, api_base, api_version input_tables: List of input table dictionaries with 'name' 'rows' and 'attached_metadata' plan: List of steps to continue exploring - language: "python" or "sql" for data transformation session_id: Database session ID for SQL connections exec_python_in_subprocess: Whether to execute Python in subprocess max_iterations: Maximum number of exploration iterations @@ -102,232 +96,220 @@ def run_exploration_flow_streaming( # Initialize client and agents client = Client.from_config(model_config) - if language == "sql": - if session_id: - db_conn = db_manager.get_connection(session_id) - else: - yield { - "iteration": iteration, - "type": "data_transformation", - "content": {}, - "status": "error", - "error_message": "Session ID required for SQL transformations" - } - return - else: - db_conn = None - - # This is the exploration agent that revises the exploration plan - exploration_agent = ExplorationAgent(client, db_conn=db_conn, agent_exploration_rules=agent_exploration_rules) - - # rec agent for data transformation - if language == "sql": - rec_agent = SQLDataRecAgent(client=client, conn=db_conn, agent_coding_rules=agent_coding_rules) - else: - rec_agent = PythonDataRecAgent( - client=client, - exec_python_in_subprocess=exec_python_in_subprocess, - agent_coding_rules=agent_coding_rules - ) + if not session_id: + yield { + "iteration": iteration, + "type": "data_transformation", + "content": {}, + "status": "error", + "error_message": "Session ID required for exploration" + } + return - completed_steps = [] - current_question = initial_plan[0] if len(initial_plan) > 0 else "Let's explore something interesting." - current_plan = initial_plan[1:] + workspace = Workspace(session_id) - # Collect exploration plans at each step - exploration_plan_list = [] - - # Track initial plan if provided - if len(initial_plan) > 1: - exploration_plan_list.append({ - "ref_tables": [{"name": table['name'], "rows": table['rows'][:5] if 'rows' in table else []} for table in input_tables], - "plan": initial_plan[1:] - }) + # Determine temp tables by checking which input tables don't exist in the workspace + existing_tables = set(workspace.list_tables()) + temp_data = [table for table in input_tables if table.get('name') not in existing_tables] + + with WorkspaceWithTempData(workspace, temp_data) as workspace: + exploration_agent = ExplorationAgent(client, workspace=workspace, agent_exploration_rules=agent_exploration_rules) + rec_agent = DataRecAgent(client=client, workspace=workspace, agent_coding_rules=agent_coding_rules) - # Main exploration loop - while iteration < max_iterations + 1: - iteration += 1 + completed_steps = [] + current_question = initial_plan[0] if len(initial_plan) > 0 else "Let's explore something interesting." 
+ current_plan = initial_plan[1:] - # Step 1: Use rec agent to transform data based on current question - logger.info(f"Iteration {iteration}: Using rec agent for question: {current_question}") + # Collect exploration plans at each step + exploration_plan_list = [] - attempt = 0 - if previous_transformation_dialog: + # Track initial plan if provided + if len(initial_plan) > 1: + exploration_plan_list.append({ + "ref_tables": [{"name": table['name'], "rows": table['rows'][:5] if 'rows' in table else []} for table in input_tables], + "plan": initial_plan[1:] + }) - if isinstance(previous_transformation_data, dict) and 'rows' in previous_transformation_data: - latest_data_sample = previous_transformation_data['rows'] - else: - latest_data_sample = [] # Use empty list as fallback + # Main exploration loop + while iteration < max_iterations + 1: + iteration += 1 - transformation_results = rec_agent.followup( - input_tables=input_tables, - new_instruction=current_question, - latest_data_sample=latest_data_sample, - dialog=previous_transformation_dialog - ) - else: - transformation_results = rec_agent.run( - input_tables=input_tables, - description=current_question - ) + # Step 1: Use rec agent to transform data based on current question + logger.info(f"Iteration {iteration}: Using rec agent for question: {current_question}") + + attempt = 0 + if previous_transformation_dialog: + + if isinstance(previous_transformation_data, dict) and 'rows' in previous_transformation_data: + latest_data_sample = previous_transformation_data['rows'] + else: + latest_data_sample = [] # Use empty list as fallback + + transformation_results = rec_agent.followup( + input_tables=input_tables, + new_instruction=current_question, + latest_data_sample=latest_data_sample, + dialog=previous_transformation_dialog + ) + else: + transformation_results = rec_agent.run( + input_tables=input_tables, + description=current_question + ) - # give one attempt to fix potential errors - while (not transformation_results or transformation_results[0]['status'] != 'ok'): + # give one attempt to fix potential errors + while (not transformation_results or transformation_results[0]['status'] != 'ok'): - if attempt >= max_repair_attempts or not transformation_results: + if attempt >= max_repair_attempts or not transformation_results: + yield { + "iteration": iteration, + "type": "data_transformation", + "content": {"question": current_question}, + "status": "error", + "error_message": "data transformation failed" + } + break + + attempt += 1 + error_msg = transformation_results[0]['content'] + dialog = transformation_results[0]['dialog'] + + new_instruction = f"We run into the following problem executing the code, please fix it:\n\n{error_msg}\n\nPlease think step by step, reflect why the error happens and fix the code so that no more errors would occur." 
+ transformation_results = rec_agent.followup( + input_tables=input_tables, + new_instruction=new_instruction, + latest_data_sample=[], + dialog=dialog + ) + + # if the transformation results is not ok, yield an error and break + if transformation_results[0]['status'] != 'ok': yield { "iteration": iteration, "type": "data_transformation", - "content": {"question": current_question}, + "content": {}, "status": "error", - "error_message": "data transformation failed" + "error_message": transformation_results[0]['content'] } break - attempt += 1 - error_msg = transformation_results[0]['content'] - dialog = transformation_results[0]['dialog'] - - new_instruction = f"We run into the following problem executing the code, please fix it:\n\n{error_msg}\n\nPlease think step by step, reflect why the error happens and fix the code so that no more errors would occur." - transformation_results = rec_agent.followup( - input_tables=input_tables, - new_instruction=new_instruction, - latest_data_sample=[], - dialog=dialog - ) + # Extract transformation result + transform_result = transformation_results[0] + transformed_data = transform_result['content'] + refined_goal = transform_result.get('refined_goal', {}) + code = transform_result.get('code', '') + previous_transformation_dialog = transform_result.get('dialog', []) + previous_transformation_data = transformed_data - # if the transformation results is not ok, yield an error and break - if transformation_results[0]['status'] != 'ok': yield { "iteration": iteration, "type": "data_transformation", - "content": {}, - "status": "error", - "error_message": transformation_results[0]['content'] + "content": { + "question": current_question, + "result": transform_result + }, + "status": "success", + "error_message": "" + } + + # Step 2: Create visualization to help generate followup question + chart_type = refined_goal.get('chart_type', 'bar') + chart_encodings = refined_goal.get('chart_encodings', {}) + + chart_spec = create_chart_spec_from_data( + transformed_data, + chart_type, + chart_encodings + ) + current_visualization = spec_to_base64(chart_spec) if chart_spec else None + + # Store this step for exploration analysis + step_data = { + 'question': current_question, + 'code': code, + 'data': {"rows": transformed_data['rows'], "name": transformed_data['virtual']['table_name'] if 'virtual' in transformed_data else None }, + 'visualization': current_visualization } - break + completed_steps.append(step_data) - # Extract transformation result - transform_result = transformation_results[0] - transformed_data = transform_result['content'] - refined_goal = transform_result.get('refined_goal', {}) - code = transform_result.get('code', '') - previous_transformation_dialog = transform_result.get('dialog', []) - previous_transformation_data = transformed_data + # Step 3: Use exploration agent to analyze results and decide next step + logger.info(f"Iteration {iteration}: Using exploration agent to decide next step") + + followup_results = exploration_agent.suggest_followup( + input_tables=input_tables, + completed_steps=completed_steps, + next_steps=current_plan + ) - yield { - "iteration": iteration, - "type": "data_transformation", - "content": { - "question": current_question, - "result": transform_result - }, - "status": "success", - "error_message": "" - } - - # Step 2: Create visualization to help generate followup question - chart_type = refined_goal.get('chart_type', 'bar') - chart_encodings = refined_goal.get('chart_encodings', {}) - - chart_spec = 
create_chart_spec_from_data( - transformed_data, - chart_type, - chart_encodings - ) - current_visualization = spec_to_base64(chart_spec) if chart_spec else None - - # Store this step for exploration analysis - step_data = { - 'question': current_question, - 'code': code, - 'data': {"rows": transformed_data['rows'], "name": transformed_data['virtual']['table_name'] if 'virtual' in transformed_data else None }, - 'visualization': current_visualization - } - completed_steps.append(step_data) + if not followup_results or followup_results[0]['status'] != 'ok': + error_msg = followup_results[0]['content'] if followup_results else "Follow-up planning failed" + yield { + "iteration": iteration, + "type": "planning", + "content": {}, + "status": "error", + "error_message": error_msg + } + break + + # Extract follow-up decision + followup_plan = followup_results[0]['content'] + + # Check if exploration agent decides to present findings + if followup_plan.get('status') in ['present', 'warning']: + yield { + "iteration": iteration, + "type": "completion", + "content": { + "message": followup_plan.get('summary', ''), + "total_steps": len(completed_steps), + "exploration_plan_list": exploration_plan_list + }, + "status": "success" if followup_plan.get('status') == 'present' else "warning", + "error_message": "" + } + break - # Step 3: Use exploration agent to analyze results and decide next step - logger.info(f"Iteration {iteration}: Using exploration agent to decide next step") - - followup_results = exploration_agent.suggest_followup( - input_tables=input_tables, - completed_steps=completed_steps, - next_steps=current_plan - ) + current_plan = followup_plan.get('next_steps', []) + current_question = current_plan.pop(0) + + # Collect updated plan from exploration agent + # Get table from last completed step (this is the table used for generating the new plan) + if completed_steps: + last_step_data = completed_steps[-1]['data'] + last_step_table = [{ + "name": last_step_data.get('name'), + "rows": last_step_data.get('rows', [])[:5] + }] + else: + last_step_table = [{"name": table['name'], "rows": table['rows'][:5] if 'rows' in table else []} for table in input_tables] + + exploration_plan_list.append({ + "ref_tables": last_step_table, + "plan": current_plan.copy() + }) - if not followup_results or followup_results[0]['status'] != 'ok': - error_msg = followup_results[0]['content'] if followup_results else "Follow-up planning failed" yield { "iteration": iteration, "type": "planning", - "content": {}, - "status": "error", - "error_message": error_msg + "content": { + "message": current_question, + "exploration_steps_count": len(completed_steps) + }, + "status": "success", + "error_message": "" } - break - - # Extract follow-up decision - followup_plan = followup_results[0]['content'] - - # Check if exploration agent decides to present findings - if followup_plan.get('status') in ['present', 'warning']: + + # If we hit max iterations without presenting + if iteration >= max_iterations: yield { "iteration": iteration, "type": "completion", "content": { - "message": followup_plan.get('summary', ''), "total_steps": len(completed_steps), + "reason": "Reached maximum iterations", "exploration_plan_list": exploration_plan_list }, - "status": "success" if followup_plan.get('status') == 'present' else "warning", - "error_message": "" - } - break - - current_plan = followup_plan.get('next_steps', []) - current_question = current_plan.pop(0) - - # Collect updated plan from exploration agent - # Get table from last 
completed step (this is the table used for generating the new plan) - if completed_steps: - last_step_data = completed_steps[-1]['data'] - last_step_table = [{ - "name": last_step_data.get('name'), - "rows": last_step_data.get('rows', [])[:5] - }] - else: - last_step_table = [{"name": table['name'], "rows": table['rows'][:5] if 'rows' in table else []} for table in input_tables] - - exploration_plan_list.append({ - "ref_tables": last_step_table, - "plan": current_plan.copy() - }) - - yield { - "iteration": iteration, - "type": "planning", - "content": { - "message": current_question, - "exploration_steps_count": len(completed_steps) - }, - "status": "success", - "error_message": "" - } - - # Clean up connection if used - if db_conn: - db_conn.close() - - # If we hit max iterations without presenting - if iteration >= max_iterations: - yield { - "iteration": iteration, - "type": "completion", - "content": { - "total_steps": len(completed_steps), - "reason": "Reached maximum iterations", - "exploration_plan_list": exploration_plan_list - }, - "status": "success", - "error_message": "Reached maximum iterations" - } \ No newline at end of file + "status": "success", + "error_message": "Reached maximum iterations" + } \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 49ed802a..34599461 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "data_formulator" version = "0.6" -requires-python = ">=3.9" +requires-python = ">=3.11" authors = [ {name = "Chenglong Wang", email = "chenglong.wang@microsoft.com"}, {name = "Dan Marshall", email = "danmar@microsoft.com"}, @@ -20,23 +20,22 @@ classifiers = [ "Programming Language :: Python" ] -dependencies = [ - "jupyter", - "pandas", - "flask", - "flask-cors", +dependencies = [ + "jupyter", + "pandas", + "flask", + "flask-cors", "flask-limiter", - "openai", - "python-dotenv", + "openai", + "python-dotenv", "vega_datasets", "litellm", "duckdb", - "numpy", - "vl-convert-python", + "numpy", + "vl-convert-python", "backoff", "beautifulsoup4", "scikit-learn", - "azure-identity", "azure-kusto-data", "azure-keyvault-secrets", @@ -48,7 +47,9 @@ dependencies = [ "pymysql", "pyodbc", "pymongo", - "yfinance" + "yfinance", + "connectorx>=0.4.5", + "pyarrow>=23.0.0", ] [project.urls] @@ -62,3 +63,8 @@ include-package-data = true [project.scripts] data_formulator = "data_formulator:run_app" + +[tool.uv] +dev-dependencies = [ + "build", +] diff --git a/requirements.txt b/requirements.txt index 0fe4db15..07f4b553 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,33 +1,3021 @@ -# Core dependencies (always required) -jupyter -pandas -numpy -flask -flask-cors -flask-limiter -openai -python-dotenv -vega_datasets -litellm -duckdb -vl-convert-python -backoff -beautifulsoup4 -scikit-learn -yfinance # for demo stream routes - -# External data loaders (Azure, BigQuery, AWS S3, MySQL, MSSQL) -azure-identity -azure-kusto-data -azure-keyvault-secrets -azure-storage-blob -google-cloud-bigquery -google-auth -db-dtypes -boto3 -pymysql -pyodbc -pymongo - -# Install data_formulator itself in editable mode --e . \ No newline at end of file +# This file was autogenerated by uv via the following command: +# uv export --format requirements-txt --output-file requirements.txt +-e . 
+aiohappyeyeballs==2.6.1 \ + --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ + --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 + # via aiohttp +aiohttp==3.13.3 \ + --hash=sha256:01ad2529d4b5035578f5081606a465f3b814c542882804e2e8cda61adf5c71bf \ + --hash=sha256:042e9e0bcb5fba81886c8b4fbb9a09d6b8a00245fd8d88e4d989c1f96c74164c \ + --hash=sha256:05861afbbec40650d8a07ea324367cb93e9e8cc7762e04dd4405df99fa65159c \ + --hash=sha256:084911a532763e9d3dd95adf78a78f4096cd5f58cdc18e6fdbc1b58417a45423 \ + --hash=sha256:0add0900ff220d1d5c5ebbf99ed88b0c1bbf87aa7e4262300ed1376a6b13414f \ + --hash=sha256:10b47b7ba335d2e9b1239fa571131a87e2d8ec96b333e68b2a305e7a98b0bae2 \ + --hash=sha256:1449ceddcdbcf2e0446957863af03ebaaa03f94c090f945411b61269e2cb5daf \ + --hash=sha256:1cb93e166e6c28716c8c6aeb5f99dfb6d5ccf482d29fe9bf9a794110e6d0ab64 \ + --hash=sha256:2712039939ec963c237286113c68dbad80a82a4281543f3abf766d9d73228998 \ + --hash=sha256:27234ef6d85c914f9efeb77ff616dbf4ad2380be0cda40b4db086ffc7ddd1b7d \ + --hash=sha256:28e027cf2f6b641693a09f631759b4d9ce9165099d2b5d92af9bd4e197690eea \ + --hash=sha256:2b8d8ddba8f95ba17582226f80e2de99c7a7948e66490ef8d947e272a93e9463 \ + --hash=sha256:2be0e9ccf23e8a94f6f0650ce06042cefc6ac703d0d7ab6c7a917289f2539ad4 \ + --hash=sha256:2e41b18a58da1e474a057b3d35248d8320029f61d70a37629535b16a0c8f3767 \ + --hash=sha256:2eb752b102b12a76ca02dff751a801f028b4ffbbc478840b473597fc91a9ed43 \ + --hash=sha256:2fc82186fadc4a8316768d61f3722c230e2c1dcab4200d52d2ebdf2482e47592 \ + --hash=sha256:2fff83cfc93f18f215896e3a190e8e5cb413ce01553901aca925176e7568963a \ + --hash=sha256:34749271508078b261c4abb1767d42b8d0c0cc9449c73a4df494777dc55f0687 \ + --hash=sha256:34bac00a67a812570d4a460447e1e9e06fae622946955f939051e7cc895cfab8 \ + --hash=sha256:37239e9f9a7ea9ac5bf6b92b0260b01f8a22281996da609206a84df860bc1261 \ + --hash=sha256:3b61b7169ababd7802f9568ed96142616a9118dd2be0d1866e920e77ec8fa92a \ + --hash=sha256:3d9908a48eb7416dc1f4524e69f1d32e5d90e3981e4e37eb0aa1cd18f9cfa2a4 \ + --hash=sha256:3dd4dce1c718e38081c8f35f323209d4c1df7d4db4bab1b5c88a6b4d12b74587 \ + --hash=sha256:4021b51936308aeea0367b8f006dc999ca02bc118a0cc78c303f50a2ff6afb91 \ + --hash=sha256:425c126c0dc43861e22cb1c14ba4c8e45d09516d0a3ae0a3f7494b79f5f233a3 \ + --hash=sha256:44531a36aa2264a1860089ffd4dce7baf875ee5a6079d5fb42e261c704ef7344 \ + --hash=sha256:48e377758516d262bde50c2584fc6c578af272559c409eecbdd2bae1601184d6 \ + --hash=sha256:49a03727c1bba9a97d3e93c9f93ca03a57300f484b6e935463099841261195d3 \ + --hash=sha256:568f416a4072fbfae453dcf9a99194bbb8bdeab718e08ee13dfa2ba0e4bebf29 \ + --hash=sha256:5b179331a481cb5529fca8b432d8d3c7001cb217513c94cd72d668d1248688a3 \ + --hash=sha256:5b6073099fb654e0a068ae678b10feff95c5cae95bbfcbfa7af669d361a8aa6b \ + --hash=sha256:5d2d94f1f5fcbe40838ac51a6ab5704a6f9ea42e72ceda48de5e6b898521da51 \ + --hash=sha256:5dff64413671b0d3e7d5918ea490bdccb97a4ad29b3f311ed423200b2203e01c \ + --hash=sha256:5e1d8c8b8f1d91cd08d8f4a3c2b067bfca6ec043d3ff36de0f3a715feeedf926 \ + --hash=sha256:5f8ca7f2bb6ba8348a3614c7918cc4bb73268c5ac2a207576b7afea19d3d9f64 \ + --hash=sha256:642f752c3eb117b105acbd87e2c143de710987e09860d674e068c4c2c441034f \ + --hash=sha256:65d2ccb7eabee90ce0503c17716fc77226be026dcc3e65cce859a30db715025b \ + --hash=sha256:693781c45a4033d31d4187d2436f5ac701e7bbfe5df40d917736108c1cc7436e \ + --hash=sha256:694976222c711d1d00ba131904beb60534f93966562f64440d0c9d41b8cdb440 \ + --hash=sha256:697753042d57f4bf7122cab985bf15d0cef23c770864580f5af4f52023a56bd6 \ + 
--hash=sha256:6de499a1a44e7de70735d0b39f67c8f25eb3d91eb3103be99ca0fa882cdd987d \ + --hash=sha256:6fc0e2337d1a4c3e6acafda6a78a39d4c14caea625124817420abceed36e2415 \ + --hash=sha256:7a4a94eb787e606d0a09404b9c38c113d3b099d508021faa615d70a0131907ce \ + --hash=sha256:7b5e8fe4de30df199155baaf64f2fcd604f4c678ed20910db8e2c66dc4b11603 \ + --hash=sha256:7bfdc049127717581866fa4708791220970ce291c23e28ccf3922c700740fdc0 \ + --hash=sha256:7f9120f7093c2a32d9647abcaf21e6ad275b4fbec5b55969f978b1a97c7c86bf \ + --hash=sha256:8057c98e0c8472d8846b9c79f56766bcc57e3e8ac7bfd510482332366c56c591 \ + --hash=sha256:80dd4c21b0f6237676449c6baaa1039abae86b91636b6c91a7f8e61c87f89540 \ + --hash=sha256:82611aeec80eb144416956ec85b6ca45a64d76429c1ed46ae1b5f86c6e0c9a26 \ + --hash=sha256:8542f41a62bcc58fc7f11cf7c90e0ec324ce44950003feb70640fc2a9092c32a \ + --hash=sha256:87797e645d9d8e222e04160ee32aa06bc5c163e8499f24db719e7852ec23093a \ + --hash=sha256:87b9aab6d6ed88235aa2970294f496ff1a1f9adcd724d800e9b952395a80ffd9 \ + --hash=sha256:90455115e5da1c3c51ab619ac57f877da8fd6d73c05aacd125c5ae9819582aba \ + --hash=sha256:90751b8eed69435bac9ff4e3d2f6b3af1f57e37ecb0fbeee59c0174c9e2d41df \ + --hash=sha256:96d604498a7c782cb15a51c406acaea70d8c027ee6b90c569baa6e7b93073679 \ + --hash=sha256:9ae8dd55c8e6c4257eae3a20fd2c8f41edaea5992ed67156642493b8daf3cecc \ + --hash=sha256:9af5e68ee47d6534d36791bbe9b646d2a7c7deb6fc24d7943628edfbb3581f29 \ + --hash=sha256:9bf9f7a65e7aa20dd764151fb3d616c81088f91f8df39c3893a536e279b4b984 \ + --hash=sha256:9d4c940f02f49483b18b079d1c27ab948721852b281f8b015c058100e9421dd1 \ + --hash=sha256:a19884d2ee70b06d9204b2727a7b9f983d0c684c650254679e716b0b77920632 \ + --hash=sha256:a1e53262fd202e4b40b70c3aff944a8155059beedc8a89bba9dc1f9ef06a1b56 \ + --hash=sha256:a2212ad43c0833a873d0fb3c63fa1bacedd4cf6af2fee62bf4b739ceec3ab239 \ + --hash=sha256:a45530014d7a1e09f4a55f4f43097ba0fd155089372e105e4bff4ca76cb1b168 \ + --hash=sha256:a949eee43d3782f2daae4f4a2819b2cb9b0c5d3b7f7a927067cc84dafdbb9f88 \ + --hash=sha256:add1da70de90a2569c5e15249ff76a631ccacfe198375eead4aadf3b8dc849dc \ + --hash=sha256:b04be762396457bef43f3597c991e192ee7da460a4953d7e647ee4b1c28e7046 \ + --hash=sha256:b0d95340658b9d2f11d9697f59b3814a9d3bb4b7a7c20b131df4bcef464037c0 \ + --hash=sha256:b1a6102b4d3ebc07dad44fbf07b45bb600300f15b552ddf1851b5390202ea2e3 \ + --hash=sha256:b556c85915d8efaed322bf1bdae9486aa0f3f764195a0fb6ee962e5c71ef5ce1 \ + --hash=sha256:b903a4dfee7d347e2d87697d0713be59e0b87925be030c9178c5faa58ea58d5c \ + --hash=sha256:b928f30fe49574253644b1ca44b1b8adbd903aa0da4b9054a6c20fc7f4092a25 \ + --hash=sha256:bb4f7475e359992b580559e008c598091c45b5088f28614e855e42d39c2f1033 \ + --hash=sha256:bbe7d4cecacb439e2e2a8a1a7b935c25b812af7a5fd26503a66dadf428e79ec1 \ + --hash=sha256:c014c7ea7fb775dd015b2d3137378b7be0249a448a1612268b5a90c2d81de04d \ + --hash=sha256:c19b90316ad3b24c69cd78d5c9b4f3aa4497643685901185b65166293d36a00f \ + --hash=sha256:c685f2d80bb67ca8c3837823ad76196b3694b0159d232206d1e461d3d434666f \ + --hash=sha256:d32764c6c9aafb7fb55366a224756387cd50bfa720f32b88e0e6fa45b27dcf29 \ + --hash=sha256:d60ac9663f44168038586cab2157e122e46bdef09e9368b37f2d82d354c23f72 \ + --hash=sha256:e3531d63d3bdfa7e3ac5e9b27b2dd7ec9df3206a98e0b3445fa906f233264c57 \ + --hash=sha256:e636b3c5f61da31a92bf0d91da83e58fdfa96f178ba682f11d24f31944cdd28c \ + --hash=sha256:ea37047c6b367fd4bd632bff8077449b8fa034b69e812a18e0132a00fae6e808 \ + --hash=sha256:f33ed1a2bf1997a36661874b017f5c4b760f41266341af36febaf271d179f6d7 \ + 
--hash=sha256:f76c1e3fe7d7c8afad7ed193f89a292e1999608170dcc9751a7462a87dfd5bc0 \ + --hash=sha256:f9444f105664c4ce47a2a7171a2418bce5b7bae45fb610f4e2c36045d85911d3 \ + --hash=sha256:fc290605db2a917f6e81b0e1e0796469871f5af381ce15c604a3c5c7e51cb730 \ + --hash=sha256:fc353029f176fd2b3ec6cfc71be166aba1936fe5d73dd1992ce289ca6647a9aa + # via litellm +aiosignal==1.4.0 \ + --hash=sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e \ + --hash=sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7 + # via aiohttp +annotated-types==0.7.0 \ + --hash=sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53 \ + --hash=sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89 + # via pydantic +anyio==4.12.1 \ + --hash=sha256:41cfcc3a4c85d3f05c932da7c26d0201ac36f72abd4435ba90d0464a3ffed703 \ + --hash=sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c + # via + # httpx + # jupyter-server + # openai +appnope==0.1.4 ; sys_platform == 'darwin' \ + --hash=sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee \ + --hash=sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c + # via ipykernel +argon2-cffi==25.1.0 \ + --hash=sha256:694ae5cc8a42f4c4e2bf2ca0e64e51e23a040c6a517a85074683d3959e1346c1 \ + --hash=sha256:fdc8b074db390fccb6eb4a3604ae7231f219aa669a2652e0f20e16ba513d5741 + # via jupyter-server +argon2-cffi-bindings==25.1.0 \ + --hash=sha256:1db89609c06afa1a214a69a462ea741cf735b29a57530478c06eb81dd403de99 \ + --hash=sha256:1e021e87faa76ae0d413b619fe2b65ab9a037f24c60a1e6cc43457ae20de6dc6 \ + --hash=sha256:2630b6240b495dfab90aebe159ff784d08ea999aa4b0d17efa734055a07d2f44 \ + --hash=sha256:3c6702abc36bf3ccba3f802b799505def420a1b7039862014a65db3205967f5a \ + --hash=sha256:3d3f05610594151994ca9ccb3c771115bdb4daef161976a266f0dd8aa9996b8f \ + --hash=sha256:473bcb5f82924b1becbb637b63303ec8d10e84c8d241119419897a26116515d2 \ + --hash=sha256:7aef0c91e2c0fbca6fc68e7555aa60ef7008a739cbe045541e438373bc54d2b0 \ + --hash=sha256:84a461d4d84ae1295871329b346a97f68eade8c53b6ed9a7ca2d7467f3c8ff6f \ + --hash=sha256:87c33a52407e4c41f3b70a9c2d3f6056d88b10dad7695be708c5021673f55623 \ + --hash=sha256:8b8efee945193e667a396cbc7b4fb7d357297d6234d30a489905d96caabde56b \ + --hash=sha256:a1c70058c6ab1e352304ac7e3b52554daadacd8d453c1752e547c76e9c99ac44 \ + --hash=sha256:a98cd7d17e9f7ce244c0803cad3c23a7d379c301ba618a5fa76a67d116618b98 \ + --hash=sha256:aecba1723ae35330a008418a91ea6cfcedf6d31e5fbaa056a166462ff066d500 \ + --hash=sha256:b0fdbcf513833809c882823f98dc2f931cf659d9a1429616ac3adebb49f5db94 \ + --hash=sha256:b55aec3565b65f56455eebc9b9f34130440404f27fe21c3b375bf1ea4d8fbae6 \ + --hash=sha256:b957f3e6ea4d55d820e40ff76f450952807013d361a65d7f28acc0acbf29229d \ + --hash=sha256:ba92837e4a9aa6a508c8d2d7883ed5a8f6c308c89a4790e1e447a220deb79a85 \ + --hash=sha256:c4f9665de60b1b0e99bcd6be4f17d90339698ce954cfd8d9cf4f91c995165a92 \ + --hash=sha256:c87b72589133f0346a1cb8d5ecca4b933e3c9b64656c9d175270a000e73b288d \ + --hash=sha256:d3e924cfc503018a714f94a49a149fdc0b644eaead5d1f089330399134fa028a \ + --hash=sha256:e2fd3bfbff3c5d74fef31a722f729bf93500910db650c925c2d6ef879a7e51cb + # via argon2-cffi +arrow==1.4.0 \ + --hash=sha256:749f0769958ebdc79c173ff0b0670d59051a535fa26e8eba02953dc19eb43205 \ + --hash=sha256:ed0cc050e98001b8779e84d461b0098c4ac597e88704a655582b21d116e526d7 + # via isoduration +asttokens==3.0.1 \ + --hash=sha256:15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a \ + 
--hash=sha256:71a4ee5de0bde6a31d64f6b13f2293ac190344478f081c3d1bccfcf5eacb0cb7 + # via stack-data +async-lru==2.1.0 \ + --hash=sha256:9eeb2fecd3fe42cc8a787fc32ead53a3a7158cc43d039c3c55ab3e4e5b2a80ed \ + --hash=sha256:fa12dcf99a42ac1280bc16c634bbaf06883809790f6304d85cdab3f666f33a7e + # via jupyterlab +attrs==25.4.0 \ + --hash=sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11 \ + --hash=sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373 + # via + # aiohttp + # jsonschema + # referencing +azure-core==1.38.0 \ + --hash=sha256:8194d2682245a3e4e3151a667c686464c3786fed7918b394d035bdcd61bb5993 \ + --hash=sha256:ab0c9b2cd71fecb1842d52c965c95285d3cfb38902f6766e4a471f1cd8905335 + # via + # azure-identity + # azure-keyvault-secrets + # azure-kusto-data + # azure-storage-blob +azure-identity==1.25.1 \ + --hash=sha256:87ca8328883de6036443e1c37b40e8dc8fb74898240f61071e09d2e369361456 \ + --hash=sha256:e9edd720af03dff020223cd269fa3a61e8f345ea75443858273bcb44844ab651 + # via + # azure-kusto-data + # data-formulator +azure-keyvault-secrets==4.10.0 \ + --hash=sha256:666fa42892f9cee749563e551a90f060435ab878977c95265173a8246d546a36 \ + --hash=sha256:9dbde256077a4ee1a847646671580692e3f9bea36bcfc189c3cf2b9a94eb38b9 + # via data-formulator +azure-kusto-data==6.0.1 \ + --hash=sha256:1d5e04d273376330b58d6d11b055aeadda748cd1ecee1117fdc1b8329e76cb75 \ + --hash=sha256:8d4e7adbe122ea08d5f0053ec37f294171ff734e8d77f36983a29694485bc347 + # via data-formulator +azure-storage-blob==12.28.0 \ + --hash=sha256:00fb1db28bf6a7b7ecaa48e3b1d5c83bfadacc5a678b77826081304bd87d6461 \ + --hash=sha256:e7d98ea108258d29aa0efbfd591b2e2075fa1722a2fae8699f0b3c9de11eff41 + # via data-formulator +babel==2.18.0 \ + --hash=sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d \ + --hash=sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35 + # via jupyterlab-server +backoff==2.2.1 \ + --hash=sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba \ + --hash=sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8 + # via data-formulator +beautifulsoup4==4.14.3 \ + --hash=sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb \ + --hash=sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86 + # via + # data-formulator + # nbconvert + # yfinance +bleach==6.3.0 \ + --hash=sha256:6f3b91b1c0a02bb9a78b5a454c92506aa0fdf197e1d5e114d2e00c6f64306d22 \ + --hash=sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6 + # via nbconvert +blinker==1.9.0 \ + --hash=sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf \ + --hash=sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc + # via flask +boto3==1.42.39 \ + --hash=sha256:d03f82363314759eff7f84a27b9e6428125f89d8119e4588e8c2c1d79892c956 \ + --hash=sha256:d9d6ce11df309707b490d2f5f785b761cfddfd6d1f665385b78c9d8ed097184b + # via data-formulator +botocore==1.42.39 \ + --hash=sha256:0f00355050821e91a5fe6d932f7bf220f337249b752899e3e4cf6ed54326249e \ + --hash=sha256:9e0d0fed9226449cc26fcf2bbffc0392ac698dd8378e8395ce54f3ec13f81d58 + # via + # boto3 + # s3transfer +build==1.4.0 \ + --hash=sha256:6a07c1b8eb6f2b311b96fcbdbce5dab5fe637ffda0fd83c9cac622e927501596 \ + --hash=sha256:f1b91b925aa322be454f8330c6fb48b465da993d1e7e7e6fa35027ec49f3c936 +certifi==2026.1.4 \ + --hash=sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c \ + --hash=sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120 
+ # via + # curl-cffi + # httpcore + # httpx + # requests +cffi==2.0.0 \ + --hash=sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb \ + --hash=sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b \ + --hash=sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f \ + --hash=sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9 \ + --hash=sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c \ + --hash=sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75 \ + --hash=sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e \ + --hash=sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e \ + --hash=sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25 \ + --hash=sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe \ + --hash=sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b \ + --hash=sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91 \ + --hash=sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592 \ + --hash=sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187 \ + --hash=sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c \ + --hash=sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1 \ + --hash=sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94 \ + --hash=sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba \ + --hash=sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529 \ + --hash=sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca \ + --hash=sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6 \ + --hash=sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743 \ + --hash=sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5 \ + --hash=sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5 \ + --hash=sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4 \ + --hash=sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d \ + --hash=sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b \ + --hash=sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93 \ + --hash=sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205 \ + --hash=sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27 \ + --hash=sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512 \ + --hash=sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d \ + --hash=sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c \ + --hash=sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037 \ + --hash=sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26 \ + --hash=sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c \ + --hash=sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8 \ + --hash=sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414 \ + --hash=sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9 \ + --hash=sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664 \ + --hash=sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9 \ + 
--hash=sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775 \ + --hash=sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc \ + --hash=sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062 \ + --hash=sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe \ + --hash=sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92 \ + --hash=sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5 \ + --hash=sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13 \ + --hash=sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d \ + --hash=sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26 \ + --hash=sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b \ + --hash=sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6 \ + --hash=sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c \ + --hash=sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef \ + --hash=sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5 \ + --hash=sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18 \ + --hash=sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad \ + --hash=sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3 \ + --hash=sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2 \ + --hash=sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5 + # via + # argon2-cffi-bindings + # cryptography + # curl-cffi + # pyzmq +charset-normalizer==3.4.4 \ + --hash=sha256:0a98e6759f854bd25a58a73fa88833fba3b7c491169f86ce1180c948ab3fd394 \ + --hash=sha256:0d3d8f15c07f86e9ff82319b3d9ef6f4bf907608f53fe9d92b28ea9ae3d1fd89 \ + --hash=sha256:11d694519d7f29d6cd09f6ac70028dba10f92f6cdd059096db198c283794ac86 \ + --hash=sha256:21d142cc6c0ec30d2efee5068ca36c128a30b0f2c53c1c07bd78cb6bc1d3be5f \ + --hash=sha256:2437418e20515acec67d86e12bf70056a33abdacb5cb1655042f6538d6b085a8 \ + --hash=sha256:277e970e750505ed74c832b4bf75dac7476262ee2a013f5574dd49075879e161 \ + --hash=sha256:2b7d8f6c26245217bd2ad053761201e9f9680f8ce52f0fcd8d0755aeae5b2152 \ + --hash=sha256:3162d5d8ce1bb98dd51af660f2121c55d0fa541b46dff7bb9b9f86ea1d87de72 \ + --hash=sha256:31fd66405eaf47bb62e8cd575dc621c56c668f27d46a61d975a249930dd5e2a4 \ + --hash=sha256:362d61fd13843997c1c446760ef36f240cf81d3ebf74ac62652aebaf7838561e \ + --hash=sha256:376bec83a63b8021bb5c8ea75e21c4ccb86e7e45ca4eb81146091b56599b80c3 \ + --hash=sha256:47cc91b2f4dd2833fddaedd2893006b0106129d4b94fdb6af1f4ce5a9965577c \ + --hash=sha256:542d2cee80be6f80247095cc36c418f7bddd14f4a6de45af91dfad36d817bba2 \ + --hash=sha256:554af85e960429cf30784dd47447d5125aaa3b99a6f0683589dbd27e2f45da44 \ + --hash=sha256:5833d2c39d8896e4e19b689ffc198f08ea58116bee26dea51e362ecc7cd3ed26 \ + --hash=sha256:5ae497466c7901d54b639cf42d5b8c1b6a4fead55215500d2f486d34db48d016 \ + --hash=sha256:5bd2293095d766545ec1a8f612559f6b40abc0eb18bb2f5d1171872d34036ede \ + --hash=sha256:5bfbb1b9acf3334612667b61bd3002196fe2a1eb4dd74d247e0f2a4d50ec9bbf \ + --hash=sha256:5dbe56a36425d26d6cfb40ce79c314a2e4dd6211d51d6d2191c00bed34f354cc \ + --hash=sha256:5f819d5fe9234f9f82d75bdfa9aef3a3d72c4d24a6e57aeaebba32a704553aa0 \ + --hash=sha256:65e2befcd84bc6f37095f5961e68a6f077bf44946771354a28ad434c2cce0ae1 \ + --hash=sha256:6b39f987ae8ccdf0d2642338faf2abb1862340facc796048b604ef14919e55ed \ + 
--hash=sha256:6e1fcf0720908f200cd21aa4e6750a48ff6ce4afe7ff5a79a90d5ed8a08296f8 \ + --hash=sha256:74018750915ee7ad843a774364e13a3db91682f26142baddf775342c3f5b1133 \ + --hash=sha256:74664978bb272435107de04e36db5a9735e78232b85b77d45cfb38f758efd33e \ + --hash=sha256:74bb723680f9f7a6234dcf67aea57e708ec1fbdf5699fb91dfd6f511b0a320ef \ + --hash=sha256:752944c7ffbfdd10c074dc58ec2d5a8a4cd9493b314d367c14d24c17684ddd14 \ + --hash=sha256:780236ac706e66881f3b7f2f32dfe90507a09e67d1d454c762cf642e6e1586e0 \ + --hash=sha256:799a7a5e4fb2d5898c60b640fd4981d6a25f1c11790935a44ce38c54e985f828 \ + --hash=sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f \ + --hash=sha256:81d5eb2a312700f4ecaa977a8235b634ce853200e828fbadf3a9c50bab278328 \ + --hash=sha256:82004af6c302b5d3ab2cfc4cc5f29db16123b1a8417f2e25f9066f91d4411090 \ + --hash=sha256:840c25fb618a231545cbab0564a799f101b63b9901f2569faecd6b222ac72381 \ + --hash=sha256:8a6562c3700cce886c5be75ade4a5db4214fda19fede41d9792d100288d8f94c \ + --hash=sha256:8af65f14dc14a79b924524b1e7fffe304517b2bff5a58bf64f30b98bbc5079eb \ + --hash=sha256:8ef3c867360f88ac904fd3f5e1f902f13307af9052646963ee08ff4f131adafc \ + --hash=sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a \ + --hash=sha256:99ae2cffebb06e6c22bdc25801d7b30f503cc87dbd283479e7b606f70aff57ec \ + --hash=sha256:9a26f18905b8dd5d685d6d07b0cdf98a79f3c7a918906af7cc143ea2e164c8bc \ + --hash=sha256:9b35f4c90079ff2e2edc5b26c0c77925e5d2d255c42c74fdb70fb49b172726ac \ + --hash=sha256:9f7fcd74d410a36883701fafa2482a6af2ff5ba96b9a620e9e0721e28ead5569 \ + --hash=sha256:a59cb51917aa591b1c4e6a43c132f0cdc3c76dbad6155df4e28ee626cc77a0a3 \ + --hash=sha256:a79cfe37875f822425b89a82333404539ae63dbdddf97f84dcbc3d339aae9525 \ + --hash=sha256:a8a8b89589086a25749f471e6a900d3f662d1d3b6e2e59dcecf787b1cc3a1894 \ + --hash=sha256:ac1c4a689edcc530fc9d9aa11f5774b9e2f33f9a0c6a57864e90908f5208d30a \ + --hash=sha256:af2d8c67d8e573d6de5bc30cdb27e9b95e49115cd9baad5ddbd1a6207aaa82a9 \ + --hash=sha256:b435cba5f4f750aa6c0a0d92c541fb79f69a387c91e61f1795227e4ed9cece14 \ + --hash=sha256:b5b290ccc2a263e8d185130284f8501e3e36c5e02750fc6b6bdeb2e9e96f1e25 \ + --hash=sha256:bc7637e2f80d8530ee4a78e878bce464f70087ce73cf7c1caf142416923b98f1 \ + --hash=sha256:c0463276121fdee9c49b98908b3a89c39be45d86d1dbaa22957e38f6321d4ce3 \ + --hash=sha256:c8ae8a0f02f57a6e61203a31428fa1d677cbe50c93622b4149d5c0f319c1d19e \ + --hash=sha256:ca5862d5b3928c4940729dacc329aa9102900382fea192fc5e52eb69d6093815 \ + --hash=sha256:cb6254dc36b47a990e59e1068afacdcd02958bdcce30bb50cc1700a8b9d624a6 \ + --hash=sha256:d055ec1e26e441f6187acf818b73564e6e6282709e9bcb5b63f5b23068356a15 \ + --hash=sha256:d1f13550535ad8cff21b8d757a3257963e951d96e20ec82ab44bc64aeb62a191 \ + --hash=sha256:d9c7f57c3d666a53421049053eaacdd14bbd0a528e2186fcb2e672effd053bb0 \ + --hash=sha256:d9e45d7faa48ee908174d8fe84854479ef838fc6a705c9315372eacbc2f02897 \ + --hash=sha256:da3326d9e65ef63a817ecbcc0df6e94463713b754fe293eaa03da99befb9a5bd \ + --hash=sha256:de00632ca48df9daf77a2c65a484531649261ec9f25489917f09e455cb09ddb2 \ + --hash=sha256:e1f185f86a6f3403aa2420e815904c67b2f9ebc443f045edd0de921108345794 \ + --hash=sha256:ebf3e58c7ec8a8bed6d66a75d7fb37b55e5015b03ceae72a8e7c74495551e224 \ + --hash=sha256:ecaae4149d99b1c9e7b88bb03e3221956f68fd6d50be2ef061b2381b61d20838 \ + --hash=sha256:eecbc200c7fd5ddb9a7f16c7decb07b566c29fa2161a16cf67b8d068bd21690a \ + --hash=sha256:f1e34719c6ed0b92f418c7c780480b26b5d9c50349e9a9af7d76bf757530350d \ + 
--hash=sha256:f8bf04158c6b607d747e93949aa60618b61312fe647a6369f88ce2ff16043490 \ + --hash=sha256:f9d332f8c2a2fcbffe1378594431458ddbef721c1769d78e2cbc06280d8155f9 + # via requests +click==8.3.1 \ + --hash=sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a \ + --hash=sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6 + # via + # flask + # litellm + # typer-slim +colorama==0.4.6 ; os_name == 'nt' or sys_platform == 'win32' \ + --hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \ + --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6 + # via + # build + # click + # ipython + # tqdm +comm==0.2.3 \ + --hash=sha256:2dc8048c10962d55d7ad693be1e7045d891b7ce8d999c97963a5e3e99c055971 \ + --hash=sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417 + # via + # ipykernel + # ipywidgets +connectorx==0.4.5 \ + --hash=sha256:0737254429e22e5012e1fe6a849112da38abb9b56743b3b8c8a1f902e5270e75 \ + --hash=sha256:0ea5feccc2fb3471fa72c1d920bb4ed17ba1b18aedb89dee5ee6009138e35260 \ + --hash=sha256:2073970532a8e6e2a8a2c0b163497eb8e58216e28fdab6693fcd7e58bfc47bfc \ + --hash=sha256:234af0b6ab4a12b64e3818ebea1eb98cc8b47650280fb40924b43e2f1611acb4 \ + --hash=sha256:25efda2317f40e6536582c3dd4f57a8a31c7e5969d708a674272c05591e6f5a2 \ + --hash=sha256:27539e03408705f318572b163c419572a114fdc9baf4d1e6cd746bb87f573cf2 \ + --hash=sha256:31a65ff4ec8fde7ea7aa2812f2b21e7a512a3216b1b22ca1b02d3975b0bf1e75 \ + --hash=sha256:3863bc71677d6314b60cb1e1489a650114d37d8d9f58f2df038cae4a82d2ffc5 \ + --hash=sha256:38ad8a032fddf25c36c6911d857fbe54220fe28439f02a4beb273b29bdef1eb8 \ + --hash=sha256:3ddfe372065b974365bff3b383e39c29cad468c0e7556543dd23753446c441ed \ + --hash=sha256:3fa0811081c84befde6d3aa661ecb17b95be9e3851e20009fd27d0e1b925ceb9 \ + --hash=sha256:3fd7788294417cbbb3811f8942e4fe3b4c190b80627a3c706ceae6c321824bcf \ + --hash=sha256:50c20558beff2719be34ff325213526c1700c3a20743e9e0ba592774ebc9cc92 \ + --hash=sha256:ab1d62a26350055c5e901daa4d6dddb75b11addb923797158c809dffc4f0ac9e \ + --hash=sha256:c68cc9c6bff737d3c9fb8735b27ecc8474238ef640abb701ee0ab213c6c95f8c \ + --hash=sha256:cc01ca122f649e62707f49f7220ba1ae67961b260e2dcff9e8647ea9915a01cf \ + --hash=sha256:e605b5eca75fe63117e5fb93f94e940ede0513340671631da35bdb5a035f8163 \ + --hash=sha256:f139bbfa34840b89d0a5ec760026a9268c18c63fb739568ecbc77660d3e4fc1f \ + --hash=sha256:f5d4754069644a712bd3105345e4f7c680420c5bb1d1264070cda058c7f07fb3 \ + --hash=sha256:ff2f4236a0fc14cd724b03df1f11c03b714442f4381575465f7d0f4f91135766 + # via data-formulator +cryptography==46.0.4 \ + --hash=sha256:01df4f50f314fbe7009f54046e908d1754f19d0c6d3070df1e6268c5a4af09fa \ + --hash=sha256:0563655cb3c6d05fb2afe693340bc050c30f9f34e15763361cf08e94749401fc \ + --hash=sha256:078e5f06bd2fa5aea5a324f2a09f914b1484f1d0c2a4d6a8a28c74e72f65f2da \ + --hash=sha256:0a9ad24359fee86f131836a9ac3bffc9329e956624a2d379b613f8f8abaf5255 \ + --hash=sha256:2067461c80271f422ee7bdbe79b9b4be54a5162e90345f86a23445a0cf3fd8a2 \ + --hash=sha256:281526e865ed4166009e235afadf3a4c4cba6056f99336a99efba65336fd5485 \ + --hash=sha256:2d08bc22efd73e8854b0b7caff402d735b354862f1145d7be3b9c0f740fef6a0 \ + --hash=sha256:3c268a3490df22270955966ba236d6bc4a8f9b6e4ffddb78aac535f1a5ea471d \ + --hash=sha256:3d425eacbc9aceafd2cb429e42f4e5d5633c6f873f5e567077043ef1b9bbf616 \ + --hash=sha256:44cc0675b27cadb71bdbb96099cca1fa051cd11d2ade09e5cd3a2edb929ed947 \ + --hash=sha256:47bcd19517e6389132f76e2d5303ded6cf3f78903da2158a671be8de024f4cd0 \ + 
--hash=sha256:485e2b65d25ec0d901bca7bcae0f53b00133bf3173916d8e421f6fddde103908 \ + --hash=sha256:5aa3e463596b0087b3da0dbe2b2487e9fc261d25da85754e30e3b40637d61f81 \ + --hash=sha256:5f14fba5bf6f4390d7ff8f086c566454bff0411f6d8aa7af79c88b6f9267aecc \ + --hash=sha256:62217ba44bf81b30abaeda1488686a04a702a261e26f87db51ff61d9d3510abd \ + --hash=sha256:6225d3ebe26a55dbc8ead5ad1265c0403552a63336499564675b29eb3184c09b \ + --hash=sha256:6bb5157bf6a350e5b28aee23beb2d84ae6f5be390b2f8ee7ea179cda077e1019 \ + --hash=sha256:728fedc529efc1439eb6107b677f7f7558adab4553ef8669f0d02d42d7b959a7 \ + --hash=sha256:766330cce7416c92b5e90c3bb71b1b79521760cdcfc3a6a1a182d4c9fab23d2b \ + --hash=sha256:812815182f6a0c1d49a37893a303b44eaac827d7f0d582cecfc81b6427f22973 \ + --hash=sha256:829c2b12bbc5428ab02d6b7f7e9bbfd53e33efd6672d21341f2177470171ad8b \ + --hash=sha256:82a62483daf20b8134f6e92898da70d04d0ef9a75829d732ea1018678185f4f5 \ + --hash=sha256:8a15fb869670efa8f83cbffbc8753c1abf236883225aed74cd179b720ac9ec80 \ + --hash=sha256:8bf75b0259e87fa70bddc0b8b4078b76e7fd512fd9afae6c1193bcf440a4dbef \ + --hash=sha256:91627ebf691d1ea3976a031b61fb7bac1ccd745afa03602275dda443e11c8de0 \ + --hash=sha256:93d8291da8d71024379ab2cb0b5c57915300155ad42e07f76bea6ad838d7e59b \ + --hash=sha256:9b34d8ba84454641a6bf4d6762d15847ecbd85c1316c0a7984e6e4e9f748ec2e \ + --hash=sha256:9b4d17bc7bd7cdd98e3af40b441feaea4c68225e2eb2341026c84511ad246c0c \ + --hash=sha256:9c2da296c8d3415b93e6053f5a728649a87a48ce084a9aaf51d6e46c87c7f2d2 \ + --hash=sha256:a05177ff6296644ef2876fce50518dffb5bcdf903c85250974fc8bc85d54c0af \ + --hash=sha256:a90e43e3ef65e6dcf969dfe3bb40cbf5aef0d523dff95bfa24256be172a845f4 \ + --hash=sha256:a9556ba711f7c23f77b151d5798f3ac44a13455cc68db7697a1096e6d0563cab \ + --hash=sha256:b1de0ebf7587f28f9190b9cb526e901bf448c9e6a99655d2b07fff60e8212a82 \ + --hash=sha256:be8c01a7d5a55f9a47d1888162b76c8f49d62b234d88f0ff91a9fbebe32ffbc3 \ + --hash=sha256:bfd019f60f8abc2ed1b9be4ddc21cfef059c841d86d710bb69909a688cbb8f59 \ + --hash=sha256:c236a44acfb610e70f6b3e1c3ca20ff24459659231ef2f8c48e879e2d32b73da \ + --hash=sha256:c411f16275b0dea722d76544a61d6421e2cc829ad76eec79280dbdc9ddf50061 \ + --hash=sha256:c92010b58a51196a5f41c3795190203ac52edfd5dc3ff99149b4659eba9d2085 \ + --hash=sha256:d5a45ddc256f492ce42a4e35879c5e5528c09cd9ad12420828c972951d8e016b \ + --hash=sha256:daa392191f626d50f1b136c9b4cf08af69ca8279d110ea24f5c2700054d2e263 \ + --hash=sha256:dc1272e25ef673efe72f2096e92ae39dea1a1a450dd44918b15351f72c5a168e \ + --hash=sha256:dce1e4f068f03008da7fa51cc7abc6ddc5e5de3e3d1550334eaf8393982a5829 \ + --hash=sha256:dd5aba870a2c40f87a3af043e0dee7d9eb02d4aff88a797b48f2b43eff8c3ab4 \ + --hash=sha256:de0f5f4ec8711ebc555f54735d4c673fc34b65c44283895f1a08c2b49d2fd99c \ + --hash=sha256:df4a817fa7138dd0c96c8c8c20f04b8aaa1fac3bbf610913dcad8ea82e1bfd3f \ + --hash=sha256:e07ea39c5b048e085f15923511d8121e4a9dc45cee4e3b970ca4f0d338f23095 \ + --hash=sha256:eeeb2e33d8dbcccc34d64651f00a98cb41b2dc69cef866771a5717e6734dfa32 \ + --hash=sha256:fa0900b9ef9c49728887d1576fd8d9e7e3ea872fa9b25ef9b64888adc434e976 \ + --hash=sha256:fdc3daab53b212472f1524d070735b2f0c214239df131903bae1d598016fa822 + # via + # azure-identity + # azure-storage-blob + # google-auth + # msal + # pyjwt +curl-cffi==0.13.0 \ + --hash=sha256:28911b526e8cd4aa0e5e38401bfe6887e8093907272f1f67ca22e6beb2933a51 \ + --hash=sha256:434cadbe8df2f08b2fc2c16dff2779fb40b984af99c06aa700af898e185bb9db \ + --hash=sha256:59afa877a9ae09efa04646a7d068eeea48915a95d9add0a29854e7781679fcd7 \ + 
--hash=sha256:62ecd90a382bd5023750e3606e0aa7cb1a3a8ba41c14270b8e5e149ebf72c5ca \ + --hash=sha256:66a6b75ce971de9af64f1b6812e275f60b88880577bac47ef1fa19694fa21cd3 \ + --hash=sha256:6d433ffcb455ab01dd0d7bde47109083aa38b59863aa183d29c668ae4c96bf8e \ + --hash=sha256:8eb4083371bbb94e9470d782de235fb5268bf43520de020c9e5e6be8f395443f \ + --hash=sha256:b4e0de45ab3b7a835c72bd53640c2347415111b43421b5c7a1a0b18deae2e541 \ + --hash=sha256:d06ed389e45a7ca97b17c275dbedd3d6524560270e675c720e93a2018a766076 \ + --hash=sha256:d438a3b45244e874794bc4081dc1e356d2bb926dcc7021e5a8fef2e2105ef1d8 + # via yfinance +db-dtypes==1.5.0 \ + --hash=sha256:abdbb2e4eb965800ed6f98af0c5c1cafff9063ace09114be2d26a7f046be2c8a \ + --hash=sha256:ad9e94243f53e104bc77dbf9ae44b580d83a770d3694483aba59c9767966daa5 + # via data-formulator +debugpy==1.8.20 \ + --hash=sha256:077a7447589ee9bc1ff0cdf443566d0ecf540ac8aa7333b775ebcb8ce9f4ecad \ + --hash=sha256:1f7650546e0eded1902d0f6af28f787fa1f1dbdbc97ddabaf1cd963a405930cb \ + --hash=sha256:352036a99dd35053b37b7803f748efc456076f929c6a895556932eaf2d23b07f \ + --hash=sha256:4057ac68f892064e5f98209ab582abfee3b543fb55d2e87610ddc133a954d390 \ + --hash=sha256:4ae3135e2089905a916909ef31922b2d733d756f66d87345b3e5e52b7a55f13d \ + --hash=sha256:55bc8701714969f1ab89a6d5f2f3d40c36f91b2cbe2f65d98bf8196f6a6a2c33 \ + --hash=sha256:5be9bed9ae3be00665a06acaa48f8329d2b9632f15fd09f6a9a8c8d9907e54d7 \ + --hash=sha256:5dff4bb27027821fdfcc9e8f87309a28988231165147c31730128b1c983e282a \ + --hash=sha256:773e839380cf459caf73cc533ea45ec2737a5cc184cf1b3b796cd4fd98504fec \ + --hash=sha256:7de0b7dfeedc504421032afba845ae2a7bcc32ddfb07dae2c3ca5442f821c344 \ + --hash=sha256:84562982dd7cf5ebebfdea667ca20a064e096099997b175fe204e86817f64eaf \ + --hash=sha256:88f47850a4284b88bd2bfee1f26132147d5d504e4e86c22485dfa44b97e19b4b \ + --hash=sha256:9c74df62fc064cd5e5eaca1353a3ef5a5d50da5eb8058fcef63106f7bebe6173 \ + --hash=sha256:a1a8f851e7cf171330679ef6997e9c579ef6dd33c9098458bd9986a0f4ca52e3 \ + --hash=sha256:a98eec61135465b062846112e5ecf2eebb855305acc1dfbae43b72903b8ab5be \ + --hash=sha256:da11dea6447b2cadbf8ce2bec59ecea87cc18d2c574980f643f2d2dfe4862393 \ + --hash=sha256:eada6042ad88fa1571b74bd5402ee8b86eded7a8f7b827849761700aff171f1b \ + --hash=sha256:eb506e45943cab2efb7c6eafdd65b842f3ae779f020c82221f55aca9de135ed7 + # via ipykernel +decorator==5.2.1 \ + --hash=sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360 \ + --hash=sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a + # via ipython +defusedxml==0.7.1 \ + --hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \ + --hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 + # via nbconvert +deprecated==1.3.1 \ + --hash=sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f \ + --hash=sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223 + # via limits +distro==1.9.0 \ + --hash=sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed \ + --hash=sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2 + # via openai +dnspython==2.8.0 \ + --hash=sha256:01d9bbc4a2d76bf0db7c1f729812ded6d912bd318d3b1cf81d30c0f845dbf3af \ + --hash=sha256:181d3c6996452cb1189c4046c61599b84a5a86e099562ffde77d26984ff26d0f + # via pymongo +duckdb==1.4.4 \ + --hash=sha256:0509b39ea7af8cff0198a99d206dca753c62844adab54e545984c2e2c1381616 \ + --hash=sha256:0d636ceda422e7babd5e2f7275f6a0d1a3405e6a01873f00d38b72118d30c10b \ + 
--hash=sha256:1af6e76fe8bd24875dc56dd8e38300d64dc708cd2e772f67b9fbc635cc3066a3 \ + --hash=sha256:1f8d55843cc940e36261689054f7dfb6ce35b1f5b0953b0d355b6adb654b0d52 \ + --hash=sha256:25874f8b1355e96178079e37312c3ba6d61a2354f51319dae860cf21335c3a20 \ + --hash=sha256:337f8b24e89bc2e12dadcfe87b4eb1c00fd920f68ab07bc9b70960d6523b8bc3 \ + --hash=sha256:452c5b5d6c349dc5d1154eb2062ee547296fcbd0c20e9df1ed00b5e1809089da \ + --hash=sha256:47dd4162da6a2be59a0aef640eb08d6360df1cf83c317dcc127836daaf3b7f7c \ + --hash=sha256:4c25d5b0febda02b7944e94fdae95aecf952797afc8cb920f677b46a7c251955 \ + --hash=sha256:50f2eb173c573811b44aba51176da7a4e5c487113982be6a6a1c37337ec5fa57 \ + --hash=sha256:53cd6423136ab44383ec9955aefe7599b3fb3dd1fe006161e6396d8167e0e0d4 \ + --hash=sha256:5536eb952a8aa6ae56469362e344d4e6403cc945a80bc8c5c2ebdd85d85eb64b \ + --hash=sha256:59c8d76016dde854beab844935b1ec31de358d4053e792988108e995b18c08e7 \ + --hash=sha256:5ba684f498d4e924c7e8f30dd157da8da34c8479746c5011b6c0e037e9c60ad2 \ + --hash=sha256:6703dd1bb650025b3771552333d305d62ddd7ff182de121483d4e042ea6e2e00 \ + --hash=sha256:6792ca647216bd5c4ff16396e4591cfa9b4a72e5ad7cdd312cec6d67e8431a7c \ + --hash=sha256:6cb357cfa3403910e79e2eb46c8e445bb1ee2fd62e9e9588c6b999df4256abc1 \ + --hash=sha256:6fb1225a9ea5877421481d59a6c556a9532c32c16c7ae6ca8d127e2b878c9389 \ + --hash=sha256:7df7351328ffb812a4a289732f500d621e7de9942a3a2c9b6d4afcf4c0e72526 \ + --hash=sha256:8bba52fd2acb67668a4615ee17ee51814124223de836d9e2fdcbc4c9021b3d3c \ + --hash=sha256:8e5c2d8a0452df55e092959c0bfc8ab8897ac3ea0f754cb3b0ab3e165cd79aff \ + --hash=sha256:b297eff642503fd435a9de5a9cb7db4eccb6f61d61a55b30d2636023f149855f \ + --hash=sha256:bf138201f56e5d6fc276a25138341b3523e2f84733613fc43f02c54465619a95 \ + --hash=sha256:c65d15c440c31e06baaebfd2c06d71ce877e132779d309f1edf0a85d23c07e92 \ + --hash=sha256:d0440f59e0cd9936a9ebfcf7a13312eda480c79214ffed3878d75947fc3b7d6d \ + --hash=sha256:d525de5f282b03aa8be6db86b1abffdceae5f1055113a03d5b50cd2fb8cf2ef8 \ + --hash=sha256:ddcfd9c6ff234da603a1edd5fd8ae6107f4d042f74951b65f91bc5e2643856b3 \ + --hash=sha256:f28a18cc790217e5b347bb91b2cab27aafc557c58d3d8382e04b4fe55d0c3f66 \ + --hash=sha256:fb94de6d023de9d79b7edc1ae07ee1d0b4f5fa8a9dcec799650b5befdf7aafec + # via data-formulator +executing==2.2.1 \ + --hash=sha256:3632cc370565f6648cc328b32435bd120a1e4ebb20c77e3fdde9a13cd1e533c4 \ + --hash=sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017 + # via stack-data +fastjsonschema==2.21.2 \ + --hash=sha256:1c797122d0a86c5cace2e54bf4e819c36223b552017172f32c5c024a6b77e463 \ + --hash=sha256:b1eb43748041c880796cd077f1a07c3d94e93ae84bba5ed36800a33554ae05de + # via nbformat +fastuuid==0.14.0 \ + --hash=sha256:05a8dde1f395e0c9b4be515b7a521403d1e8349443e7641761af07c7ad1624b1 \ + --hash=sha256:09098762aad4f8da3a888eb9ae01c84430c907a297b97166b8abc07b640f2995 \ + --hash=sha256:09378a05020e3e4883dfdab438926f31fea15fd17604908f3d39cbeb22a0b4dc \ + --hash=sha256:0c9ec605ace243b6dbe3bd27ebdd5d33b00d8d1d3f580b39fdd15cd96fd71796 \ + --hash=sha256:0df14e92e7ad3276327631c9e7cec09e32572ce82089c55cb1bb8df71cf394ed \ + --hash=sha256:12ac85024637586a5b69645e7ed986f7535106ed3013640a393a03e461740cb7 \ + --hash=sha256:1383fff584fa249b16329a059c68ad45d030d5a4b70fb7c73a08d98fd53bcdab \ + --hash=sha256:178947fc2f995b38497a74172adee64fdeb8b7ec18f2a5934d037641ba265d26 \ + --hash=sha256:1bf539a7a95f35b419f9ad105d5a8a35036df35fdafae48fb2fd2e5f318f0d75 \ + --hash=sha256:1ca61b592120cf314cfd66e662a5b54a578c5a15b26305e1b8b618a6f22df714 \ + 
--hash=sha256:1e3cc56742f76cd25ecb98e4b82a25f978ccffba02e4bdce8aba857b6d85d87b \ + --hash=sha256:1e690d48f923c253f28151b3a6b4e335f2b06bf669c68a02665bc150b7839e94 \ + --hash=sha256:2b29e23c97e77c3a9514d70ce343571e469098ac7f5a269320a0f0b3e193ab36 \ + --hash=sha256:2fb3c0d7fef6674bbeacdd6dbd386924a7b60b26de849266d1ff6602937675c8 \ + --hash=sha256:33e678459cf4addaedd9936bbb038e35b3f6b2061330fd8f2f6a1d80414c0f87 \ + --hash=sha256:3acdf655684cc09e60fb7e4cf524e8f42ea760031945aa8086c7eae2eeeabeb8 \ + --hash=sha256:73946cb950c8caf65127d4e9a325e2b6be0442a224fd51ba3b6ac44e1912ce34 \ + --hash=sha256:77a09cb7427e7af74c594e409f7731a0cf887221de2f698e1ca0ebf0f3139021 \ + --hash=sha256:77e94728324b63660ebf8adb27055e92d2e4611645bf12ed9d88d30486471d0a \ + --hash=sha256:808527f2407f58a76c916d6aa15d58692a4a019fdf8d4c32ac7ff303b7d7af09 \ + --hash=sha256:9579618be6280700ae36ac42c3efd157049fe4dd40ca49b021280481c78c3176 \ + --hash=sha256:9a133bf9cc78fdbd1179cb58a59ad0100aa32d8675508150f3658814aeefeaa4 \ + --hash=sha256:9bd57289daf7b153bfa3e8013446aa144ce5e8c825e9e366d455155ede5ea2dc \ + --hash=sha256:a0809f8cc5731c066c909047f9a314d5f536c871a7a22e815cc4967c110ac9ad \ + --hash=sha256:a6f46790d59ab38c6aa0e35c681c0484b50dc0acf9e2679c005d61e019313c24 \ + --hash=sha256:a8a0dfea3972200f72d4c7df02c8ac70bad1bb4c58d7e0ec1e6f341679073a7f \ + --hash=sha256:aa75b6657ec129d0abded3bec745e6f7ab642e6dba3a5272a68247e85f5f316f \ + --hash=sha256:ab32f74bd56565b186f036e33129da77db8be09178cd2f5206a5d4035fb2a23f \ + --hash=sha256:ab3f5d36e4393e628a4df337c2c039069344db5f4b9d2a3c9cea48284f1dd741 \ + --hash=sha256:ac60fc860cdf3c3f327374db87ab8e064c86566ca8c49d2e30df15eda1b0c2d5 \ + --hash=sha256:b852a870a61cfc26c884af205d502881a2e59cc07076b60ab4a951cc0c94d1ad \ + --hash=sha256:b9a0ca4f03b7e0b01425281ffd44e99d360e15c895f1907ca105854ed85e2057 \ + --hash=sha256:bbb0c4b15d66b435d2538f3827f05e44e2baafcc003dd7d8472dc67807ab8fd8 \ + --hash=sha256:c0a94245afae4d7af8c43b3159d5e3934c53f47140be0be624b96acd672ceb73 \ + --hash=sha256:c7502d6f54cd08024c3ea9b3514e2d6f190feb2f46e6dbcd3747882264bb5f7b \ + --hash=sha256:caa1f14d2102cb8d353096bc6ef6c13b2c81f347e6ab9d6fbd48b9dea41c153d \ + --hash=sha256:cb9a030f609194b679e1660f7e32733b7a0f332d519c5d5a6a0a580991290022 \ + --hash=sha256:cd5a7f648d4365b41dbf0e38fe8da4884e57bed4e77c83598e076ac0c93995e7 \ + --hash=sha256:d23ef06f9e67163be38cece704170486715b177f6baae338110983f99a72c070 \ + --hash=sha256:d9e4332dc4ba054434a9594cbfaf7823b57993d7d8e7267831c3e059857cf397 \ + --hash=sha256:df61342889d0f5e7a32f7284e55ef95103f2110fee433c2ae7c2c0956d76ac8a \ + --hash=sha256:e150eab56c95dc9e3fefc234a0eedb342fac433dacc273cd4d150a5b0871e1fa \ + --hash=sha256:e23fc6a83f112de4be0cc1990e5b127c27663ae43f866353166f87df58e73d06 \ + --hash=sha256:ec27778c6ca3393ef662e2762dba8af13f4ec1aaa32d08d77f71f2a70ae9feb8 \ + --hash=sha256:f54d5b36c56a2d5e1a31e73b950b28a0d83eb0c37b91d10408875a5a29494bad + # via litellm +filelock==3.20.3 \ + --hash=sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1 \ + --hash=sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1 + # via huggingface-hub +flask==3.1.2 \ + --hash=sha256:bf656c15c80190ed628ad08cdfd3aaa35beb087855e2f494910aa3774cc4fd87 \ + --hash=sha256:ca1d8112ec8a6158cc29ea4858963350011b5c846a414cdb7a954aa9e967d03c + # via + # data-formulator + # flask-cors + # flask-limiter +flask-cors==6.0.2 \ + --hash=sha256:6e118f3698249ae33e429760db98ce032a8bf9913638d085ca0f4c5534ad2423 \ + --hash=sha256:e57544d415dfd7da89a9564e1e3a9e515042df76e12130641ca6f3f2f03b699a + # 
via data-formulator +flask-limiter==4.1.1 \ + --hash=sha256:ca11608fc7eec43dcea606964ca07c3bd4ec1ae89043a0f67f717899a4f48106 \ + --hash=sha256:e1ae13e06e6b3e39a4902e7d240b901586b25932c2add7bd5f5eeb4bdc11111b + # via data-formulator +fqdn==1.5.1 \ + --hash=sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f \ + --hash=sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014 + # via jsonschema +frozendict==2.4.7 \ + --hash=sha256:972af65924ea25cf5b4d9326d549e69a9a4918d8a76a9d3a7cd174d98b237550 \ + --hash=sha256:e478fb2a1391a56c8a6e10cc97c4a9002b410ecd1ac28c18d780661762e271bd + # via yfinance +frozenlist==1.8.0 \ + --hash=sha256:0325024fe97f94c41c08872db482cf8ac4800d80e79222c6b0b7b162d5b13686 \ + --hash=sha256:032efa2674356903cd0261c4317a561a6850f3ac864a63fc1583147fb05a79b0 \ + --hash=sha256:03ae967b4e297f58f8c774c7eabcce57fe3c2434817d4385c50661845a058121 \ + --hash=sha256:06be8f67f39c8b1dc671f5d83aaefd3358ae5cdcf8314552c57e7ed3e6475bdd \ + --hash=sha256:073f8bf8becba60aa931eb3bc420b217bb7d5b8f4750e6f8b3be7f3da85d38b7 \ + --hash=sha256:07cdca25a91a4386d2e76ad992916a85038a9b97561bf7a3fd12d5d9ce31870c \ + --hash=sha256:09474e9831bc2b2199fad6da3c14c7b0fbdd377cce9d3d77131be28906cb7d84 \ + --hash=sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d \ + --hash=sha256:0f96534f8bfebc1a394209427d0f8a63d343c9779cda6fc25e8e121b5fd8555b \ + --hash=sha256:102e6314ca4da683dca92e3b1355490fed5f313b768500084fbe6371fddfdb79 \ + --hash=sha256:11847b53d722050808926e785df837353bd4d75f1d494377e59b23594d834967 \ + --hash=sha256:119fb2a1bd47307e899c2fac7f28e85b9a543864df47aa7ec9d3c1b4545f096f \ + --hash=sha256:154e55ec0655291b5dd1b8731c637ecdb50975a2ae70c606d100750a540082f7 \ + --hash=sha256:168c0969a329b416119507ba30b9ea13688fafffac1b7822802537569a1cb0ef \ + --hash=sha256:17c883ab0ab67200b5f964d2b9ed6b00971917d5d8a92df149dc2c9779208ee9 \ + --hash=sha256:1a7fa382a4a223773ed64242dbe1c9c326ec09457e6b8428efb4118c685c3dfd \ + --hash=sha256:21900c48ae04d13d416f0e1e0c4d81f7931f73a9dfa0b7a8746fb2fe7dd970ed \ + --hash=sha256:229bf37d2e4acdaf808fd3f06e854a4a7a3661e871b10dc1f8f1896a3b05f18b \ + --hash=sha256:2552f44204b744fba866e573be4c1f9048d6a324dfe14475103fd51613eb1d1f \ + --hash=sha256:27c6e8077956cf73eadd514be8fb04d77fc946a7fe9f7fe167648b0b9085cc25 \ + --hash=sha256:28bd570e8e189d7f7b001966435f9dac6718324b5be2990ac496cf1ea9ddb7fe \ + --hash=sha256:294e487f9ec720bd8ffcebc99d575f7eff3568a08a253d1ee1a0378754b74143 \ + --hash=sha256:29548f9b5b5e3460ce7378144c3010363d8035cea44bc0bf02d57f5a685e084e \ + --hash=sha256:2c5dcbbc55383e5883246d11fd179782a9d07a986c40f49abe89ddf865913930 \ + --hash=sha256:2dc43a022e555de94c3b68a4ef0b11c4f747d12c024a520c7101709a2144fb37 \ + --hash=sha256:2f05983daecab868a31e1da44462873306d3cbfd76d1f0b5b69c473d21dbb128 \ + --hash=sha256:33139dc858c580ea50e7e60a1b0ea003efa1fd42e6ec7fdbad78fff65fad2fd2 \ + --hash=sha256:33f48f51a446114bc5d251fb2954ab0164d5be02ad3382abcbfe07e2531d650f \ + --hash=sha256:34187385b08f866104f0c0617404c8eb08165ab1272e884abc89c112e9c00746 \ + --hash=sha256:342c97bf697ac5480c0a7ec73cd700ecfa5a8a40ac923bd035484616efecc2df \ + --hash=sha256:3462dd9475af2025c31cc61be6652dfa25cbfb56cbbf52f4ccfe029f38decaf8 \ + --hash=sha256:39ecbc32f1390387d2aa4f5a995e465e9e2f79ba3adcac92d68e3e0afae6657c \ + --hash=sha256:3e0761f4d1a44f1d1a47996511752cf3dcec5bbdd9cc2b4fe595caf97754b7a0 \ + --hash=sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad \ + 
--hash=sha256:3ef2d026f16a2b1866e1d86fc4e1291e1ed8a387b2c333809419a2f8b3a77b82 \ + --hash=sha256:405e8fe955c2280ce66428b3ca55e12b3c4e9c336fb2103a4937e891c69a4a29 \ + --hash=sha256:4314debad13beb564b708b4a496020e5306c7333fa9a3ab90374169a20ffab30 \ + --hash=sha256:433403ae80709741ce34038da08511d4a77062aa924baf411ef73d1146e74faf \ + --hash=sha256:44389d135b3ff43ba8cc89ff7f51f5a0bb6b63d829c8300f79a2fe4fe61bcc62 \ + --hash=sha256:494a5952b1c597ba44e0e78113a7266e656b9794eec897b19ead706bd7074383 \ + --hash=sha256:4970ece02dbc8c3a92fcc5228e36a3e933a01a999f7094ff7c23fbd2beeaa67c \ + --hash=sha256:4e0c11f2cc6717e0a741f84a527c52616140741cd812a50422f83dc31749fb52 \ + --hash=sha256:50066c3997d0091c411a66e710f4e11752251e6d2d73d70d8d5d4c76442a199d \ + --hash=sha256:517279f58009d0b1f2e7c1b130b377a349405da3f7621ed6bfae50b10adf20c1 \ + --hash=sha256:54b2077180eb7f83dd52c40b2750d0a9f175e06a42e3213ce047219de902717a \ + --hash=sha256:5500ef82073f599ac84d888e3a8c1f77ac831183244bfd7f11eaa0289fb30714 \ + --hash=sha256:581ef5194c48035a7de2aefc72ac6539823bb71508189e5de01d60c9dcd5fa65 \ + --hash=sha256:5c1c8e78426e59b3f8005e9b19f6ff46e5845895adbde20ece9218319eca6506 \ + --hash=sha256:5d63a068f978fc69421fb0e6eb91a9603187527c86b7cd3f534a5b77a592b888 \ + --hash=sha256:6da155091429aeba16851ecb10a9104a108bcd32f6c1642867eadaee401c1c41 \ + --hash=sha256:74c51543498289c0c43656701be6b077f4b265868fa7f8a8859c197006efb608 \ + --hash=sha256:776f352e8329135506a1d6bf16ac3f87bc25b28e765949282dcc627af36123aa \ + --hash=sha256:778a11b15673f6f1df23d9586f83c4846c471a8af693a22e066508b77d201ec8 \ + --hash=sha256:78f7b9e5d6f2fdb88cdde9440dc147259b62b9d3b019924def9f6478be254ac1 \ + --hash=sha256:8009897cdef112072f93a0efdce29cd819e717fd2f649ee3016efd3cd885a7ed \ + --hash=sha256:8585e3bb2cdea02fc88ffa245069c36555557ad3609e83be0ec71f54fd4abb52 \ + --hash=sha256:878be833caa6a3821caf85eb39c5ba92d28e85df26d57afb06b35b2efd937231 \ + --hash=sha256:8b7b94a067d1c504ee0b16def57ad5738701e4ba10cec90529f13fa03c833496 \ + --hash=sha256:8d92f1a84bb12d9e56f818b3a746f3efba93c1b63c8387a73dde655e1e42282a \ + --hash=sha256:908bd3f6439f2fef9e85031b59fd4f1297af54415fb60e4254a95f75b3cab3f3 \ + --hash=sha256:92db2bf818d5cc8d9c1f1fc56b897662e24ea5adb36ad1f1d82875bd64e03c24 \ + --hash=sha256:957e7c38f250991e48a9a73e6423db1bb9dd14e722a10f6b8bb8e16a0f55f695 \ + --hash=sha256:96153e77a591c8adc2ee805756c61f59fef4cf4073a9275ee86fe8cba41241f7 \ + --hash=sha256:96f423a119f4777a4a056b66ce11527366a8bb92f54e541ade21f2374433f6d4 \ + --hash=sha256:97260ff46b207a82a7567b581ab4190bd4dfa09f4db8a8b49d1a958f6aa4940e \ + --hash=sha256:974b28cf63cc99dfb2188d8d222bc6843656188164848c4f679e63dae4b0708e \ + --hash=sha256:ac913f8403b36a2c8610bbfd25b8013488533e71e62b4b4adce9c86c8cea905b \ + --hash=sha256:b2a095d45c5d46e5e79ba1e5b9cb787f541a8dee0433836cea4b96a2c439dcd8 \ + --hash=sha256:b3210649ee28062ea6099cfda39e147fa1bc039583c8ee4481cb7811e2448c51 \ + --hash=sha256:b4dec9482a65c54a5044486847b8a66bf10c9cb4926d42927ec4e8fd5db7fed8 \ + --hash=sha256:b6db2185db9be0a04fecf2f241c70b63b1a242e2805be291855078f2b404dd6b \ + --hash=sha256:bac9c42ba2ac65ddc115d930c78d24ab8d4f465fd3fc473cdedfccadb9429806 \ + --hash=sha256:bf0a7e10b077bf5fb9380ad3ae8ce20ef919a6ad93b4552896419ac7e1d8e042 \ + --hash=sha256:c4c800524c9cd9bac5166cd6f55285957fcfc907db323e193f2afcd4d9abd69b \ + --hash=sha256:c8d1634419f39ea6f5c427ea2f90ca85126b54b50837f31497f3bf38266e853d \ + --hash=sha256:c9a63152fe95756b85f31186bddf42e4c02c6321207fd6601a1c89ebac4fe567 \ + 
--hash=sha256:cb89a7f2de3602cfed448095bab3f178399646ab7c61454315089787df07733a \ + --hash=sha256:cba69cb73723c3f329622e34bdbf5ce1f80c21c290ff04256cff1cd3c2036ed2 \ + --hash=sha256:cee686f1f4cadeb2136007ddedd0aaf928ab95216e7691c63e50a8ec066336d0 \ + --hash=sha256:cf253e0e1c3ceb4aaff6df637ce033ff6535fb8c70a764a8f46aafd3d6ab798e \ + --hash=sha256:d1eaff1d00c7751b7c6662e9c5ba6eb2c17a2306ba5e2a37f24ddf3cc953402b \ + --hash=sha256:d3bb933317c52d7ea5004a1c442eef86f426886fba134ef8cf4226ea6ee1821d \ + --hash=sha256:d4d3214a0f8394edfa3e303136d0575eece0745ff2b47bd2cb2e66dd92d4351a \ + --hash=sha256:d6a5df73acd3399d893dafc71663ad22534b5aa4f94e8a2fabfe856c3c1b6a52 \ + --hash=sha256:db1e72ede2d0d7ccb213f218df6a078a9c09a7de257c2fe8fcef16d5925230b1 \ + --hash=sha256:e25ac20a2ef37e91c1b39938b591457666a0fa835c7783c3a8f33ea42870db94 \ + --hash=sha256:eaa352d7047a31d87dafcacbabe89df0aa506abb5b1b85a2fb91bc3faa02d822 \ + --hash=sha256:eab8145831a0d56ec9c4139b6c3e594c7a83c2c8be25d5bcf2d86136a532287a \ + --hash=sha256:ec3cc8c5d4084591b4237c0a272cc4f50a5b03396a47d9caaf76f5d7b38a4f11 \ + --hash=sha256:edee74874ce20a373d62dc28b0b18b93f645633c2943fd90ee9d898550770581 \ + --hash=sha256:eefdba20de0d938cec6a89bd4d70f346a03108a19b9df4248d3cf0d88f1b0f51 \ + --hash=sha256:f21f00a91358803399890ab167098c131ec2ddd5f8f5fd5fe9c9f2c6fcd91e40 \ + --hash=sha256:f4be2e3d8bc8aabd566f8d5b8ba7ecc09249d74ba3c9ed52e54dc23a293f0b92 \ + --hash=sha256:f6292f1de555ffcc675941d65fffffb0a5bcd992905015f85d0592201793e0e5 \ + --hash=sha256:f833670942247a14eafbb675458b4e61c82e002a148f49e68257b79296e865c4 \ + --hash=sha256:fa47e444b8ba08fffd1c18e8cdb9a75db1b6a27f17507522834ad13ed5922b93 \ + --hash=sha256:fb30f9626572a76dfe4293c7194a09fb1fe93ba94c7d4f720dfae3b646b45027 \ + --hash=sha256:fe3c58d2f5db5fbd18c2987cba06d51b0529f52bc3a6cdc33d3f4eab725104bd + # via + # aiohttp + # aiosignal +fsspec==2026.1.0 \ + --hash=sha256:cb76aa913c2285a3b49bdd5fc55b1d7c708d7208126b60f2eb8194fe1b4cbdcc \ + --hash=sha256:e987cb0496a0d81bba3a9d1cee62922fb395e7d4c3b575e57f547953334fe07b + # via huggingface-hub +google-api-core==2.29.0 \ + --hash=sha256:84181be0f8e6b04006df75ddfe728f24489f0af57c96a529ff7cf45bc28797f7 \ + --hash=sha256:d30bc60980daa36e314b5d5a3e5958b0200cb44ca8fa1be2b614e932b75a3ea9 + # via + # google-cloud-bigquery + # google-cloud-core +google-auth==2.48.0 \ + --hash=sha256:2e2a537873d449434252a9632c28bfc268b0adb1e53f9fb62afc5333a975903f \ + --hash=sha256:4f7e706b0cd3208a3d940a19a822c37a476ddba5450156c3e6624a71f7c841ce + # via + # data-formulator + # google-api-core + # google-cloud-bigquery + # google-cloud-core +google-cloud-bigquery==3.40.0 \ + --hash=sha256:0469bcf9e3dad3cab65b67cce98180c8c0aacf3253d47f0f8e976f299b49b5ab \ + --hash=sha256:b3ccb11caf0029f15b29569518f667553fe08f6f1459b959020c83fbbd8f2e68 + # via data-formulator +google-cloud-core==2.5.0 \ + --hash=sha256:67d977b41ae6c7211ee830c7912e41003ea8194bff15ae7d72fd6f51e57acabc \ + --hash=sha256:7c1b7ef5c92311717bd05301aa1a91ffbc565673d3b0b4163a52d8413a186963 + # via google-cloud-bigquery +google-crc32c==1.8.0 \ + --hash=sha256:014a7e68d623e9a4222d663931febc3033c5c7c9730785727de2a81f87d5bab8 \ + --hash=sha256:14f87e04d613dfa218d6135e81b78272c3b904e2a7053b841481b38a7d901411 \ + --hash=sha256:17446feb05abddc187e5441a45971b8394ea4c1b6efd88ab0af393fd9e0a156a \ + --hash=sha256:19b40d637a54cb71e0829179f6cb41835f0fbd9e8eb60552152a8b52c36cbe15 \ + --hash=sha256:2a3dc3318507de089c5384cc74d54318401410f82aa65b2d9cdde9d297aca7cb \ + 
--hash=sha256:3b9776774b24ba76831609ffbabce8cdf6fa2bd5e9df37b594221c7e333a81fa \ + --hash=sha256:3cc0c8912038065eafa603b238abf252e204accab2a704c63b9e14837a854962 \ + --hash=sha256:3ebb04528e83b2634857f43f9bb8ef5b2bbe7f10f140daeb01b58f972d04736b \ + --hash=sha256:450dc98429d3e33ed2926fc99ee81001928d63460f8538f21a5d6060912a8e27 \ + --hash=sha256:4b8286b659c1335172e39563ab0a768b8015e88e08329fa5321f774275fc3113 \ + --hash=sha256:57a50a9035b75643996fbf224d6661e386c7162d1dfdab9bc4ca790947d1007f \ + --hash=sha256:71734788a88f551fbd6a97be9668a0020698e07b2bf5b3aa26a36c10cdfb27b2 \ + --hash=sha256:86cfc00fe45a0ac7359e5214a1704e51a99e757d0272554874f419f79838c5f7 \ + --hash=sha256:87fa445064e7db928226b2e6f0d5304ab4cd0339e664a4e9a25029f384d9bb93 \ + --hash=sha256:89c17d53d75562edfff86679244830599ee0a48efc216200691de8b02ab6b2b8 \ + --hash=sha256:8b3f68782f3cbd1bce027e48768293072813469af6a61a86f6bb4977a4380f21 \ + --hash=sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79 \ + --hash=sha256:b0d1a7afc6e8e4635564ba8aa5c0548e3173e41b6384d7711a9123165f582de2 \ + --hash=sha256:cb5c869c2923d56cb0c8e6bcdd73c009c36ae39b652dbe46a05eb4ef0ad01454 \ + --hash=sha256:d511b3153e7011a27ab6ee6bb3a5404a55b994dc1a7322c0b87b29606d9790e2 \ + --hash=sha256:e6584b12cb06796d285d09e33f63309a09368b9d806a551d8036a4207ea43697 \ + --hash=sha256:f4b51844ef67d6cf2e9425983274da75f18b1597bb2c998e1c0a0e8d46f8f651 \ + --hash=sha256:f639065ea2042d5c034bf258a9f085eaa7af0cd250667c0635a3118e8f92c69c + # via google-resumable-media +google-resumable-media==2.8.0 \ + --hash=sha256:dd14a116af303845a8d932ddae161a26e86cc229645bc98b39f026f9b1717582 \ + --hash=sha256:f1157ed8b46994d60a1bc432544db62352043113684d4e030ee02e77ebe9a1ae + # via google-cloud-bigquery +googleapis-common-protos==1.72.0 \ + --hash=sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038 \ + --hash=sha256:e55a601c1b32b52d7a3e65f43563e2aa61bcd737998ee672ac9b951cd49319f5 + # via + # google-api-core + # grpcio-status +grpcio==1.76.0 \ + --hash=sha256:04bbe1bfe3a68bbfd4e52402ab7d4eb59d72d02647ae2042204326cf4bbad280 \ + --hash=sha256:06c3d6b076e7b593905d04fdba6a0525711b3466f43b3400266f04ff735de0cd \ + --hash=sha256:08caea849a9d3c71a542827d6df9d5a69067b0a1efbea8a855633ff5d9571465 \ + --hash=sha256:1c9b93f79f48b03ada57ea24725d83a30284a012ec27eab2cf7e50a550cbbbcc \ + --hash=sha256:2107b0c024d1b35f4083f11245c0e23846ae64d02f40b2b226684840260ed054 \ + --hash=sha256:2229ae655ec4e8999599469559e97630185fdd53ae1e8997d147b7c9b2b72cba \ + --hash=sha256:25a18e9810fbc7e7f03ec2516addc116a957f8cbb8cbc95ccc80faa072743d03 \ + --hash=sha256:26ef06c73eb53267c2b319f43e6634c7556ea37672029241a056629af27c10e2 \ + --hash=sha256:2e1743fbd7f5fa713a1b0a8ac8ebabf0ec980b5d8809ec358d488e273b9cf02a \ + --hash=sha256:32483fe2aab2c3794101c2a159070584e5db11d0aa091b2c0ea9c4fc43d0d749 \ + --hash=sha256:3e2a27c89eb9ac3d81ec8835e12414d73536c6e620355d65102503064a4ed6eb \ + --hash=sha256:45d59a649a82df5718fd9527ce775fd66d1af35e6d31abdcdc906a49c6822958 \ + --hash=sha256:45e0111e73f43f735d70786557dc38141185072d7ff8dc1829d6a77ac1471468 \ + --hash=sha256:479496325ce554792dba6548fae3df31a72cef7bad71ca2e12b0e58f9b336bfc \ + --hash=sha256:490fa6d203992c47c7b9e4a9d39003a0c2bcc1c9aa3c058730884bbbb0ee9f09 \ + --hash=sha256:4baf3cbe2f0be3289eb68ac8ae771156971848bb8aaff60bad42005539431980 \ + --hash=sha256:522175aba7af9113c48ec10cc471b9b9bd4f6ceb36aeb4544a8e2c80ed9d252d \ + --hash=sha256:5e8571632780e08526f118f74170ad8d50fb0a48c23a746bef2a6ebade3abd6f \ + 
--hash=sha256:615ba64c208aaceb5ec83bfdce7728b80bfeb8be97562944836a7a0a9647d882 \ + --hash=sha256:61f69297cba3950a524f61c7c8ee12e55c486cb5f7db47ff9dcee33da6f0d3ae \ + --hash=sha256:6a15c17af8839b6801d554263c546c69c4d7718ad4321e3166175b37eaacca77 \ + --hash=sha256:747fa73efa9b8b1488a95d0ba1039c8e2dca0f741612d80415b1e1c560febf4e \ + --hash=sha256:7be78388d6da1a25c0d5ec506523db58b18be22d9c37d8d3a32c08be4987bd73 \ + --hash=sha256:81fd9652b37b36f16138611c7e884eb82e0cec137c40d3ef7c3f9b3ed00f6ed8 \ + --hash=sha256:83d57312a58dcfe2a3a0f9d1389b299438909a02db60e2f2ea2ae2d8034909d3 \ + --hash=sha256:8843114c0cfce61b40ad48df65abcfc00d4dba82eae8718fab5352390848c5da \ + --hash=sha256:8eddfb4d203a237da6f3cc8a540dad0517d274b5a1e9e636fd8d2c79b5c1d397 \ + --hash=sha256:922fa70ba549fce362d2e2871ab542082d66e2aaf0c19480ea453905b01f384e \ + --hash=sha256:931091142fd8cc14edccc0845a79248bc155425eee9a98b2db2ea4f00a235a42 \ + --hash=sha256:980a846182ce88c4f2f7e2c22c56aefd515daeb36149d1c897f83cf57999e0b6 \ + --hash=sha256:9f8f757bebaaea112c00dba718fc0d3260052ce714e25804a03f93f5d1c6cc11 \ + --hash=sha256:a8c2cf1209497cf659a667d7dea88985e834c24b7c3b605e6254cbb5076d985c \ + --hash=sha256:b331680e46239e090f5b3cead313cc772f6caa7d0fc8de349337563125361a4a \ + --hash=sha256:c088e7a90b6017307f423efbb9d1ba97a22aa2170876223f9709e9d1de0b5347 \ + --hash=sha256:d388087771c837cdb6515539f43b9d4bf0b0f23593a24054ac16f7a960be16f4 \ + --hash=sha256:dcfe41187da8992c5f40aa8c5ec086fa3672834d2be57a32384c08d5a05b4c00 \ + --hash=sha256:f0e34c2079d47ae9f6188211db9e777c619a21d4faba6977774e8fa43b085e48 \ + --hash=sha256:f92f88e6c033db65a5ae3d97905c8fea9c725b63e28d5a75cb73b49bda5024d8 \ + --hash=sha256:f9f7bd5faab55f47231ad8dba7787866b69f5e93bc306e3915606779bbfb4ba8 \ + --hash=sha256:fd5ef5932f6475c436c4a55e4336ebbe47bd3272be04964a03d316bbf4afbcbc \ + --hash=sha256:ff8a59ea85a1f2191a0ffcc61298c571bc566332f82e5f5be1b83c9d8e668a62 + # via + # google-api-core + # grpcio-status +grpcio-status==1.76.0 \ + --hash=sha256:25fcbfec74c15d1a1cb5da3fab8ee9672852dc16a5a9eeb5baf7d7a9952943cd \ + --hash=sha256:380568794055a8efbbd8871162df92012e0228a5f6dffaf57f2a00c534103b18 + # via google-api-core +h11==0.16.0 \ + --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ + --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 + # via httpcore +hf-xet==1.2.0 ; platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ + --hash=sha256:10bfab528b968c70e062607f663e21e34e2bba349e8038db546646875495179e \ + --hash=sha256:210d577732b519ac6ede149d2f2f34049d44e8622bf14eb3d63bbcd2d4b332dc \ + --hash=sha256:27df617a076420d8845bea087f59303da8be17ed7ec0cd7ee3b9b9f579dff0e4 \ + --hash=sha256:293a7a3787e5c95d7be1857358a9130694a9c6021de3f27fa233f37267174382 \ + --hash=sha256:29c8fc913a529ec0a91867ce3d119ac1aac966e098cf49501800c870328cc090 \ + --hash=sha256:2a212e842647b02eb6a911187dc878e79c4aa0aa397e88dd3b26761676e8c1f8 \ + --hash=sha256:30e06daccb3a7d4c065f34fc26c14c74f4653069bb2b194e7f18f17cbe9939c0 \ + --hash=sha256:3651fd5bfe0281951b988c0facbe726aa5e347b103a675f49a3fa8144c7968fd \ + --hash=sha256:46740d4ac024a7ca9b22bebf77460ff43332868b661186a8e46c227fdae01848 \ + --hash=sha256:4c1428c9ae73ec0939410ec73023c4f842927f39db09b063b9482dac5a3bb737 \ + --hash=sha256:66e159cbfcfbb29f920db2c09ed8b660eb894640d284f102ada929b6e3dc410a \ + --hash=sha256:6de1fc44f58f6dd937956c8d304d8c2dea264c80680bcfa61ca4a15e7b76780f \ + 
--hash=sha256:7d40b18769bb9a8bc82a9ede575ce1a44c75eb80e7375a01d76259089529b5dc \ + --hash=sha256:9c91d5ae931510107f148874e9e2de8a16052b6f1b3ca3c1b12f15ccb491390f \ + --hash=sha256:a55558084c16b09b5ed32ab9ed38421e2d87cf3f1f89815764d1177081b99865 \ + --hash=sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f \ + --hash=sha256:b70218dd548e9840224df5638fdc94bd033552963cfa97f9170829381179c813 \ + --hash=sha256:cd3a6027d59cfb60177c12d6424e31f4b5ff13d8e3a1247b3a584bf8977e6df5 \ + --hash=sha256:ceeefcd1b7aed4956ae8499e2199607765fbd1c60510752003b6cc0b8413b649 \ + --hash=sha256:d06fa97c8562fb3ee7a378dd9b51e343bc5bc8190254202c9771029152f5e08c \ + --hash=sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69 \ + --hash=sha256:f182f264ed2acd566c514e45da9f2119110e48a87a327ca271027904c70c5832 + # via huggingface-hub +httpcore==1.0.9 \ + --hash=sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55 \ + --hash=sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8 + # via httpx +httpx==0.28.1 \ + --hash=sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc \ + --hash=sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad + # via + # huggingface-hub + # jupyterlab + # litellm + # openai +huggingface-hub==1.3.7 \ + --hash=sha256:5f86cd48f27131cdbf2882699cbdf7a67dd4cbe89a81edfdc31211f42e4a5fd1 \ + --hash=sha256:8155ce937038fa3d0cb4347d752708079bc85e6d9eb441afb44c84bcf48620d2 + # via tokenizers +idna==3.11 \ + --hash=sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea \ + --hash=sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902 + # via + # anyio + # httpx + # jsonschema + # requests + # yarl +ijson==3.4.0.post0 \ + --hash=sha256:043f9b7cf9cc744263a78175e769947733710d2412d25180df44b1086b23ebd5 \ + --hash=sha256:04ac9ca54db20f82aeda6379b5f4f6112fdb150d09ebce04affeab98a17b4ed3 \ + --hash=sha256:05807edc0bcbd222dc6ea32a2b897f0c81dc7f12c8580148bc82f6d7f5e7ec7b \ + --hash=sha256:07f20ecd748602ac7f18c617637e53bd73ded7f3b22260bba3abe401a7fc284e \ + --hash=sha256:0b473112e72c0c506da425da3278367b6680f340ecc093084693a1e819d28435 \ + --hash=sha256:103a0838061297d063bca81d724b0958b616f372bd893bbc278320152252c652 \ + --hash=sha256:114ed248166ac06377e87a245a158d6b98019d2bdd3bb93995718e0bd996154f \ + --hash=sha256:11f13b73194ea2a5a8b4a2863f25b0b4624311f10db3a75747b510c4958179b0 \ + --hash=sha256:1709171023ce82651b2f132575c2e6282e47f64ad67bd3260da476418d0e7895 \ + --hash=sha256:17e45262a5ddef39894013fb1548ee7094e444c8389eb1a97f86708b19bea03e \ + --hash=sha256:226447e40ca9340a39ed07d68ea02ee14b52cb4fe649425b256c1f0073531c83 \ + --hash=sha256:254cfb8c124af68327a0e7a49b50bbdacafd87c4690a3d62c96eb01020a685ef \ + --hash=sha256:27aa193d47ffc6bc4e45453896ad98fb089a367e8283b973f1fe5c0198b60b4e \ + --hash=sha256:2c88f0669d45d4b1aa017c9b68d378e7cd15d188dfb6f0209adc78b7f45590a7 \ + --hash=sha256:339d49f6c5d24051c85d9226be96d2d56e633cb8b7d09dd8099de8d8b51a97e2 \ + --hash=sha256:3505dff18bdeb8b171eb28af6df34857e2be80dc01e2e3b624e77215ad58897f \ + --hash=sha256:3ed19b1e4349240773a8ce4a4bfa450892d4a57949c02c515cd6be5a46b7696a \ + --hash=sha256:40007c977e230e04118b27322f25a72ae342a3d61464b2057fcd9b21eeb7427a \ + --hash=sha256:432fb60ffb952926f9438e0539011e2dfcd108f8426ee826ccc6173308c3ff2c \ + --hash=sha256:45a0b1c833ed2620eaf8da958f06ac8351c59e5e470e078400d23814670ed708 \ + --hash=sha256:461acf4320219459dabe5ed90a45cb86c9ba8cc6d6db9dad0d9427d42f57794c \ + 
--hash=sha256:47352563e8c594360bacee2e0753e97025f0861234722d02faace62b1b6d2b2a \ + --hash=sha256:4810546e66128af51fd4a0c9a640e84e8508e9c15c4f247d8a3e3253b20e1465 \ + --hash=sha256:4827d9874a6a81625412c59f7ca979a84d01f7f6bfb3c6d4dc4c46d0382b14e0 \ + --hash=sha256:4e39bfdc36b0b460ef15a06550a6a385c64c81f7ac205ccff39bd45147918912 \ + --hash=sha256:54a0e3e05d9a0c95ecba73d9579f146cf6d5c5874116c849dba2d39a5f30380e \ + --hash=sha256:55f7f656b5986326c978cbb3a9eea9e33f3ef6ecc4535b38f1d452c731da39ab \ + --hash=sha256:56169e298c5a2e7196aaa55da78ddc2415876a74fe6304f81b1eb0d3273346f7 \ + --hash=sha256:56b3089dc28c12492d92cc4896d2be585a89ecae34e25d08c1df88f21815cb50 \ + --hash=sha256:5a48b9486242d1295abe7fd0fbb6308867da5ca3f69b55c77922a93c2b6847aa \ + --hash=sha256:5f0a72b1e3c0f78551670c12b2fdc1bf05f2796254d9c2055ba319bec2216020 \ + --hash=sha256:61ab0b8c5bf707201dc67e02c116f4b6545c4afd7feb2264b989d242d9c4348a \ + --hash=sha256:636b6eca96c6c43c04629c6b37fad0181662eaacf9877c71c698485637f752f9 \ + --hash=sha256:6458bd8e679cdff459a0a5e555b107c3bbacb1f382da3fe0f40e392871eb518d \ + --hash=sha256:659acb2843433e080c271ecedf7d19c71adde1ee5274fc7faa2fec0a793f9f1c \ + --hash=sha256:69718ed41710dfcaa7564b0af42abc05875d4f7aaa24627c808867ef32634bc7 \ + --hash=sha256:7206afcb396aaef66c2b066997b4e9d9042c4b7d777f4d994e9cec6d322c2fe6 \ + --hash=sha256:7809ec8c8f40228edaaa089f33e811dff4c5b8509702652870d3f286c9682e27 \ + --hash=sha256:8311f48db6a33116db5c81682f08b6e2405501a4b4e460193ae69fec3cd1f87a \ + --hash=sha256:83fc738d81c9ea686b452996110b8a6678296c481e0546857db24785bff8da92 \ + --hash=sha256:91c61a3e63e04da648737e6b4abd537df1b46fb8cdf3219b072e790bb3c1a46b \ + --hash=sha256:9aa02dc70bb245670a6ca7fba737b992aeeb4895360980622f7e568dbf23e41e \ + --hash=sha256:9c0886234d1fae15cf4581a430bdba03d79251c1ab3b07e30aa31b13ef28d01c \ + --hash=sha256:a0fedf09c0f6ffa2a99e7e7fd9c5f3caf74e655c1ee015a0797383e99382ebc3 \ + --hash=sha256:a39d5d36067604b26b78de70b8951c90e9272450642661fe531a8f7a6936a7fa \ + --hash=sha256:a5269af16f715855d9864937f9dd5c348ca1ac49cee6a2c7a1b7091c159e874f \ + --hash=sha256:a603d7474bf35e7b3a8e49c8dabfc4751841931301adff3f3318171c4e407f32 \ + --hash=sha256:add9242f886eae844a7410b84aee2bbb8bdc83c624f227cb1fdb2d0476a96cb1 \ + --hash=sha256:b005ce84e82f28b00bf777a464833465dfe3efa43a0a26c77b5ac40723e1a728 \ + --hash=sha256:b200df83c901f5bfa416d069ac71077aa1608f854a4c50df1b84ced560e9c9ec \ + --hash=sha256:b2a81aee91633868f5b40280e2523f7c5392e920a5082f47c5e991e516b483f6 \ + --hash=sha256:b39dbf87071f23a23c8077eea2ae7cfeeca9ff9ffec722dfc8b5f352e4dd729c \ + --hash=sha256:b55e49045f4c8031f3673f56662fd828dc9e8d65bd3b03a9420dda0d370e64ba \ + --hash=sha256:b607a500fca26101be47d2baf7cddb457b819ab60a75ce51ed1092a40da8b2f9 \ + --hash=sha256:b982a3597b0439ce9c8f4cfc929d86c6ed43907908be1e8463a34dc35fe5b258 \ + --hash=sha256:ba3478ff0bb49d7ba88783f491a99b6e3fa929c930ab062d2bb7837e6a38fe88 \ + --hash=sha256:c117321cfa7b749cc1213f9b4c80dc958f0a206df98ec038ae4bcbbdb8463a15 \ + --hash=sha256:c8dd327da225887194fe8b93f2b3c9c256353e14a6b9eefc940ed17fde38f5b8 \ + --hash=sha256:ccddb2894eb7af162ba43b9475ac5825d15d568832f82eb8783036e5d2aebd42 \ + --hash=sha256:cf24a48a1c3ca9d44a04feb59ccefeb9aa52bb49b9cb70ad30518c25cce74bb7 \ + --hash=sha256:cf4a34c2cfe852aee75c89c05b0a4531c49dc0be27eeed221afd6fbf9c3e149c \ + --hash=sha256:d14427d366f95f21adcb97d0ed1f6d30f6fdc04d0aa1e4de839152c50c2b8d65 \ + --hash=sha256:d4d4afec780881edb2a0d2dd40b1cdbe246e630022d5192f266172a0307986a7 \ + 
--hash=sha256:da6a21b88cbf5ecbc53371283988d22c9643aa71ae2873bbeaefd2dea3b6160b \ + --hash=sha256:deda4cfcaafa72ca3fa845350045b1d0fef9364ec9f413241bb46988afbe6ee6 \ + --hash=sha256:e15833dcf6f6d188fdc624a31cd0520c3ba21b6855dc304bc7c1a8aeca02d4ac \ + --hash=sha256:eb5e73028f6e63d27b3d286069fe350ed80a4ccc493b022b590fea4bb086710d \ + --hash=sha256:ec5bb1520cb212ebead7dba048bb9b70552c3440584f83b01b0abc96862e2a09 \ + --hash=sha256:eeb9540f0b1a575cbb5968166706946458f98c16e7accc6f2fe71efa29864241 \ + --hash=sha256:f932969fc1fd4449ca141cf5f47ff357656a154a361f28d9ebca0badc5b02297 \ + --hash=sha256:fe9c84c9b1c8798afa407be1cea1603401d99bfc7c34497e19f4f5e5ddc9b441 \ + --hash=sha256:fecae19b5187d92900c73debb3a979b0b3290a53f85df1f8f3c5ba7d1e9fb9cb \ + --hash=sha256:ffb21203736b08fe27cb30df6a4f802fafb9ef7646c5ff7ef79569b63ea76c57 + # via azure-kusto-data +importlib-metadata==8.7.1 \ + --hash=sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb \ + --hash=sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151 + # via litellm +ipykernel==7.1.0 \ + --hash=sha256:58a3fc88533d5930c3546dc7eac66c6d288acde4f801e2001e65edc5dc9cf0db \ + --hash=sha256:763b5ec6c5b7776f6a8d7ce09b267693b4e5ce75cb50ae696aaefb3c85e1ea4c + # via + # jupyter + # jupyter-console + # jupyterlab +ipython==9.10.0 \ + --hash=sha256:c6ab68cc23bba8c7e18e9b932797014cc61ea7fd6f19de180ab9ba73e65ee58d \ + --hash=sha256:cd9e656be97618a0676d058134cd44e6dc7012c0e5cb36a9ce96a8c904adaf77 + # via + # ipykernel + # ipywidgets + # jupyter-console +ipython-pygments-lexers==1.1.1 \ + --hash=sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81 \ + --hash=sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c + # via ipython +ipywidgets==8.1.8 \ + --hash=sha256:61f969306b95f85fba6b6986b7fe45d73124d1d9e3023a8068710d47a22ea668 \ + --hash=sha256:ecaca67aed704a338f88f67b1181b58f821ab5dc89c1f0f5ef99db43c1c2921e + # via jupyter +isodate==0.7.2 \ + --hash=sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15 \ + --hash=sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6 + # via + # azure-keyvault-secrets + # azure-storage-blob +isoduration==20.11.0 \ + --hash=sha256:ac2f9015137935279eac671f94f89eb00584f940f5dc49462a0c4ee692ba1bd9 \ + --hash=sha256:b2904c2a4228c3d44f409c8ae8e2370eb21a26f7ac2ec5446df141dde3452042 + # via jsonschema +itsdangerous==2.2.0 \ + --hash=sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef \ + --hash=sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173 + # via flask +jedi==0.19.2 \ + --hash=sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0 \ + --hash=sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9 + # via ipython +jinja2==3.1.6 \ + --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ + --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 + # via + # flask + # jupyter-server + # jupyterlab + # jupyterlab-server + # litellm + # nbconvert +jiter==0.13.0 \ + --hash=sha256:04670992b576fa65bd056dbac0c39fe8bd67681c380cb2b48efa885711d9d726 \ + --hash=sha256:0733312953b909688ae3c2d58d043aa040f9f1a6a75693defed7bc2cc4bf2654 \ + --hash=sha256:07b75fe09a4ee8e0c606200622e571e44943f47254f95e2436c8bdcaceb36d7d \ + --hash=sha256:0a2bd69fc1d902e89925fc34d1da51b2128019423d7b339a45d9e99c894e0663 \ + --hash=sha256:0a8d76c7524087272c8ae913f5d9d608bd839154b62c4322ef65723d2e5bb0b8 \ + 
--hash=sha256:0b34c519e17658ed88d5047999a93547f8889f3c1824120c26ad6be5f27b6cf5 \ + --hash=sha256:0bf670e3b1445fc4d31612199f1744f67f889ee1bbae703c4b54dc097e5dd394 \ + --hash=sha256:0c365005b05505a90d1c47856420980d0237adf82f70c4aff7aebd3c1cc143ad \ + --hash=sha256:0e3a5f0cde8ff433b8e88e41aa40131455420fb3649a3c7abdda6145f8cb7202 \ + --hash=sha256:0f0c065695f616a27c920a56ad0d4fc46415ef8b806bf8fc1cacf25002bd24e1 \ + --hash=sha256:1211427574b17b633cfceba5040de8081e5abf114f7a7602f73d2e16f9fdaa59 \ + --hash=sha256:1317fdffd16f5873e46ce27d0e0f7f4f90f0cdf1d86bf6abeaea9f63ca2c401d \ + --hash=sha256:15db60e121e11fe186c0b15236bd5d18381b9ddacdcf4e659feb96fc6c969c92 \ + --hash=sha256:1f4748aad1b4a93c8bdd70f604d0f748cdc0e8744c5547798acfa52f10e79228 \ + --hash=sha256:1f8a55b848cbabf97d861495cd65f1e5c590246fabca8b48e1747c4dfc8f85bf \ + --hash=sha256:24ab43126d5e05f3d53a36a8e11eb2f23304c6c1117844aaaf9a0aa5e40b5018 \ + --hash=sha256:24dc96eca9f84da4131cdf87a95e6ce36765c3b156fc9ae33280873b1c32d5f6 \ + --hash=sha256:2b4972c6df33731aac0742b64fd0d18e0a69bc7d6e03108ce7d40c85fd9e3e6d \ + --hash=sha256:2c26cf47e2cad140fa23b6d58d435a7c0161f5c514284802f25e87fddfe11024 \ + --hash=sha256:2d08c9475d48b92892583df9da592a0e2ac49bcd41fae1fec4f39ba6cf107820 \ + --hash=sha256:3097d665a27bc96fd9bbf7f86178037db139f319f785e4757ce7ccbf390db6c2 \ + --hash=sha256:36ebfbcffafb146d0e6ffb3e74d51e03d9c35ce7c625c8066cdbfc7b953bdc72 \ + --hash=sha256:3b3fb8c2053acaef8580809ac1d1f7481a0a0bdc012fd7f5d8b18fb696a5a089 \ + --hash=sha256:3d744a6061afba08dd7ae375dcde870cffb14429b7477e10f67e9e6d68772a0a \ + --hash=sha256:41f92313d17989102f3cb5dd533a02787cdb99454d494344b0361355da52fcb9 \ + --hash=sha256:45f6f8efb2f3b0603092401dc2df79fa89ccbc027aaba4174d2d4133ed661434 \ + --hash=sha256:47455245307e4debf2ce6c6e65a717550a0244231240dcf3b8f7d64e4c2f22f4 \ + --hash=sha256:5467696f6b827f1116556cb0db620440380434591e93ecee7fd14d1a491b6daa \ + --hash=sha256:57aab48f40be1db920a582b30b116fe2435d184f77f0e4226f546794cedd9cf0 \ + --hash=sha256:597245258e6ad085d064780abfb23a284d418d3e61c57362d9449c6c7317ee2d \ + --hash=sha256:5a1aff1fbdb803a376d4d22a8f63f8e7ccbce0b4890c26cc7af9e501ab339ef0 \ + --hash=sha256:5d9b34ad56761b3bf0fbe8f7e55468704107608512350962d3317ffd7a4382d5 \ + --hash=sha256:632bf7c1d28421c00dd8bbb8a3bac5663e1f57d5cd5ed962bce3c73bf62608e6 \ + --hash=sha256:66aa3e663840152d18cc8ff1e4faad3dd181373491b9cfdc6004b92198d67911 \ + --hash=sha256:682161a67adea11e3aae9038c06c8b4a9a71023228767477d683f69903ebc607 \ + --hash=sha256:6c26a424569a59140fb51160a56df13f438a2b0967365e987889186d5fc2f6f9 \ + --hash=sha256:701a1e77d1e593c1b435315ff625fd071f0998c5f02792038a5ca98899261b7d \ + --hash=sha256:775e10de3849d0631a97c603f996f518159272db00fdda0a780f81752255ee9d \ + --hash=sha256:7772115877c53f62beeb8fd853cab692dbc04374ef623b30f997959a4c0e7e95 \ + --hash=sha256:7b88d649135aca526da172e48083da915ec086b54e8e73a425ba50999468cc08 \ + --hash=sha256:7bb00b6d26db67a05fe3e12c76edc75f32077fb51deed13822dc648fa373bc19 \ + --hash=sha256:7beae3a3d3b5212d3a55d2961db3c292e02e302feb43fce6a3f7a31b90ea6dfe \ + --hash=sha256:879e768938e7b49b5e90b7e3fecc0dbec01b8cb89595861fb39a8967c5220d09 \ + --hash=sha256:87ce0f14c6c08892b610686ae8be350bf368467b6acd5085a5b65441e2bf36d2 \ + --hash=sha256:8d76029f077379374cf0dbc78dbe45b38dec4a2eb78b08b5194ce836b2517afc \ + --hash=sha256:964538479359059a35fb400e769295d4b315ae61e4105396d355a12f7fef09f0 \ + --hash=sha256:9776ebe51713acf438fd9b4405fcd86893ae5d03487546dae7f34993217f8a91 \ + 
--hash=sha256:98fbafb6e88256f4454de33c1f40203d09fc33ed19162a68b3b257b29ca7f663 \ + --hash=sha256:9950290340acc1adaded363edd94baebcee7dabdfa8bee4790794cd5cfad2af6 \ + --hash=sha256:9d01ecc3a8cbdb6f25a37bd500510550b64ddf9f7d64a107d92f3ccb25035d0f \ + --hash=sha256:9da38b4fedde4fb528c740c2564628fbab737166a0e73d6d46cb4bb5463ff411 \ + --hash=sha256:a13b68cd1cd8cc9de8f244ebae18ccb3e4067ad205220ef324c39181e23bbf66 \ + --hash=sha256:ab44b178f7981fcaea7e0a5df20e773c663d06ffda0198f1a524e91b2fde7e59 \ + --hash=sha256:ade8cb6ff5632a62b7dbd4757d8c5573f7a2e9ae285d6b5b841707d8363205ef \ + --hash=sha256:aed40e099404721d7fcaf5b89bd3b4568a4666358bcac7b6b15c09fb6252ab68 \ + --hash=sha256:b1cbfa133241d0e6bdab48dcdc2604e8ba81512f6bbd68ec3e8e1357dd3c316c \ + --hash=sha256:bb7613e1a427cfcb6ea4544f9ac566b93d5bf67e0d48c787eca673ff9c9dff2b \ + --hash=sha256:bdaba7d87e66f26a2c45d8cbadcbfc4bf7884182317907baf39cfe9775bb4d93 \ + --hash=sha256:c05b450d37ba0c9e21c77fef1f205f56bcee2330bddca68d344baebfc55ae0df \ + --hash=sha256:c1e2b199f446d3e82246b4fd9236d7cb502dc2222b18698ba0d986d2fecc6152 \ + --hash=sha256:c3524798e70655ff19aec58c7d05adb1f074fecff62da857ea9be2b908b6d701 \ + --hash=sha256:cc5223ab19fe25e2f0bf2643204ad7318896fe3729bf12fde41b77bfc4fafff0 \ + --hash=sha256:d2a6394e6af690d462310a86b53c47ad75ac8c21dc79f120714ea449979cb1d3 \ + --hash=sha256:db367d8be9fad6e8ebbac4a7578b7af562e506211036cba2c06c3b998603c3d2 \ + --hash=sha256:e104da1db1c0991b3eaed391ccd650ae8d947eab1480c733e5a3fb28d4313e40 \ + --hash=sha256:e404ea551d35438013c64b4f357b0474c7abf9f781c06d44fcaf7a14c69ff9e2 \ + --hash=sha256:e5562a0f0e90a6223b704163ea28e831bd3a9faa3512a711f031611e6b06c939 \ + --hash=sha256:ea026e70a9a28ebbdddcbcf0f1323128a8db66898a06eaad3a4e62d2f554d096 \ + --hash=sha256:ec7e287d7fbd02cb6e22f9a00dd9c9cd504c40a61f2c61e7e1f9690a82726b4c \ + --hash=sha256:ed9bbc30f5d60a3bdf63ae76beb3f9db280d7f195dfcfa61af792d6ce912d159 \ + --hash=sha256:ee9da221dca6e0429c2704c1b3655fe7b025204a71d4d9b73390c759d776d165 \ + --hash=sha256:f22ef501c3f87ede88f23f9b11e608581c14f04db59b6a801f354397ae13739f \ + --hash=sha256:f2839f9c2c7e2dffc1bc5929a510e14ce0a946be9365fd1219e7ef342dae14f4 \ + --hash=sha256:f556aa591c00f2c45eb1b89f68f52441a016034d18b65da60e2d2875bbbf344a \ + --hash=sha256:f7e1d61da332ec412350463891923f960c3073cf1aae93b538f0bb4c8cd46efb \ + --hash=sha256:f917a04240ef31898182f76a332f508f2cc4b57d2b4d7ad2dbfebbfe167eb505 \ + --hash=sha256:fa476ab5dd49f3bf3a168e05f89358c75a17608dbabb080ef65f96b27c19ab10 \ + --hash=sha256:ff732bd0a0e778f43d5009840f20b935e79087b4dc65bd36f1cd0f9b04b8ff7f + # via openai +jmespath==1.1.0 \ + --hash=sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d \ + --hash=sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64 + # via + # boto3 + # botocore +joblib==1.5.3 \ + --hash=sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713 \ + --hash=sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3 + # via scikit-learn +json5==0.13.0 \ + --hash=sha256:9a08e1dd65f6a4d4c6fa82d216cf2477349ec2346a38fd70cc11d2557499fbcc \ + --hash=sha256:b1edf8d487721c0bf64d83c28e91280781f6e21f4a797d3261c7c828d4c165bf + # via jupyterlab-server +jsonpointer==3.0.0 \ + --hash=sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942 \ + --hash=sha256:2b2d729f2091522d61c3b31f82e11870f60b68f43fbc705cb76bf4b832af59ef + # via jsonschema +jsonschema==4.26.0 \ + --hash=sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326 \ + 
--hash=sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce + # via + # jupyter-events + # jupyterlab-server + # litellm + # nbformat +jsonschema-specifications==2025.9.1 \ + --hash=sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe \ + --hash=sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d + # via jsonschema +jupyter==1.1.1 \ + --hash=sha256:7a59533c22af65439b24bbe60373a4e95af8f16ac65a6c00820ad378e3f7cc83 \ + --hash=sha256:d55467bceabdea49d7e3624af7e33d59c37fff53ed3a350e1ac957bed731de7a + # via data-formulator +jupyter-client==8.8.0 \ + --hash=sha256:d556811419a4f2d96c869af34e854e3f059b7cc2d6d01a9cd9c85c267691be3e \ + --hash=sha256:f93a5b99c5e23a507b773d3a1136bd6e16c67883ccdbd9a829b0bbdb98cd7d7a + # via + # ipykernel + # jupyter-console + # jupyter-server + # nbclient +jupyter-console==6.6.3 \ + --hash=sha256:309d33409fcc92ffdad25f0bcdf9a4a9daa61b6f341177570fdac03de5352485 \ + --hash=sha256:566a4bf31c87adbfadf22cdf846e3069b59a71ed5da71d6ba4d8aaad14a53539 + # via jupyter +jupyter-core==5.9.1 \ + --hash=sha256:4d09aaff303b9566c3ce657f580bd089ff5c91f5f89cf7d8846c3cdf465b5508 \ + --hash=sha256:ebf87fdc6073d142e114c72c9e29a9d7ca03fad818c5d300ce2adc1fb0743407 + # via + # ipykernel + # jupyter-client + # jupyter-console + # jupyter-server + # jupyterlab + # nbclient + # nbconvert + # nbformat +jupyter-events==0.12.0 \ + --hash=sha256:6464b2fa5ad10451c3d35fabc75eab39556ae1e2853ad0c0cc31b656731a97fb \ + --hash=sha256:fc3fce98865f6784c9cd0a56a20644fc6098f21c8c33834a8d9fe383c17e554b + # via jupyter-server +jupyter-lsp==2.3.0 \ + --hash=sha256:458aa59339dc868fb784d73364f17dbce8836e906cd75fd471a325cba02e0245 \ + --hash=sha256:e914a3cb2addf48b1c7710914771aaf1819d46b2e5a79b0f917b5478ec93f34f + # via jupyterlab +jupyter-server==2.17.0 \ + --hash=sha256:c38ea898566964c888b4772ae1ed58eca84592e88251d2cfc4d171f81f7e99d5 \ + --hash=sha256:e8cb9c7db4251f51ed307e329b81b72ccf2056ff82d50524debde1ee1870e13f + # via + # jupyter-lsp + # jupyterlab + # jupyterlab-server + # notebook + # notebook-shim +jupyter-server-terminals==0.5.4 \ + --hash=sha256:55be353fc74a80bc7f3b20e6be50a55a61cd525626f578dcb66a5708e2007d14 \ + --hash=sha256:bbda128ed41d0be9020349f9f1f2a4ab9952a73ed5f5ac9f1419794761fb87f5 + # via jupyter-server +jupyterlab==4.5.3 \ + --hash=sha256:4a159f71067cb38e4a82e86a42de8e7e926f384d7f2291964f282282096d27e8 \ + --hash=sha256:63c9f3a48de72ba00df766ad6eed416394f5bb883829f11eeff0872302520ba7 + # via + # jupyter + # notebook +jupyterlab-pygments==0.3.0 \ + --hash=sha256:721aca4d9029252b11cfa9d185e5b5af4d54772bb8072f9b7036f4170054d35d \ + --hash=sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780 + # via nbconvert +jupyterlab-server==2.28.0 \ + --hash=sha256:35baa81898b15f93573e2deca50d11ac0ae407ebb688299d3a5213265033712c \ + --hash=sha256:e4355b148fdcf34d312bbbc80f22467d6d20460e8b8736bf235577dd18506968 + # via + # jupyterlab + # notebook +jupyterlab-widgets==3.0.16 \ + --hash=sha256:423da05071d55cf27a9e602216d35a3a65a3e41cdf9c5d3b643b814ce38c19e0 \ + --hash=sha256:45fa36d9c6422cf2559198e4db481aa243c7a32d9926b500781c830c80f7ecf8 + # via ipywidgets +lark==1.3.1 \ + --hash=sha256:b426a7a6d6d53189d318f2b6236ab5d6429eaf09259f1ca33eb716eed10d2905 \ + --hash=sha256:c629b661023a014c37da873b4ff58a817398d12635d3bbb2c5a03be7fe5d1e12 + # via rfc3987-syntax +limits==5.6.0 \ + --hash=sha256:807fac75755e73912e894fdd61e2838de574c5721876a19f7ab454ae1fffb4b5 \ + --hash=sha256:b585c2104274528536a5b68864ec3835602b3c4a802cd6aa0b07419798394021 + # 
via flask-limiter +litellm==1.81.6 \ + --hash=sha256:573206ba194d49a1691370ba33f781671609ac77c35347f8a0411d852cf6341a \ + --hash=sha256:f02b503dfb7d66d1c939f82e4db21aeec1d6e2ed1fe3f5cd02aaec3f792bc4ae + # via data-formulator +markupsafe==3.0.3 \ + --hash=sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a \ + --hash=sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf \ + --hash=sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19 \ + --hash=sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf \ + --hash=sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175 \ + --hash=sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219 \ + --hash=sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb \ + --hash=sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6 \ + --hash=sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab \ + --hash=sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce \ + --hash=sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218 \ + --hash=sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634 \ + --hash=sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad \ + --hash=sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73 \ + --hash=sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c \ + --hash=sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe \ + --hash=sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa \ + --hash=sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37 \ + --hash=sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f \ + --hash=sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d \ + --hash=sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c \ + --hash=sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97 \ + --hash=sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a \ + --hash=sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19 \ + --hash=sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9 \ + --hash=sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9 \ + --hash=sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc \ + --hash=sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4 \ + --hash=sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354 \ + --hash=sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50 \ + --hash=sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698 \ + --hash=sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9 \ + --hash=sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b \ + --hash=sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc \ + --hash=sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115 \ + --hash=sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485 \ + --hash=sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f \ + --hash=sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12 \ + --hash=sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025 \ + 
--hash=sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009 \ + --hash=sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d \ + --hash=sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a \ + --hash=sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5 \ + --hash=sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f \ + --hash=sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1 \ + --hash=sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287 \ + --hash=sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6 \ + --hash=sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f \ + --hash=sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581 \ + --hash=sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed \ + --hash=sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b \ + --hash=sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026 \ + --hash=sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676 \ + --hash=sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e \ + --hash=sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d \ + --hash=sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d \ + --hash=sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01 \ + --hash=sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795 \ + --hash=sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5 \ + --hash=sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d \ + --hash=sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe \ + --hash=sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda \ + --hash=sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e \ + --hash=sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737 \ + --hash=sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523 \ + --hash=sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a \ + --hash=sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50 + # via + # flask + # jinja2 + # nbconvert + # werkzeug +matplotlib-inline==0.2.1 \ + --hash=sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76 \ + --hash=sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe + # via + # ipykernel + # ipython +mistune==3.2.0 \ + --hash=sha256:708487c8a8cdd99c9d90eb3ed4c3ed961246ff78ac82f03418f5183ab70e398a \ + --hash=sha256:febdc629a3c78616b94393c6580551e0e34cc289987ec6c35ed3f4be42d0eee1 + # via nbconvert +msal==1.34.0 \ + --hash=sha256:76ba83b716ea5a6d75b0279c0ac353a0e05b820ca1f6682c0eb7f45190c43c2f \ + --hash=sha256:f669b1644e4950115da7a176441b0e13ec2975c29528d8b9e81316023676d6e1 + # via + # azure-identity + # azure-kusto-data + # msal-extensions +msal-extensions==1.3.1 \ + --hash=sha256:96d3de4d034504e969ac5e85bae8106c8373b5c6568e4c8fa7af2eca9dbe6bca \ + --hash=sha256:c5b0fd10f65ef62b5f1d62f4251d51cbcaf003fcedae8c91b040a488614be1a4 + # via azure-identity +multidict==6.7.1 \ + --hash=sha256:03ede2a6ffbe8ef936b92cb4529f27f42be7f56afcdab5ab739cd5f27fb1cbf9 \ + --hash=sha256:0458c978acd8e6ea53c81eefaddbbee9c6c5e591f41b3f5e8e194780fe026581 \ + --hash=sha256:0b38ebffd9be37c1170d33bc0f36f4f262e0a09bc1aac1c34c7aa51a7293f0b3 \ + 
--hash=sha256:0b4c48648d7649c9335cf1927a8b87fa692de3dcb15faa676c6a6f1f1aabda43 \ + --hash=sha256:0e161ddf326db5577c3a4cc2d8648f81456e8a20d40415541587a71620d7a7d1 \ + --hash=sha256:10ae39c9cfe6adedcdb764f5e8411d4a92b055e35573a2eaa88d3323289ef93c \ + --hash=sha256:128441d052254f42989ef98b7b6a6ecb1e6f708aa962c7984235316db59f50fa \ + --hash=sha256:12fad252f8b267cc75b66e8fc51b3079604e8d43a75428ffe193cd9e2195dfd6 \ + --hash=sha256:14525a5f61d7d0c94b368a42cff4c9a4e7ba2d52e2672a7b23d84dc86fb02b0c \ + --hash=sha256:17307b22c217b4cf05033dabefe68255a534d637c6c9b0cc8382718f87be4262 \ + --hash=sha256:1b99af4d9eec0b49927b4402bcbb58dea89d3e0db8806a4086117019939ad3dd \ + --hash=sha256:1d540e51b7e8e170174555edecddbd5538105443754539193e3e1061864d444d \ + --hash=sha256:1e3a8bb24342a8201d178c3b4984c26ba81a577c80d4d525727427460a50c22d \ + --hash=sha256:21f830fe223215dffd51f538e78c172ed7c7f60c9b96a2bf05c4848ad49921c3 \ + --hash=sha256:233b398c29d3f1b9676b4b6f75c518a06fcb2ea0b925119fb2c1bc35c05e1601 \ + --hash=sha256:25167cc263257660290fba06b9318d2026e3c910be240a146e1f66dd114af2b0 \ + --hash=sha256:253282d70d67885a15c8a7716f3a73edf2d635793ceda8173b9ecc21f2fb8292 \ + --hash=sha256:273d23f4b40f3dce4d6c8a821c741a86dec62cded82e1175ba3d99be128147ed \ + --hash=sha256:283ddac99f7ac25a4acadbf004cb5ae34480bbeb063520f70ce397b281859362 \ + --hash=sha256:28ca5ce2fd9716631133d0e9a9b9a745ad7f60bac2bccafb56aa380fc0b6c511 \ + --hash=sha256:2b41f5fed0ed563624f1c17630cb9941cf2309d4df00e494b551b5f3e3d67a23 \ + --hash=sha256:2bbd113e0d4af5db41d5ebfe9ccaff89de2120578164f86a5d17d5a576d1e5b2 \ + --hash=sha256:2e1425e2f99ec5bd36c15a01b690a1a2456209c5deed58f95469ffb46039ccbb \ + --hash=sha256:2e2d2ed645ea29f31c4c7ea1552fcfd7cb7ba656e1eafd4134a6620c9f5fdd9e \ + --hash=sha256:3758692429e4e32f1ba0df23219cd0b4fc0a52f476726fff9337d1a57676a582 \ + --hash=sha256:38fb49540705369bab8484db0689d86c0a33a0a9f2c1b197f506b71b4b6c19b0 \ + --hash=sha256:398c1478926eca669f2fd6a5856b6de9c0acf23a2cb59a14c0ba5844fa38077e \ + --hash=sha256:3ab8b9d8b75aef9df299595d5388b14530839f6422333357af1339443cff777d \ + --hash=sha256:3bd231490fa7217cc832528e1cd8752a96f0125ddd2b5749390f7c3ec8721b65 \ + --hash=sha256:3d51ff4785d58d3f6c91bdbffcb5e1f7ddfda557727043aa20d20ec4f65e324a \ + --hash=sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd \ + --hash=sha256:401c5a650f3add2472d1d288c26deebc540f99e2fb83e9525007a74cd2116f1d \ + --hash=sha256:41f2952231456154ee479651491e94118229844dd7226541788be783be2b5108 \ + --hash=sha256:432feb25a1cb67fe82a9680b4d65fb542e4635cb3166cd9c01560651ad60f177 \ + --hash=sha256:439cbebd499f92e9aa6793016a8acaa161dfa749ae86d20960189f5398a19144 \ + --hash=sha256:4885cb0e817aef5d00a2e8451d4665c1808378dc27c2705f1bf4ef8505c0d2e5 \ + --hash=sha256:497394b3239fc6f0e13a78a3e1b61296e72bf1c5f94b4c4eb80b265c37a131cd \ + --hash=sha256:497bde6223c212ba11d462853cfa4f0ae6ef97465033e7dc9940cdb3ab5b48e5 \ + --hash=sha256:4cfb48c6ea66c83bcaaf7e4dfa7ec1b6bbcf751b7db85a328902796dfde4c060 \ + --hash=sha256:538cec1e18c067d0e6103aa9a74f9e832904c957adc260e61cd9d8cf0c3b3d37 \ + --hash=sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56 \ + --hash=sha256:563fe25c678aaba333d5399408f5ec3c383ca5b663e7f774dd179a520b8144df \ + --hash=sha256:57b46b24b5d5ebcc978da4ec23a819a9402b4228b8a90d9c656422b4bdd8a963 \ + --hash=sha256:5a37ca18e360377cfda1d62f5f382ff41f2b8c4ccb329ed974cc2e1643440118 \ + --hash=sha256:5c4b9bfc148f5a91be9244d6264c53035c8a0dcd2f51f1c3c6e30e30ebaa1c84 \ + 
--hash=sha256:5e01429a929600e7dab7b166062d9bb54a5eed752384c7384c968c2afab8f50f \ + --hash=sha256:5fa6a95dfee63893d80a34758cd0e0c118a30b8dcb46372bf75106c591b77889 \ + --hash=sha256:619e5a1ac57986dbfec9f0b301d865dddf763696435e2962f6d9cf2fdff2bb71 \ + --hash=sha256:6aac4f16b472d5b7dc6f66a0d49dd57b0e0902090be16594dc9ebfd3d17c47e7 \ + --hash=sha256:6b10359683bd8806a200fd2909e7c8ca3a7b24ec1d8132e483d58e791d881048 \ + --hash=sha256:6b83cabdc375ffaaa15edd97eb7c0c672ad788e2687004990074d7d6c9b140c8 \ + --hash=sha256:6d3bc717b6fe763b8be3f2bee2701d3c8eb1b2a8ae9f60910f1b2860c82b6c49 \ + --hash=sha256:7a7e590ff876a3eaf1c02a4dfe0724b6e69a9e9de6d8f556816f29c496046e59 \ + --hash=sha256:7eee46ccb30ff48a1e35bb818cc90846c6be2b68240e42a78599166722cea709 \ + --hash=sha256:7ff981b266af91d7b4b3793ca3382e53229088d193a85dfad6f5f4c27fc73e5d \ + --hash=sha256:841189848ba629c3552035a6a7f5bf3b02eb304e9fea7492ca220a8eda6b0e5c \ + --hash=sha256:844c5bca0b5444adb44a623fb0a1310c2f4cd41f402126bb269cd44c9b3f3e1e \ + --hash=sha256:84e61e3af5463c19b67ced91f6c634effb89ef8bfc5ca0267f954451ed4bb6a2 \ + --hash=sha256:8be1802715a8e892c784c0197c2ace276ea52702a0ede98b6310c8f255a5afb3 \ + --hash=sha256:8f333ec9c5eb1b7105e3b84b53141e66ca05a19a605368c55450b6ba208cb9ee \ + --hash=sha256:9004d8386d133b7e6135679424c91b0b854d2d164af6ea3f289f8f2761064609 \ + --hash=sha256:90efbcf47dbe33dcf643a1e400d67d59abeac5db07dc3f27d6bdeae497a2198c \ + --hash=sha256:935434b9853c7c112eee7ac891bc4cb86455aa631269ae35442cb316790c1445 \ + --hash=sha256:93b1818e4a6e0930454f0f2af7dfce69307ca03cdcfb3739bf4d91241967b6c1 \ + --hash=sha256:95922cee9a778659e91db6497596435777bd25ed116701a4c034f8e46544955a \ + --hash=sha256:960c83bf01a95b12b08fd54324a4eb1d5b52c88932b5cba5d6e712bb3ed12eb5 \ + --hash=sha256:97231140a50f5d447d3164f994b86a0bed7cd016e2682f8650d6a9158e14fd31 \ + --hash=sha256:97891f3b1b3ffbded884e2916cacf3c6fc87b66bb0dde46f7357404750559f33 \ + --hash=sha256:98655c737850c064a65e006a3df7c997cd3b220be4ec8fe26215760b9697d4d7 \ + --hash=sha256:98bc624954ec4d2c7cb074b8eefc2b5d0ce7d482e410df446414355d158fe4ca \ + --hash=sha256:9c90fed18bffc0189ba814749fdcc102b536e83a9f738a9003e569acd540a733 \ + --hash=sha256:9d624335fd4fa1c08a53f8b4be7676ebde19cd092b3895c421045ca87895b429 \ + --hash=sha256:9f9af11306994335398293f9958071019e3ab95e9a707dc1383a35613f6abcb9 \ + --hash=sha256:a0543217a6a017692aa6ae5cc39adb75e587af0f3a82288b1492eb73dd6cc2a4 \ + --hash=sha256:a088b62bd733e2ad12c50dad01b7d0166c30287c166e137433d3b410add807a6 \ + --hash=sha256:a407f13c188f804c759fc6a9f88286a565c242a76b27626594c133b82883b5c2 \ + --hash=sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172 \ + --hash=sha256:af959b9beeb66c822380f222f0e0a1889331597e81f1ded7f374f3ecb0fd6c52 \ + --hash=sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7 \ + --hash=sha256:b26684587228afed0d50cf804cc71062cc9c1cdf55051c4c6345d372947b268c \ + --hash=sha256:b4938326284c4f1224178a560987b6cf8b4d38458b113d9b8c1db1a836e640a2 \ + --hash=sha256:b8c990b037d2fff2f4e33d3f21b9b531c5745b33a49a7d6dbe7a177266af44f6 \ + --hash=sha256:ba0a9fb644d0c1a2194cf7ffb043bd852cea63a57f66fbd33959f7dae18517bf \ + --hash=sha256:bdbf9f3b332abd0cdb306e7c2113818ab1e922dc84b8f8fd06ec89ed2a19ab8b \ + --hash=sha256:bfde23ef6ed9db7eaee6c37dcec08524cb43903c60b285b172b6c094711b3961 \ + --hash=sha256:c0abd12629b0af3cf590982c0b413b1e7395cd4ec026f30986818ab95bfaa94a \ + --hash=sha256:c102791b1c4f3ab36ce4101154549105a53dc828f016356b3e3bcae2e3a039d3 \ + 
--hash=sha256:c3a32d23520ee37bf327d1e1a656fec76a2edd5c038bf43eddfa0572ec49c60b \ + --hash=sha256:c76c4bec1538375dad9d452d246ca5368ad6e1c9039dadcf007ae59c70619ea1 \ + --hash=sha256:c9035dde0f916702850ef66460bc4239d89d08df4d02023a5926e7446724212c \ + --hash=sha256:cb2a55f408c3043e42b40cc8eecd575afa27b7e0b956dfb190de0f8499a57a53 \ + --hash=sha256:ce1bbd7d780bb5a0da032e095c951f7014d6b0a205f8318308140f1a6aba159e \ + --hash=sha256:d54ecf9f301853f2c5e802da559604b3e95bb7a3b01a9c295c6ee591b9882de8 \ + --hash=sha256:d62b7f64ffde3b99d06b707a280db04fb3855b55f5a06df387236051d0668f4a \ + --hash=sha256:da62917e6076f512daccfbbde27f46fed1c98fee202f0559adec8ee0de67f71a \ + --hash=sha256:df9f19c28adcb40b6aae30bbaa1478c389efd50c28d541d76760199fc1037c32 \ + --hash=sha256:e1c5988359516095535c4301af38d8a8838534158f649c05dd1050222321bcb3 \ + --hash=sha256:e628ef0e6859ffd8273c69412a2465c4be4a9517d07261b33334b5ec6f3c7489 \ + --hash=sha256:e82d14e3c948952a1a85503817e038cba5905a3352de76b9a465075d072fba23 \ + --hash=sha256:e954b24433c768ce78ab7929e84ccf3422e46deb45a4dc9f93438f8217fa2d34 \ + --hash=sha256:eb0ce7b2a32d09892b3dd6cc44877a0d02a33241fafca5f25c8b6b62374f8b75 \ + --hash=sha256:eb304767bca2bb92fb9c5bd33cedc95baee5bb5f6c88e63706533a1c06ad08c8 \ + --hash=sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d \ + --hash=sha256:f2a0a924d4c2e9afcd7ec64f9de35fcd96915149b2216e1cb2c10a56df483855 \ + --hash=sha256:f33dc2a3abe9249ea5d8360f969ec7f4142e7ac45ee7014d8f8d5acddf178b7b \ + --hash=sha256:f5dd81c45b05518b9aa4da4aa74e1c93d715efa234fd3e8a179df611cc85e5f4 \ + --hash=sha256:f99fe611c312b3c1c0ace793f92464d8cd263cc3b26b5721950d977b006b6c4d \ + --hash=sha256:fa263a02f4f2dd2d11a7b1bb4362aa7cb1049f84a9235d31adf63f30143469a0 \ + --hash=sha256:fc5907494fccf3e7d3f94f95c91d6336b092b5fc83811720fae5e2765890dfba \ + --hash=sha256:fcee94dfbd638784645b066074b338bc9cc155d4b4bffa4adce1615c5a426c19 + # via + # aiohttp + # yarl +multitasking==0.0.12 \ + --hash=sha256:2fba2fa8ed8c4b85e227c5dd7dc41c7d658de3b6f247927316175a57349b84d1 + # via yfinance +nbclient==0.10.4 \ + --hash=sha256:1e54091b16e6da39e297b0ece3e10f6f29f4ac4e8ee515d29f8a7099bd6553c9 \ + --hash=sha256:9162df5a7373d70d606527300a95a975a47c137776cd942e52d9c7e29ff83440 + # via nbconvert +nbconvert==7.17.0 \ + --hash=sha256:1b2696f1b5be12309f6c7d707c24af604b87dfaf6d950794c7b07acab96dda78 \ + --hash=sha256:4f99a63b337b9a23504347afdab24a11faa7d86b405e5c8f9881cd313336d518 + # via + # jupyter + # jupyter-server +nbformat==5.10.4 \ + --hash=sha256:322168b14f937a5d11362988ecac2a4952d3d8e3a2cbeb2319584631226d5b3a \ + --hash=sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b + # via + # jupyter-server + # nbclient + # nbconvert +nest-asyncio==1.6.0 \ + --hash=sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe \ + --hash=sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c + # via ipykernel +notebook==7.5.3 \ + --hash=sha256:393ceb269cf9fdb02a3be607a57d7bd5c2c14604f1818a17dbeb38e04f98cbfa \ + --hash=sha256:c997bfa1a2a9eb58c9bbb7e77d50428befb1033dd6f02c482922e96851d67354 + # via jupyter +notebook-shim==0.2.4 \ + --hash=sha256:411a5be4e9dc882a074ccbcae671eda64cceb068767e9a3419096986560e1cef \ + --hash=sha256:b4b2cfa1b65d98307ca24361f5b30fe785b53c3fd07b7a47e89acb5e6ac638cb + # via + # jupyterlab + # notebook +numpy==2.4.2 \ + --hash=sha256:00ab83c56211a1d7c07c25e3217ea6695e50a3e2f255053686b081dc0b091a82 \ + --hash=sha256:068cdb2d0d644cdb45670810894f6a0600797a69c05f1ac478e8d31670b8ee75 \ + 
--hash=sha256:0f01dcf33e73d80bd8dc0f20a71303abbafa26a19e23f6b68d1aa9990af90257 \ + --hash=sha256:0fece1d1f0a89c16b03442eae5c56dc0be0c7883b5d388e0c03f53019a4bfd71 \ + --hash=sha256:12e26134a0331d8dbd9351620f037ec470b7c75929cb8a1537f6bfe411152a1a \ + --hash=sha256:1ae241bbfc6ae276f94a170b14785e561cb5e7f626b6688cf076af4110887413 \ + --hash=sha256:1f92f53998a17265194018d1cc321b2e96e900ca52d54c7c77837b71b9465181 \ + --hash=sha256:209fae046e62d0ce6435fcfe3b1a10537e858249b3d9b05829e2a05218296a85 \ + --hash=sha256:20abd069b9cda45874498b245c8015b18ace6de8546bf50dfa8cea1696ed06ef \ + --hash=sha256:21982668592194c609de53ba4933a7471880ccbaadcc52352694a59ecc860b3a \ + --hash=sha256:25f2059807faea4b077a2b6837391b5d830864b3543627f381821c646f31a63c \ + --hash=sha256:2653de5c24910e49c2b106499803124dde62a5a1fe0eedeaecf4309a5f639390 \ + --hash=sha256:2b8f157c8a6f20eb657e240f8985cc135598b2b46985c5bccbde7616dc9c6b1e \ + --hash=sha256:2fb882da679409066b4603579619341c6d6898fc83a8995199d5249f986e8e8f \ + --hash=sha256:40397bda92382fcec844066efb11f13e1c9a3e2a8e8f318fb72ed8b6db9f60f1 \ + --hash=sha256:444be170853f1f9d528428eceb55f12918e4fda5d8805480f36a002f1415e09b \ + --hash=sha256:47c5a6ed21d9452b10227e5e8a0e1c22979811cad7dcc19d8e3e2fb8fa03f1a3 \ + --hash=sha256:4f069069931240b3fc703f1e23df63443dbd6390614c8c44a87d96cd0ec81eb1 \ + --hash=sha256:52b913ec40ff7ae845687b0b34d8d93b60cb66dcee06996dd5c99f2fc9328657 \ + --hash=sha256:5633c0da313330fd20c484c78cdd3f9b175b55e1a766c4a174230c6b70ad8262 \ + --hash=sha256:5daf6f3914a733336dab21a05cdec343144600e964d2fcdabaac0c0269874b2a \ + --hash=sha256:5eea80d908b2c1f91486eb95b3fb6fab187e569ec9752ab7d9333d2e66bf2d6b \ + --hash=sha256:602f65afdef699cda27ec0b9224ae5dc43e328f4c24c689deaf77133dbee74d0 \ + --hash=sha256:659a6107e31a83c4e33f763942275fd278b21d095094044eb35569e86a21ddae \ + --hash=sha256:66cb9422236317f9d44b67b4d18f44efe6e9c7f8794ac0462978513359461554 \ + --hash=sha256:6d82351358ffbcdcd7b686b90742a9b86632d6c1c051016484fa0b326a0a1548 \ + --hash=sha256:6e9f61981ace1360e42737e2bae58b27bf28a1b27e781721047d84bd754d32e7 \ + --hash=sha256:6ed0be1ee58eef41231a5c943d7d1375f093142702d5723ca2eb07db9b934b05 \ + --hash=sha256:7cdde6de52fb6664b00b056341265441192d1291c130e99183ec0d4b110ff8b1 \ + --hash=sha256:7df2de1e4fba69a51c06c28f5a3de36731eb9639feb8e1cf7e4a7b0daf4cf622 \ + --hash=sha256:7edc794af8b36ca37ef5fcb5e0d128c7e0595c7b96a2318d1badb6fcd8ee86b1 \ + --hash=sha256:7f54844851cdb630ceb623dcec4db3240d1ac13d4990532446761baede94996a \ + --hash=sha256:805cc8de9fd6e7a22da5aed858e0ab16be5a4db6c873dde1d7451c541553aa27 \ + --hash=sha256:8906e71fd8afcb76580404e2a950caef2685df3d2a57fe82a86ac8d33cc007ba \ + --hash=sha256:89f7268c009bc492f506abd6f5265defa7cb3f7487dc21d357c3d290add45082 \ + --hash=sha256:8c50dd1fc8826f5b26a5ee4d77ca55d88a895f4e4819c7ecc2a9f5905047a443 \ + --hash=sha256:8e4549f8a3c6d13d55041925e912bfd834285ef1dd64d6bc7d542583355e2e98 \ + --hash=sha256:8e9afaeb0beff068b4d9cd20d322ba0ee1cecfb0b08db145e4ab4dd44a6b5110 \ + --hash=sha256:98f16a80e917003a12c0580f97b5f875853ebc33e2eaa4bccfc8201ac6869308 \ + --hash=sha256:9e35d3e0144137d9fdae62912e869136164534d64a169f86438bc9561b6ad49f \ + --hash=sha256:9e4424677ce4b47fe73c8b5556d876571f7c6945d264201180db2dc34f676ab5 \ + --hash=sha256:adb6ed2ad29b9e15321d167d152ee909ec73395901b70936f029c3bc6d7f4460 \ + --hash=sha256:aea4f66ff44dfddf8c2cffd66ba6538c5ec67d389285292fe428cb2c738c8aef \ + --hash=sha256:b21041e8cb6a1eb5312dd1d2f80a94d91efffb7a06b70597d44f1bd2dfc315ab \ + 
--hash=sha256:b2f0073ed0868db1dcd86e052d37279eef185b9c8db5bf61f30f46adac63c909 \ + --hash=sha256:b3a24467af63c67829bfaa61eecf18d5432d4f11992688537be59ecd6ad32f5e \ + --hash=sha256:b9c618d56a29c9cb1c4da979e9899be7578d2e0b3c24d52079c166324c9e8695 \ + --hash=sha256:bba37bc29d4d85761deed3954a1bc62be7cf462b9510b51d367b769a8c8df325 \ + --hash=sha256:bd3a7a9f5847d2fb8c2c6d1c862fa109c31a9abeca1a3c2bd5a64572955b2979 \ + --hash=sha256:be71bf1edb48ebbbf7f6337b5bfd2f895d1902f6335a5830b20141fc126ffba0 \ + --hash=sha256:c02ef4401a506fb60b411467ad501e1429a3487abca4664871d9ae0b46c8ba32 \ + --hash=sha256:c3cd545784805de05aafe1dde61752ea49a359ccba9760c1e5d1c88a93bbf2b7 \ + --hash=sha256:c7ac672d699bf36275c035e16b65539931347d68b70667d28984c9fb34e07fa7 \ + --hash=sha256:cb7bbb88aa74908950d979eeaa24dbdf1a865e3c7e45ff0121d8f70387b55f73 \ + --hash=sha256:cd2bd2bbed13e213d6b55dc1d035a4f91748a7d3edc9480c13898b0353708920 \ + --hash=sha256:cda077c2e5b780200b6b3e09d0b42205a3d1c68f30c6dceb90401c13bff8fe74 \ + --hash=sha256:cf28c0c1d4c4bf00f509fa7eb02c58d7caf221b50b467bcb0d9bbf1584d5c821 \ + --hash=sha256:d0d9b7c93578baafcbc5f0b83eaf17b79d345c6f36917ba0c67f45226911d499 \ + --hash=sha256:d1240d50adff70c2a88217698ca844723068533f3f5c5fa6ee2e3220e3bdb000 \ + --hash=sha256:d30291931c915b2ab5717c2974bb95ee891a1cf22ebc16a8006bd59cd210d40a \ + --hash=sha256:d9f64d786b3b1dd742c946c42d15b07497ed14af1a1f3ce840cce27daa0ce913 \ + --hash=sha256:da6cad4e82cb893db4b69105c604d805e0c3ce11501a55b5e9f9083b47d2ffe8 \ + --hash=sha256:df1b10187212b198dd45fa943d8985a3c8cf854aed4923796e0e019e113a1bda \ + --hash=sha256:e04ae107ac591763a47398bb45b568fc38f02dbc4aa44c063f67a131f99346cb \ + --hash=sha256:e6dee3bb76aa4009d5a912180bf5b2de012532998d094acee25d9cb8dee3e44a \ + --hash=sha256:e7e88598032542bd49af7c4747541422884219056c268823ef6e5e89851c8825 \ + --hash=sha256:e98c97502435b53741540a5717a6749ac2ada901056c7db951d33e11c885cc7d \ + --hash=sha256:ec055f6dae239a6299cace477b479cca2fc125c5675482daf1dd886933a1076f \ + --hash=sha256:f74f0f7779cc7ae07d1810aab8ac6b1464c3eafb9e283a40da7309d5e6e48fbb \ + --hash=sha256:fbde1b0c6e81d56f5dccd95dd4a711d9b95df1ae4009a60887e56b27e8d903fa \ + --hash=sha256:fcf92bee92742edd401ba41135185866f7026c502617f422eb432cfeca4fe236 \ + --hash=sha256:fd49860271d52127d61197bb50b64f58454e9f578cb4b2c001a6de8b1f50b0b1 + # via + # data-formulator + # db-dtypes + # pandas + # scikit-learn + # scipy + # yfinance +openai==2.16.0 \ + --hash=sha256:42eaa22ca0d8ded4367a77374104d7a2feafee5bd60a107c3c11b5243a11cd12 \ + --hash=sha256:5f46643a8f42899a84e80c38838135d7038e7718333ce61396994f887b09a59b + # via + # data-formulator + # litellm +ordered-set==4.1.0 \ + --hash=sha256:046e1132c71fcf3330438a539928932caf51ddbc582496833e23de611de14562 \ + --hash=sha256:694a8e44c87657c59292ede72891eb91d34131f6531463aab3009191c77364a8 + # via flask-limiter +overrides==7.7.0 ; python_full_version < '3.12' \ + --hash=sha256:55158fa3d93b98cc75299b1e67078ad9003ca27945c76162c1c0766d6f91820a \ + --hash=sha256:c7ed9d062f78b8e4c1a7b70bd8796b35ead4d9f510227ef9c5dc7626c60d7e49 + # via jupyter-server +packaging==26.0 \ + --hash=sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4 \ + --hash=sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529 + # via + # build + # db-dtypes + # google-cloud-bigquery + # huggingface-hub + # ipykernel + # jupyter-events + # jupyter-server + # jupyterlab + # jupyterlab-server + # limits + # nbconvert +pandas==2.3.3 \ + --hash=sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7 
\ + --hash=sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593 \ + --hash=sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5 \ + --hash=sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791 \ + --hash=sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec \ + --hash=sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5 \ + --hash=sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac \ + --hash=sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084 \ + --hash=sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87 \ + --hash=sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35 \ + --hash=sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c \ + --hash=sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713 \ + --hash=sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523 \ + --hash=sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3 \ + --hash=sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78 \ + --hash=sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53 \ + --hash=sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c \ + --hash=sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21 \ + --hash=sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5 \ + --hash=sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45 \ + --hash=sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110 \ + --hash=sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493 \ + --hash=sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b \ + --hash=sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450 \ + --hash=sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86 \ + --hash=sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98 \ + --hash=sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89 \ + --hash=sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66 \ + --hash=sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b \ + --hash=sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8 \ + --hash=sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6 \ + --hash=sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc \ + --hash=sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788 \ + --hash=sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151 \ + --hash=sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b \ + --hash=sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d \ + --hash=sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908 \ + --hash=sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0 \ + --hash=sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b \ + --hash=sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c \ + --hash=sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee + # via + # data-formulator + # db-dtypes + # vega-datasets + # yfinance +pandocfilters==1.5.1 \ + --hash=sha256:002b4a555ee4ebc03f8b66307e287fa492e4a77b4ea14d3f934328297bb4939e \ + 
--hash=sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc + # via nbconvert +parso==0.8.5 \ + --hash=sha256:034d7354a9a018bdce352f48b2a8a450f05e9d6ee85db84764e9b6bd96dafe5a \ + --hash=sha256:646204b5ee239c396d040b90f9e272e9a8017c630092bf59980beb62fd033887 + # via jedi +peewee==3.19.0 \ + --hash=sha256:de220b94766e6008c466e00ce4ba5299b9a832117d9eb36d45d0062f3cfd7417 \ + --hash=sha256:f88292a6f0d7b906cb26bca9c8599b8f4d8920ebd36124400d0cbaaaf915511f + # via yfinance +pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' \ + --hash=sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523 \ + --hash=sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f + # via ipython +platformdirs==4.5.1 \ + --hash=sha256:61d5cdcc6065745cdd94f0f878977f8de9437be93de97c1c12f853c9c0cdcbda \ + --hash=sha256:d03afa3963c806a9bed9d5125c8f4cb2fdaf74a55ab60e5d59b3fde758104d31 + # via + # jupyter-core + # yfinance +prometheus-client==0.24.1 \ + --hash=sha256:150db128af71a5c2482b36e588fc8a6b95e498750da4b17065947c16070f4055 \ + --hash=sha256:7e0ced7fbbd40f7b84962d5d2ab6f17ef88a72504dcf7c0b40737b43b2a461f9 + # via jupyter-server +prompt-toolkit==3.0.52 \ + --hash=sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855 \ + --hash=sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955 + # via + # ipython + # jupyter-console +propcache==0.4.1 \ + --hash=sha256:0013cb6f8dde4b2a2f66903b8ba740bdfe378c943c4377a200551ceb27f379e4 \ + --hash=sha256:005f08e6a0529984491e37d8dbc3dd86f84bd78a8ceb5fa9a021f4c48d4984be \ + --hash=sha256:031dce78b9dc099f4c29785d9cf5577a3faf9ebf74ecbd3c856a7b92768c3df3 \ + --hash=sha256:05674a162469f31358c30bcaa8883cb7829fa3110bf9c0991fe27d7896c42d85 \ + --hash=sha256:060b16ae65bc098da7f6d25bf359f1f31f688384858204fe5d652979e0015e5b \ + --hash=sha256:120c964da3fdc75e3731aa392527136d4ad35868cc556fd09bb6d09172d9a367 \ + --hash=sha256:15932ab57837c3368b024473a525e25d316d8353016e7cc0e5ba9eb343fbb1cf \ + --hash=sha256:17612831fda0138059cc5546f4d12a2aacfb9e47068c06af35c400ba58ba7393 \ + --hash=sha256:204483131fb222bdaaeeea9f9e6c6ed0cac32731f75dfc1d4a567fc1926477c1 \ + --hash=sha256:2ad890caa1d928c7c2965b48f3a3815c853180831d0e5503d35cf00c472f4717 \ + --hash=sha256:2bb07ffd7eaad486576430c89f9b215f9e4be68c4866a96e97db9e97fead85dc \ + --hash=sha256:333ddb9031d2704a301ee3e506dc46b1fe5f294ec198ed6435ad5b6a085facfe \ + --hash=sha256:35c3277624a080cc6ec6f847cbbbb5b49affa3598c4535a0a4682a697aaa5c75 \ + --hash=sha256:364426a62660f3f699949ac8c621aad6977be7126c5807ce48c0aeb8e7333ea6 \ + --hash=sha256:381914df18634f5494334d201e98245c0596067504b9372d8cf93f4bb23e025e \ + --hash=sha256:3d902a36df4e5989763425a8ab9e98cd8ad5c52c823b34ee7ef307fd50582566 \ + --hash=sha256:3f7124c9d820ba5548d431afb4632301acf965db49e666aa21c305cbe8c6de12 \ + --hash=sha256:405aac25c6394ef275dee4c709be43745d36674b223ba4eb7144bf4d691b7367 \ + --hash=sha256:41a89040cb10bd345b3c1a873b2bf36413d48da1def52f268a055f7398514874 \ + --hash=sha256:43eedf29202c08550aac1d14e0ee619b0430aaef78f85864c1a892294fbc28cf \ + --hash=sha256:473c61b39e1460d386479b9b2f337da492042447c9b685f28be4f74d3529e566 \ + --hash=sha256:49a2dc67c154db2c1463013594c458881a069fcf98940e61a0569016a583020a \ + --hash=sha256:4c3c70630930447f9ef1caac7728c8ad1c56bc5015338b20fed0d08ea2480b3a \ + --hash=sha256:4d3df5fa7e36b3225954fba85589da77a0fe6a53e3976de39caf04a0db4c36f1 \ + --hash=sha256:4d7af63f9f93fe593afbf104c21b3b15868efb2c21d07d8732c0c4287e66b6a6 \ + 
--hash=sha256:501d20b891688eb8e7aa903021f0b72d5a55db40ffaab27edefd1027caaafa61 \ + --hash=sha256:521a463429ef54143092c11a77e04056dd00636f72e8c45b70aaa3140d639726 \ + --hash=sha256:5558992a00dfd54ccbc64a32726a3357ec93825a418a401f5cc67df0ac5d9e49 \ + --hash=sha256:55c72fd6ea2da4c318e74ffdf93c4fe4e926051133657459131a95c846d16d44 \ + --hash=sha256:564d9f0d4d9509e1a870c920a89b2fec951b44bf5ba7d537a9e7c1ccec2c18af \ + --hash=sha256:580e97762b950f993ae618e167e7be9256b8353c2dcd8b99ec100eb50f5286aa \ + --hash=sha256:5a103c3eb905fcea0ab98be99c3a9a5ab2de60228aa5aceedc614c0281cf6153 \ + --hash=sha256:5c3310452e0d31390da9035c348633b43d7e7feb2e37be252be6da45abd1abcc \ + --hash=sha256:60a8fda9644b7dfd5dece8c61d8a85e271cb958075bfc4e01083c148b61a7caf \ + --hash=sha256:671538c2262dadb5ba6395e26c1731e1d52534bfe9ae56d0b5573ce539266aa8 \ + --hash=sha256:678ae89ebc632c5c204c794f8dab2837c5f159aeb59e6ed0539500400577298c \ + --hash=sha256:67fad6162281e80e882fb3ec355398cf72864a54069d060321f6cd0ade95fe85 \ + --hash=sha256:6918ecbd897443087a3b7cd978d56546a812517dcaaca51b49526720571fa93e \ + --hash=sha256:6f6ff873ed40292cd4969ef5310179afd5db59fdf055897e282485043fc80ad0 \ + --hash=sha256:6f8b465489f927b0df505cbe26ffbeed4d6d8a2bbc61ce90eb074ff129ef0ab1 \ + --hash=sha256:74c1fb26515153e482e00177a1ad654721bf9207da8a494a0c05e797ad27b992 \ + --hash=sha256:824e908bce90fb2743bd6b59db36eb4f45cd350a39637c9f73b1c1ea66f5b75f \ + --hash=sha256:8326e144341460402713f91df60ade3c999d601e7eb5ff8f6f7862d54de0610d \ + --hash=sha256:8873eb4460fd55333ea49b7d189749ecf6e55bf85080f11b1c4530ed3034cba1 \ + --hash=sha256:89eb3fa9524f7bec9de6e83cf3faed9d79bffa560672c118a96a171a6f55831e \ + --hash=sha256:8e57061305815dfc910a3634dcf584f08168a8836e6999983569f51a8544cd89 \ + --hash=sha256:929d7cbe1f01bb7baffb33dc14eb5691c95831450a26354cd210a8155170c93a \ + --hash=sha256:92d1935ee1f8d7442da9c0c4fa7ac20d07e94064184811b685f5c4fada64553b \ + --hash=sha256:981333cb2f4c1896a12f4ab92a9cc8f09ea664e9b7dbdc4eff74627af3a11c0f \ + --hash=sha256:990f6b3e2a27d683cb7602ed6c86f15ee6b43b1194736f9baaeb93d0016633b1 \ + --hash=sha256:9a0bd56e5b100aef69bd8562b74b46254e7c8812918d3baa700c8a8009b0af66 \ + --hash=sha256:9f302f4783709a78240ebc311b793f123328716a60911d667e0c036bc5dcbded \ + --hash=sha256:a78372c932c90ee474559c5ddfffd718238e8673c340dc21fe45c5b8b54559a0 \ + --hash=sha256:a9695397f85973bb40427dedddf70d8dc4a44b22f1650dd4af9eedf443d45165 \ + --hash=sha256:ab08df6c9a035bee56e31af99be621526bd237bea9f32def431c656b29e41778 \ + --hash=sha256:ab2943be7c652f09638800905ee1bab2c544e537edb57d527997a24c13dc1455 \ + --hash=sha256:ab4c29b49d560fe48b696cdcb127dd36e0bc2472548f3bf56cc5cb3da2b2984f \ + --hash=sha256:af223b406d6d000830c6f65f1e6431783fc3f713ba3e6cc8c024d5ee96170a4b \ + --hash=sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237 \ + --hash=sha256:bcc9aaa5d80322bc2fb24bb7accb4a30f81e90ab8d6ba187aec0744bc302ad81 \ + --hash=sha256:c07fda85708bc48578467e85099645167a955ba093be0a2dcba962195676e859 \ + --hash=sha256:c0d4b719b7da33599dfe3b22d3db1ef789210a0597bc650b7cee9c77c2be8c5c \ + --hash=sha256:c0ef0aaafc66fbd87842a3fe3902fd889825646bc21149eafe47be6072725835 \ + --hash=sha256:c2b5e7db5328427c57c8e8831abda175421b709672f6cfc3d630c3b7e2146393 \ + --hash=sha256:c30b53e7e6bda1d547cabb47c825f3843a0a1a42b0496087bb58d8fedf9f41b5 \ + --hash=sha256:c80ee5802e3fb9ea37938e7eecc307fb984837091d5fd262bb37238b1ae97641 \ + --hash=sha256:c9b822a577f560fbd9554812526831712c1436d2c046cedee4c3796d3543b144 \ + 
--hash=sha256:cae65ad55793da34db5f54e4029b89d3b9b9490d8abe1b4c7ab5d4b8ec7ebf74 \ + --hash=sha256:cb2d222e72399fcf5890d1d5cc1060857b9b236adff2792ff48ca2dfd46c81db \ + --hash=sha256:cd547953428f7abb73c5ad82cbb32109566204260d98e41e5dfdc682eb7f8403 \ + --hash=sha256:cfc27c945f422e8b5071b6e93169679e4eb5bf73bbcbf1ba3ae3a83d2f78ebd9 \ + --hash=sha256:d472aeb4fbf9865e0c6d622d7f4d54a4e101a89715d8904282bb5f9a2f476c3f \ + --hash=sha256:d62cdfcfd89ccb8de04e0eda998535c406bf5e060ffd56be6c586cbcc05b3311 \ + --hash=sha256:d8f353eb14ee3441ee844ade4277d560cdd68288838673273b978e3d6d2c8f36 \ + --hash=sha256:dee69d7015dc235f526fe80a9c90d65eb0039103fe565776250881731f06349f \ + --hash=sha256:e153e9cd40cc8945138822807139367f256f89c6810c2634a4f6902b52d3b4e2 \ + --hash=sha256:e35b88984e7fa64aacecea39236cee32dd9bd8c55f57ba8a75cf2399553f9bd7 \ + --hash=sha256:e53f3a38d3510c11953f3e6a33f205c6d1b001129f972805ca9b42fc308bc239 \ + --hash=sha256:e9b0d8d0845bbc4cfcdcbcdbf5086886bc8157aa963c31c777ceff7846c77757 \ + --hash=sha256:ec17c65562a827bba85e3872ead335f95405ea1674860d96483a02f5c698fa72 \ + --hash=sha256:ecef2343af4cc68e05131e45024ba34f6095821988a9d0a02aa7c73fcc448aa9 \ + --hash=sha256:ed5a841e8bb29a55fb8159ed526b26adc5bdd7e8bd7bf793ce647cb08656cdf4 \ + --hash=sha256:ee17f18d2498f2673e432faaa71698032b0127ebf23ae5974eeaf806c279df24 \ + --hash=sha256:f048da1b4f243fc44f205dfd320933a951b8d89e0afd4c7cacc762a8b9165207 \ + --hash=sha256:f10207adf04d08bec185bae14d9606a1444715bc99180f9331c9c02093e1959e \ + --hash=sha256:f1d2f90aeec838a52f1c1a32fe9a619fefd5e411721a9117fbf82aea638fe8a1 \ + --hash=sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d \ + --hash=sha256:f7ee0e597f495cf415bcbd3da3caa3bd7e816b74d0d52b8145954c5e6fd3ff37 \ + --hash=sha256:f95393b4d66bfae908c3ca8d169d5f79cd65636ae15b5e7a4f6e67af675adb0e \ + --hash=sha256:fc38cba02d1acba4e2869eef1a57a43dfbd3d49a59bf90dda7444ec2be6a5570 \ + --hash=sha256:fd0858c20f078a32cf55f7e81473d96dcf3b93fd2ccdb3d40fdf54b8573df3af \ + --hash=sha256:fd6f30fdcf9ae2a70abd34da54f18da086160e4d7d9251f81f3da0ff84fc5a48 + # via + # aiohttp + # yarl +proto-plus==1.27.1 \ + --hash=sha256:912a7460446625b792f6448bade9e55cd4e41e6ac10e27009ef71a7f317fa147 \ + --hash=sha256:e4643061f3a4d0de092d62aa4ad09fa4756b2cbb89d4627f3985018216f9fefc + # via google-api-core +protobuf==6.33.5 \ + --hash=sha256:3093804752167bcab3998bec9f1048baae6e29505adaf1afd14a37bddede533c \ + --hash=sha256:69915a973dd0f60f31a08b8318b73eab2bd6a392c79184b3612226b0a3f8ec02 \ + --hash=sha256:6ddcac2a081f8b7b9642c09406bc6a4290128fce5f471cddd165960bb9119e5c \ + --hash=sha256:8afa18e1d6d20af15b417e728e9f60f3aa108ee76f23c3b2c07a2c3b546d3afd \ + --hash=sha256:9b71e0281f36f179d00cbcb119cb19dec4d14a81393e5ea220f64b286173e190 \ + --hash=sha256:a5cb85982d95d906df1e2210e58f8e4f1e3cdc088e52c921a041f9c9a0386de5 \ + --hash=sha256:cbf16ba3350fb7b889fca858fb215967792dc125b35c7976ca4818bee3521cf0 \ + --hash=sha256:d71b040839446bac0f4d162e758bea99c8251161dae9d0983a3b88dee345153b + # via + # google-api-core + # googleapis-common-protos + # grpcio-status + # proto-plus + # yfinance +psutil==7.2.2 \ + --hash=sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372 \ + --hash=sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9 \ + --hash=sha256:11fe5a4f613759764e79c65cf11ebdf26e33d6dd34336f8a337aa2996d71c841 \ + --hash=sha256:1a571f2330c966c62aeda00dd24620425d4b0cc86881c89861fbc04549e5dc63 \ + --hash=sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979 \ + 
--hash=sha256:1fa4ecf83bcdf6e6c8f4449aff98eefb5d0604bf88cb883d7da3d8d2d909546a \ + --hash=sha256:2edccc433cbfa046b980b0df0171cd25bcaeb3a68fe9022db0979e7aa74a826b \ + --hash=sha256:7b6d09433a10592ce39b13d7be5a54fbac1d1228ed29abc880fb23df7cb694c9 \ + --hash=sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee \ + --hash=sha256:917e891983ca3c1887b4ef36447b1e0873e70c933afc831c6b6da078ba474312 \ + --hash=sha256:ab486563df44c17f5173621c7b198955bd6b613fb87c71c161f827d3fb149a9b \ + --hash=sha256:ae0aefdd8796a7737eccea863f80f81e468a1e4cf14d926bd9b6f5f2d5f90ca9 \ + --hash=sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e \ + --hash=sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc \ + --hash=sha256:c7663d4e37f13e884d13994247449e9f8f574bc4655d509c3b95e9ec9e2b9dc1 \ + --hash=sha256:e452c464a02e7dc7822a05d25db4cde564444a67e58539a00f929c51eddda0cf \ + --hash=sha256:e78c8603dcd9a04c7364f1a3e670cea95d51ee865e4efb3556a3a63adef958ea \ + --hash=sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988 \ + --hash=sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486 \ + --hash=sha256:eed63d3b4d62449571547b60578c5b2c4bcccc5387148db46e0c2313dad0ee00 \ + --hash=sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8 + # via ipykernel +ptyprocess==0.7.0 ; os_name != 'nt' or (sys_platform != 'emscripten' and sys_platform != 'win32') \ + --hash=sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 \ + --hash=sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220 + # via + # pexpect + # terminado +pure-eval==0.2.3 \ + --hash=sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0 \ + --hash=sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42 + # via stack-data +pyarrow==23.0.0 \ + --hash=sha256:068701f6823449b1b6469120f399a1239766b117d211c5d2519d4ed5861f75de \ + --hash=sha256:075c29aeaa685fd1182992a9ed2499c66f084ee54eea47da3eb76e125e06064c \ + --hash=sha256:0800cc58a6d17d159df823f87ad66cefebf105b982493d4bad03ee7fab84b993 \ + --hash=sha256:14de7d48052cf4b0ed174533eafa3cfe0711b8076ad70bede32cf59f744f0d7c \ + --hash=sha256:15a414f710dc927132dd67c361f78c194447479555af57317066ee5116b90e9e \ + --hash=sha256:1801ba947015d10e23bca9dd6ef5d0e9064a81569a89b6e9a63b59224fd060df \ + --hash=sha256:180e3150e7edfcd182d3d9afba72f7cf19839a497cc76555a8dce998a8f67615 \ + --hash=sha256:18ec84e839b493c3886b9b5e06861962ab4adfaeb79b81c76afbd8d84c7d5fda \ + --hash=sha256:1a9ff6fa4141c24a03a1a434c63c8fa97ce70f8f36bccabc18ebba905ddf0f17 \ + --hash=sha256:20b187ed9550d233a872074159f765f52f9d92973191cd4b93f293a19efbe377 \ + --hash=sha256:2ef0075c2488932e9d3c2eb3482f9459c4be629aa673b725d5e3cf18f777f8e4 \ + --hash=sha256:36d1b5bc6ddcaff0083ceec7e2561ed61a51f49cce8be079ee8ed406acb6fdef \ + --hash=sha256:3a7c68c722da9bb5b0f8c10e3eae71d9825a4b429b40b32709df5d1fa55beb3d \ + --hash=sha256:3e0d2e6915eca7d786be6a77bf227fbc06d825a75b5b5fe9bcbef121dec32685 \ + --hash=sha256:427deac1f535830a744a4f04a6ac183a64fcac4341b3f618e693c41b7b98d2b0 \ + --hash=sha256:4292b889cd224f403304ddda8b63a36e60f92911f89927ec8d98021845ea21be \ + --hash=sha256:4b317ea6e800b5704e5e5929acb6e2dc13e9276b708ea97a39eb8b345aa2658b \ + --hash=sha256:4d85cb6177198f3812db4788e394b757223f60d9a9f5ad6634b3e32be1525803 \ + --hash=sha256:52265266201ec25b6839bf6bd4ea918ca6d50f31d13e1cf200b4261cd11dc25c \ + --hash=sha256:54810f6e6afc4ffee7c2e0051b61722fbea9a4961b46192dcfae8ea12fa09059 \ + 
--hash=sha256:5574d541923efcbfdf1294a2746ae3b8c2498a2dc6cd477882f6f4e7b1ac08d3 \ + --hash=sha256:5961a9f646c232697c24f54d3419e69b4261ba8a8b66b0ac54a1851faffcbab8 \ + --hash=sha256:5b86bb649e4112fb0614294b7d0a175c7513738876b89655605ebb87c804f861 \ + --hash=sha256:632b3e7c3d232f41d64e1a4a043fb82d44f8a349f339a1188c6a0dd9d2d47d8a \ + --hash=sha256:65666fc269669af1ef1c14478c52222a2aa5c907f28b68fb50a203c777e4f60c \ + --hash=sha256:76242c846db1411f1d6c2cc3823be6b86b40567ee24493344f8226ba34a81333 \ + --hash=sha256:799965a5379589510d888be3094c2296efd186a17ca1cef5b77703d4d5121f53 \ + --hash=sha256:7a7d067c9a88faca655c71bcc30ee2782038d59c802d57950826a07f60d83c4c \ + --hash=sha256:832141cc09fac6aab1cd3719951d23301396968de87080c57c9a7634e0ecd068 \ + --hash=sha256:84839d060a54ae734eb60a756aeacb62885244aaa282f3c968f5972ecc7b1ecc \ + --hash=sha256:a149a647dbfe928ce8830a713612aa0b16e22c64feac9d1761529778e4d4eaa5 \ + --hash=sha256:a244279f240c81f135631be91146d7fa0e9e840e1dfed2aba8483eba25cd98e6 \ + --hash=sha256:ad96a597547af7827342ffb3c503c8316e5043bb09b47a84885ce39394c96e00 \ + --hash=sha256:ae7f30f898dfe44ea69654a35c93e8da4cef6606dc4c72394068fd95f8e9f54a \ + --hash=sha256:b73519f8b52ae28127000986bf228fda781e81d3095cd2d3ece76eb5cf760e1b \ + --hash=sha256:b9edf990df77c2901e79608f08c13fbde60202334a4fcadb15c1f57bf7afee43 \ + --hash=sha256:bd5556c24622df90551063ea41f559b714aa63ca953db884cfb958559087a14e \ + --hash=sha256:c4692e83e42438dba512a570c6eaa42be2f8b6c0f492aea27dec54bdc495103a \ + --hash=sha256:ce9486e0535a843cf85d990e2ec5820a47918235183a5c7b8b97ed7e92c2d47d \ + --hash=sha256:dfd9e133e60eaa847fd80530a1b89a052f09f695d0b9c34c235ea6b2e0924cf7 \ + --hash=sha256:e438dd3f33894e34fd02b26bd12a32d30d006f5852315f611aa4add6c7fab4bc \ + --hash=sha256:ebc017d765d71d80a3f8584ca0566b53e40464586585ac64176115baa0ada7d3 \ + --hash=sha256:ef7cac8fe6fccd8b9e7617bfac785b0371a7fe26af59463074e4882747145d40 + # via + # data-formulator + # db-dtypes +pyasn1==0.6.2 \ + --hash=sha256:1eb26d860996a18e9b6ed05e7aae0e9fc21619fcee6af91cca9bad4fbea224bf \ + --hash=sha256:9b59a2b25ba7e4f8197db7686c09fb33e658b98339fadb826e9512629017833b + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 \ + --hash=sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a \ + --hash=sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6 + # via google-auth +pycparser==3.0 ; implementation_name != 'PyPy' \ + --hash=sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29 \ + --hash=sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992 + # via cffi +pydantic==2.12.5 \ + --hash=sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49 \ + --hash=sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d + # via + # litellm + # openai +pydantic-core==2.41.5 \ + --hash=sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90 \ + --hash=sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740 \ + --hash=sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84 \ + --hash=sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33 \ + --hash=sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0 \ + --hash=sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e \ + --hash=sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0 \ + --hash=sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34 \ + 
--hash=sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3 \ + --hash=sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815 \ + --hash=sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14 \ + --hash=sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375 \ + --hash=sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf \ + --hash=sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1 \ + --hash=sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808 \ + --hash=sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553 \ + --hash=sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1 \ + --hash=sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470 \ + --hash=sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2 \ + --hash=sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b \ + --hash=sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660 \ + --hash=sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c \ + --hash=sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594 \ + --hash=sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008 \ + --hash=sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a \ + --hash=sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a \ + --hash=sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd \ + --hash=sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284 \ + --hash=sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586 \ + --hash=sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869 \ + --hash=sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294 \ + --hash=sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f \ + --hash=sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66 \ + --hash=sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51 \ + --hash=sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc \ + --hash=sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d \ + --hash=sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c \ + --hash=sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07 \ + --hash=sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36 \ + --hash=sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e \ + --hash=sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05 \ + --hash=sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e \ + --hash=sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612 \ + --hash=sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b \ + --hash=sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe \ + --hash=sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11 \ + --hash=sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd \ + --hash=sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b \ + --hash=sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c \ + --hash=sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a \ + 
--hash=sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1 \ + --hash=sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf \ + --hash=sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858 \ + --hash=sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2 \ + --hash=sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9 \ + --hash=sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2 \ + --hash=sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3 \ + --hash=sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6 \ + --hash=sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770 \ + --hash=sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc \ + --hash=sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23 \ + --hash=sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26 \ + --hash=sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa \ + --hash=sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d \ + --hash=sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3 \ + --hash=sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d \ + --hash=sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034 \ + --hash=sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9 \ + --hash=sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1 \ + --hash=sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56 \ + --hash=sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b \ + --hash=sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c \ + --hash=sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e \ + --hash=sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9 \ + --hash=sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5 \ + --hash=sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e \ + --hash=sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc \ + --hash=sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb \ + --hash=sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0 \ + --hash=sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8 \ + --hash=sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69 \ + --hash=sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c \ + --hash=sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75 \ + --hash=sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f \ + --hash=sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad \ + --hash=sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b \ + --hash=sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7 + # via pydantic +pygments==2.19.2 \ + --hash=sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887 \ + --hash=sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b + # via + # ipython + # ipython-pygments-lexers + # jupyter-console + # nbconvert +pyjwt==2.11.0 \ + --hash=sha256:35f95c1f0fbe5d5ba6e43f00271c275f7a1a4db1dab27bf708073b75318ea623 \ + --hash=sha256:94a6bde30eb5c8e04fee991062b534071fd1439ef58d2adc9ccb823e7bcd0469 + # via msal 
+pymongo==4.16.0 \ + --hash=sha256:12762e7cc0f8374a8cae3b9f9ed8dabb5d438c7b33329232dd9b7de783454033 \ + --hash=sha256:1c01e8a7cd0ea66baf64a118005535ab5bf9f9eb63a1b50ac3935dccf9a54abe \ + --hash=sha256:1d638b0b1b294d95d0fdc73688a3b61e05cc4188872818cd240d51460ccabcb5 \ + --hash=sha256:21d02cc10a158daa20cb040985e280e7e439832fc6b7857bff3d53ef6914ad50 \ + --hash=sha256:25a6b03a68f9907ea6ec8bc7cf4c58a1b51a18e23394f962a6402f8e46d41211 \ + --hash=sha256:2b0714d7764efb29bf9d3c51c964aed7c4c7237b341f9346f15ceaf8321fdb35 \ + --hash=sha256:2cd60cd1e05de7f01927f8e25ca26b3ea2c09de8723241e5d3bcfdc70eaff76b \ + --hash=sha256:2d0082631a7510318befc2b4fdab140481eb4b9dd62d9245e042157085da2a70 \ + --hash=sha256:311d4549d6bf1f8c61d025965aebb5ba29d1481dc6471693ab91610aaffbc0eb \ + --hash=sha256:36ef2fee50eee669587d742fb456e349634b4fcf8926208766078b089054b24b \ + --hash=sha256:3ead8a0050c53eaa55935895d6919d393d0328ec24b2b9115bdbe881aa222673 \ + --hash=sha256:46ffb728d92dd5b09fc034ed91acf5595657c7ca17d4cf3751322cd554153c17 \ + --hash=sha256:4a19ea46a0fe71248965305a020bc076a163311aefbaa1d83e47d06fa30ac747 \ + --hash=sha256:4c4872299ebe315a79f7f922051061634a64fda95b6b17677ba57ef00b2ba2a4 \ + --hash=sha256:4d4f7ba040f72a9f43a44059872af5a8c8c660aa5d7f90d5344f2ed1c3c02721 \ + --hash=sha256:4fbb8d3552c2ad99d9e236003c0b5f96d5f05e29386ba7abae73949bfebc13dd \ + --hash=sha256:55f8d5a6fe2fa0b823674db2293f92d74cd5f970bc0360f409a1fc21003862d3 \ + --hash=sha256:5d9fdb386cf958e6ef6ff537d6149be7edb76c3268cd6833e6c36aa447e4443f \ + --hash=sha256:60307bb91e0ab44e560fe3a211087748b2b5f3e31f403baf41f5b7b0a70bd104 \ + --hash=sha256:61567f712bda04c7545a037e3284b4367cad8d29b3dec84b4bf3b2147020a75b \ + --hash=sha256:6b2a20edb5452ac8daa395890eeb076c570790dfce6b7a44d788af74c2f8cf96 \ + --hash=sha256:6f2077ec24e2f1248f9cac7b9a2dfb894e50cc7939fcebfb1759f99304caabef \ + --hash=sha256:77cfd37a43a53b02b7bd930457c7994c924ad8bbe8dff91817904bcbf291b371 \ + --hash=sha256:78037d02389745e247fe5ab0bcad5d1ab30726eaac3ad79219c7d6bbb07eec53 \ + --hash=sha256:85dc2f3444c346ea019a371e321ac868a4fab513b7a55fe368f0cc78de8177cc \ + --hash=sha256:8a0f73af1ea56c422b2dcfc0437459148a799ef4231c6aee189d2d4c59d6728f \ + --hash=sha256:8a254d49a9ffe9d7f888e3c677eed3729b14ce85abb08cd74732cead6ccc3c66 \ + --hash=sha256:8ba8405065f6e258a6f872fe62d797a28f383a12178c7153c01ed04e845c600c \ + --hash=sha256:91899dd7fb9a8c50f09c3c1cf0cb73bfbe2737f511f641f19b9650deb61c00ca \ + --hash=sha256:91ac0cb0fe2bf17616c2039dac88d7c9a5088f5cb5829b27c9d250e053664d31 \ + --hash=sha256:948152b30eddeae8355495f9943a3bf66b708295c0b9b6f467de1c620f215487 \ + --hash=sha256:9caacac0dd105e2555521002e2d17afc08665187017b466b5753e84c016628e6 \ + --hash=sha256:9d9885aad05f82fd7ea0c9ca505d60939746b39263fa273d0125170da8f59098 \ + --hash=sha256:a1bf44e13cf2d44d2ea2e928a8140d5d667304abe1a61c4d55b4906f389fbe64 \ + --hash=sha256:aa30cd16ddd2f216d07ba01d9635c873e97ddb041c61cf0847254edc37d1c60e \ + --hash=sha256:acda193f440dd88c2023cb00aa8bd7b93a9df59978306d14d87a8b12fe426b05 \ + --hash=sha256:bd4911c40a43a821dfd93038ac824b756b6e703e26e951718522d29f6eb166a8 \ + --hash=sha256:be1099a8295b1a722d03fb7b48be895d30f4301419a583dcf50e9045968a041c \ + --hash=sha256:c126fb72be2518395cc0465d4bae03125119136462e1945aea19840e45d89cfc \ + --hash=sha256:c53338613043038005bf2e41a2fafa08d29cdbc0ce80891b5366c819456c1ae9 \ + --hash=sha256:c789236366525c3ee3cd6e4e450a9ff629a7d1f4d88b8e18a0aea0615fd7ecf8 \ + --hash=sha256:cf0ec79e8ca7077f455d14d915d629385153b6a11abc0b93283ed73a8013e376 \ + 
--hash=sha256:d15f060bc6d0964a8bb70aba8f0cb6d11ae99715438f640cff11bbcf172eb0e8 \ + --hash=sha256:dabbf3c14de75a20cc3c30bf0c6527157224a93dfb605838eabb1a2ee3be008d \ + --hash=sha256:dbbc5b254c36c37d10abb50e899bc3939bbb7ab1e7c659614409af99bd3e7675 \ + --hash=sha256:dfc320f08ea9a7ec5b2403dc4e8150636f0d6150f4b9792faaae539c88e7db3b \ + --hash=sha256:f1c5f1f818b669875d191323a48912d3fcd2e4906410e8297bb09ac50c4d5ccc \ + --hash=sha256:f25001a955073b80510c0c3db0e043dbbc36904fd69e511c74e3d8640b8a5111 \ + --hash=sha256:f3867dc225d9423c245a51eaac2cfcd53dde8e0a8d8090bb6aed6e31bd6c2d4f \ + --hash=sha256:f513b2c6c0d5c491f478422f6b5b5c27ac1af06a54c93ef8631806f7231bd92e \ + --hash=sha256:f6e42c1bc985d9beee884780ae6048790eb4cd565c46251932906bdb1630034a + # via data-formulator +pymysql==1.1.2 \ + --hash=sha256:4961d3e165614ae65014e361811a724e2044ad3ea3739de9903ae7c21f539f03 \ + --hash=sha256:e6b1d89711dd51f8f74b1631fe08f039e7d76cf67a42a323d3178f0f25762ed9 + # via data-formulator +pyodbc==5.3.0 \ + --hash=sha256:01166162149adf2b8a6dc21a212718f205cabbbdff4047dc0c415af3fd85867e \ + --hash=sha256:08b2439500e212625471d32f8fde418075a5ddec556e095e5a4ba56d61df2dc6 \ + --hash=sha256:0df7ff47fab91ea05548095b00e5eb87ed88ddf4648c58c67b4db95ea4913e23 \ + --hash=sha256:13656184faa3f2d5c6f19b701b8f247342ed581484f58bf39af7315c054e69db \ + --hash=sha256:1629af4706e9228d79dabb4863c11cceb22a6dab90700db0ef449074f0150c0d \ + --hash=sha256:197bb6ddafe356a916b8ee1b8752009057fce58e216e887e2174b24c7ab99269 \ + --hash=sha256:2035c7dfb71677cd5be64d3a3eb0779560279f0a8dc6e33673499498caa88937 \ + --hash=sha256:25c4cfb2c08e77bc6e82f666d7acd52f0e52a0401b1876e60f03c73c3b8aedc0 \ + --hash=sha256:2fe0e063d8fb66efd0ac6dc39236c4de1a45f17c33eaded0d553d21c199f4d05 \ + --hash=sha256:363311bd40320b4a61454bebf7c38b243cd67c762ed0f8a5219de3ec90c96353 \ + --hash=sha256:3cc472c8ae2feea5b4512e23b56e2b093d64f7cbc4b970af51da488429ff7818 \ + --hash=sha256:3f1bdb3ce6480a17afaaef4b5242b356d4997a872f39e96f015cabef00613797 \ + --hash=sha256:58635a1cc859d5af3f878c85910e5d7228fe5c406d4571bffcdd281375a54b39 \ + --hash=sha256:5cbe4d753723c8a8f65020b7a259183ef5f14307587165ce37e8c7e251951852 \ + --hash=sha256:5ceaed87ba2ea848c11223f66f629ef121f6ebe621f605cde9cfdee4fd9f4b68 \ + --hash=sha256:5dd3d5e469f89a3112cf8b0658c43108a4712fad65e576071e4dd44d2bd763c7 \ + --hash=sha256:5ebf6b5d989395efe722b02b010cb9815698a4d681921bf5db1c0e1195ac1bde \ + --hash=sha256:6132554ffbd7910524d643f13ce17f4a72f3a6824b0adef4e9a7f66efac96350 \ + --hash=sha256:676031723aac7dcbbd2813bddda0e8abf171b20ec218ab8dfb21d64a193430ea \ + --hash=sha256:729c535341bb09c476f219d6f7ab194bcb683c4a0a368010f1cb821a35136f05 \ + --hash=sha256:74528fe148980d0c735c0ebb4a4dc74643ac4574337c43c1006ac4d09593f92d \ + --hash=sha256:754d052030d00c3ac38da09ceb9f3e240e8dd1c11da8906f482d5419c65b9ef5 \ + --hash=sha256:7713c740a10f33df3cb08f49a023b7e1e25de0c7c99650876bbe717bc95ee780 \ + --hash=sha256:7e9ab0b91de28a5ab838ac4db0253d7cc8ce2452efe4ad92ee6a57b922bf0c24 \ + --hash=sha256:8339d3094858893c1a68ee1af93efc4dff18b8b65de54d99104b99af6306320d \ + --hash=sha256:9b987a25a384f31e373903005554230f5a6d59af78bce62954386736a902a4b3 \ + --hash=sha256:a48d731432abaee5256ed6a19a3e1528b8881f9cb25cb9cf72d8318146ea991b \ + --hash=sha256:af4d8c9842fc4a6360c31c35508d6594d5a3b39922f61b282c2b4c9d9da99514 \ + --hash=sha256:afe7c4ac555a8d10a36234788fc6cfc22a86ce37fc5ba88a1f75b3e6696665dc \ + --hash=sha256:b180bc5e49b74fd40a24ef5b0fe143d0c234ac1506febe810d7434bf47cb925b \ + 
--hash=sha256:bc834567c2990584b9726cba365834d039380c9dbbcef3030ddeb00c6541b943 \ + --hash=sha256:bfeb3e34795d53b7d37e66dd54891d4f9c13a3889a8f5fe9640e56a82d770955 \ + --hash=sha256:c2eb0b08e24fe5c40c7ebe9240c5d3bd2f18cd5617229acee4b0a0484dc226f2 \ + --hash=sha256:c5c30c5cd40b751f77bbc73edd32c4498630939bcd4e72ee7e6c9a4b982cc5ca \ + --hash=sha256:c67e7f2ce649155ea89beb54d3b42d83770488f025cf3b6f39ca82e9c598a02e \ + --hash=sha256:c6ccb5315ec9e081f5cbd66f36acbc820ad172b8fa3736cf7f993cdf69bd8a96 \ + --hash=sha256:c79df54bbc25bce9f2d87094e7b39089c28428df5443d1902b0cc5f43fd2da6f \ + --hash=sha256:cf18797a12e70474e1b7f5027deeeccea816372497e3ff2d46b15bec2d18a0cc \ + --hash=sha256:d255f6b117d05cfc046a5201fdf39535264045352ea536c35777cf66d321fbb8 \ + --hash=sha256:d32c3259762bef440707098010035bbc83d1c73d81a434018ab8c688158bd3bb \ + --hash=sha256:d89a7f2e24227150c13be8164774b7e1f9678321a4248f1356a465b9cc17d31e \ + --hash=sha256:e3c39de3005fff3ae79246f952720d44affc6756b4b85398da4c5ea76bf8f506 \ + --hash=sha256:ebc3be93f61ea0553db88589e683ace12bf975baa954af4834ab89f5ee7bf8ae \ + --hash=sha256:f1ad0e93612a6201621853fc661209d82ff2a35892b7d590106fe8f97d9f1f2a \ + --hash=sha256:f927b440c38ade1668f0da64047ffd20ec34e32d817f9a60d07553301324b364 \ + --hash=sha256:fe77eb9dcca5fc1300c9121f81040cc9011d28cff383e2c35416e9ec06d4bc95 + # via data-formulator +pyproject-hooks==1.2.0 \ + --hash=sha256:1e859bd5c40fae9448642dd871adf459e5e2084186e8d2c2a79a824c970da1f8 \ + --hash=sha256:9e5c6bfa8dcc30091c74b0cf803c81fdd29d94f01992a7707bc97babb1141913 + # via build +python-dateutil==2.9.0.post0 \ + --hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \ + --hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 + # via + # arrow + # azure-kusto-data + # botocore + # google-cloud-bigquery + # jupyter-client + # pandas +python-dotenv==1.2.1 \ + --hash=sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6 \ + --hash=sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61 + # via + # data-formulator + # litellm +python-json-logger==4.0.0 \ + --hash=sha256:af09c9daf6a813aa4cc7180395f50f2a9e5fa056034c9953aec92e381c5ba1e2 \ + --hash=sha256:f58e68eb46e1faed27e0f574a55a0455eecd7b8a5b88b85a784519ba3cff047f + # via jupyter-events +pytz==2025.2 \ + --hash=sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3 \ + --hash=sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00 + # via + # pandas + # yfinance +pywinpty==3.0.2 ; os_name == 'nt' \ + --hash=sha256:1505cc4cb248af42cb6285a65c9c2086ee9e7e574078ee60933d5d7fa86fb004 \ + --hash=sha256:18f78b81e4cfee6aabe7ea8688441d30247b73e52cd9657138015c5f4ee13a51 \ + --hash=sha256:28297cecc37bee9f24d8889e47231972d6e9e84f7b668909de54f36ca785029a \ + --hash=sha256:327790d70e4c841ebd9d0f295a780177149aeb405bca44c7115a3de5c2054b23 \ + --hash=sha256:34b55ae9a1b671fe3eae071d86618110538e8eaad18fcb1531c0830b91a82767 \ + --hash=sha256:663383ecfab7fc382cc97ea5c4f7f0bb32c2f889259855df6ea34e5df42d305b \ + --hash=sha256:99fdd9b455f0ad6419aba6731a7a0d2f88ced83c3c94a80ff9533d95fa8d8a9e + # via + # jupyter-server + # jupyter-server-terminals + # terminado +pyyaml==6.0.3 \ + --hash=sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c \ + --hash=sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3 \ + --hash=sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6 \ + --hash=sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c \ + 
--hash=sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65 \ + --hash=sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a \ + --hash=sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1 \ + --hash=sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310 \ + --hash=sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4 \ + --hash=sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea \ + --hash=sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e \ + --hash=sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac \ + --hash=sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9 \ + --hash=sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7 \ + --hash=sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35 \ + --hash=sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb \ + --hash=sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b \ + --hash=sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c \ + --hash=sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd \ + --hash=sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824 \ + --hash=sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065 \ + --hash=sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c \ + --hash=sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c \ + --hash=sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764 \ + --hash=sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196 \ + --hash=sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b \ + --hash=sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00 \ + --hash=sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac \ + --hash=sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8 \ + --hash=sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e \ + --hash=sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28 \ + --hash=sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3 \ + --hash=sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5 \ + --hash=sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf \ + --hash=sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5 \ + --hash=sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702 \ + --hash=sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788 \ + --hash=sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d \ + --hash=sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc \ + --hash=sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba \ + --hash=sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5 \ + --hash=sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26 \ + --hash=sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f \ + --hash=sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b \ + --hash=sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be \ + --hash=sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c \ + 
--hash=sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6 \ + --hash=sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0 + # via + # huggingface-hub + # jupyter-events +pyzmq==27.1.0 \ + --hash=sha256:01c0e07d558b06a60773744ea6251f769cd79a41a97d11b8bf4ab8f034b0424d \ + --hash=sha256:0790a0161c281ca9723f804871b4027f2e8b5a528d357c8952d08cd1a9c15581 \ + --hash=sha256:08363b2011dec81c354d694bdecaef4770e0ae96b9afea70b3f47b973655cc05 \ + --hash=sha256:0de3028d69d4cdc475bfe47a6128eb38d8bc0e8f4d69646adfbcd840facbac28 \ + --hash=sha256:15c8bd0fe0dabf808e2d7a681398c4e5ded70a551ab47482067a572c054c8e2e \ + --hash=sha256:1779be8c549e54a1c38f805e56d2a2e5c009d26de10921d7d51cfd1c8d4632ea \ + --hash=sha256:18770c8d3563715387139060d37859c02ce40718d1faf299abddcdcc6a649066 \ + --hash=sha256:190cbf120fbc0fc4957b56866830def56628934a9d112aec0e2507aa6a032b97 \ + --hash=sha256:19c9468ae0437f8074af379e986c5d3d7d7bfe033506af442e8c879732bedbe0 \ + --hash=sha256:1c179799b118e554b66da67d88ed66cd37a169f1f23b5d9f0a231b4e8d44a113 \ + --hash=sha256:1f0b2a577fd770aa6f053211a55d1c47901f4d537389a034c690291485e5fe92 \ + --hash=sha256:226b091818d461a3bef763805e75685e478ac17e9008f49fce2d3e52b3d58b86 \ + --hash=sha256:250e5436a4ba13885494412b3da5d518cd0d3a278a1ae640e113c073a5f88edd \ + --hash=sha256:3837439b7f99e60312f0c926a6ad437b067356dc2bc2ec96eb395fd0fe804233 \ + --hash=sha256:43ad9a73e3da1fab5b0e7e13402f0b2fb934ae1c876c51d0afff0e7c052eca31 \ + --hash=sha256:452631b640340c928fa343801b0d07eb0c3789a5ffa843f6e1a9cee0ba4eb4fc \ + --hash=sha256:53b40f8ae006f2734ee7608d59ed661419f087521edbfc2149c3932e9c14808c \ + --hash=sha256:544b4e3b7198dde4a62b8ff6685e9802a9a1ebf47e77478a5eb88eca2a82f2fd \ + --hash=sha256:5bbf8d3630bf96550b3be8e1fc0fea5cbdc8d5466c1192887bd94869da17a63e \ + --hash=sha256:6bb54ca21bcfe361e445256c15eedf083f153811c37be87e0514934d6913061e \ + --hash=sha256:6df079c47d5902af6db298ec92151db82ecb557af663098b92f2508c398bb54f \ + --hash=sha256:6f3afa12c392f0a44a2414056d730eebc33ec0926aae92b5ad5cf26ebb6cc128 \ + --hash=sha256:7200bb0f03345515df50d99d3db206a0a6bee1955fbb8c453c76f5bf0e08fb96 \ + --hash=sha256:75a2f36223f0d535a0c919e23615fc85a1e23b71f40c7eb43d7b1dedb4d8f15f \ + --hash=sha256:7ccc0700cfdf7bd487bea8d850ec38f204478681ea02a582a8da8171b7f90a1c \ + --hash=sha256:8085a9fba668216b9b4323be338ee5437a235fe275b9d1610e422ccc279733e2 \ + --hash=sha256:80d834abee71f65253c91540445d37c4c561e293ba6e741b992f20a105d69146 \ + --hash=sha256:90e6e9441c946a8b0a667356f7078d96411391a3b8f80980315455574177ec97 \ + --hash=sha256:93ad4b0855a664229559e45c8d23797ceac03183c7b6f5b4428152a6b06684a5 \ + --hash=sha256:9ce490cf1d2ca2ad84733aa1d69ce6855372cb5ce9223802450c9b2a7cba0ccf \ + --hash=sha256:ac0765e3d44455adb6ddbf4417dcce460fc40a05978c08efdf2948072f6db540 \ + --hash=sha256:ac25465d42f92e990f8d8b0546b01c391ad431c3bf447683fdc40565941d0604 \ + --hash=sha256:add071b2d25f84e8189aaf0882d39a285b42fa3853016ebab234a5e78c7a43db \ + --hash=sha256:bafcb3dd171b4ae9f19ee6380dfc71ce0390fefaf26b504c0e5f628d7c8c54f2 \ + --hash=sha256:c65047adafe573ff023b3187bb93faa583151627bc9c51fc4fb2c561ed689d39 \ + --hash=sha256:c895a6f35476b0c3a54e3eb6ccf41bf3018de937016e6e18748317f25d4e925f \ + --hash=sha256:c9f7f6e13dff2e44a6afeaf2cf54cee5929ad64afaf4d40b50f93c58fc687355 \ + --hash=sha256:ce980af330231615756acd5154f29813d553ea555485ae712c491cd483df6b7a \ + --hash=sha256:cedc4c68178e59a4046f97eca31b148ddcf51e88677de1ef4e78cf06c5376c9a \ + --hash=sha256:cf44a7763aea9298c0aa7dbf859f87ed7012de8bda0f3977b6fb1d96745df856 \ + 
--hash=sha256:d54530c8c8b5b8ddb3318f481297441af102517602b569146185fa10b63f4fa9 \ + --hash=sha256:dc5dbf68a7857b59473f7df42650c621d7e8923fb03fa74a526890f4d33cc4d7 \ + --hash=sha256:e343d067f7b151cfe4eb3bb796a7752c9d369eed007b91231e817071d2c2fec7 \ + --hash=sha256:e829529fcaa09937189178115c49c504e69289abd39967cd8a4c215761373394 \ + --hash=sha256:eca6b47df11a132d1745eb3b5b5e557a7dae2c303277aa0e69c6ba91b8736e07 \ + --hash=sha256:f30f395a9e6fbca195400ce833c731e7b64c3919aa481af4d88c3759e0cb7496 \ + --hash=sha256:f605d884e7c8be8fe1aa94e0a783bf3f591b84c24e4bc4f3e7564c82ac25e271 \ + --hash=sha256:fbb4f2400bfda24f12f009cba62ad5734148569ff4949b1b6ec3b519444342e6 + # via + # ipykernel + # jupyter-client + # jupyter-console + # jupyter-server +referencing==0.37.0 \ + --hash=sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231 \ + --hash=sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8 + # via + # jsonschema + # jsonschema-specifications + # jupyter-events +regex==2026.1.15 \ + --hash=sha256:05d75a668e9ea16f832390d22131fe1e8acc8389a694c8febc3e340b0f810b93 \ + --hash=sha256:0751a26ad39d4f2ade8fe16c59b2bf5cb19eb3d2cd543e709e583d559bd9efde \ + --hash=sha256:08df9722d9b87834a3d701f3fca570b2be115654dbfd30179f30ab2f39d606d3 \ + --hash=sha256:0bf065240704cb8951cc04972cf107063917022511273e0969bdb34fc173456c \ + --hash=sha256:0bf650f26087363434c4e560011f8e4e738f6f3e029b85d4904c50135b86cfa5 \ + --hash=sha256:0dcd31594264029b57bf16f37fd7248a70b3b764ed9e0839a8f271b2d22c0785 \ + --hash=sha256:0f0c7684c7f9ca241344ff95a1de964f257a5251968484270e91c25a755532c5 \ + --hash=sha256:124dc36c85d34ef2d9164da41a53c1c8c122cfb1f6e1ec377a1f27ee81deb794 \ + --hash=sha256:164759aa25575cbc0651bef59a0b18353e54300d79ace8084c818ad8ac72b7d5 \ + --hash=sha256:166551807ec20d47ceaeec380081f843e88c8949780cd42c40f18d16168bed10 \ + --hash=sha256:18388a62989c72ac24de75f1449d0fb0b04dfccd0a1a7c1c43af5eb503d890f6 \ + --hash=sha256:194312a14819d3e44628a44ed6fea6898fdbecb0550089d84c403475138d0a09 \ + --hash=sha256:1ae6020fb311f68d753b7efa9d4b9a5d47a5d6466ea0d5e3b5a471a960ea6e4a \ + --hash=sha256:1cb740d044aff31898804e7bf1181cc72c03d11dfd19932b9911ffc19a79070a \ + --hash=sha256:1e1808471fbe44c1a63e5f577a1d5f02fe5d66031dcbdf12f093ffc1305a858e \ + --hash=sha256:1e8cd52557603f5c66a548f69421310886b28b7066853089e1a71ee710e1cdc1 \ + --hash=sha256:2748c1ec0663580b4510bd89941a31560b4b439a0b428b49472a3d9944d11cd8 \ + --hash=sha256:27618391db7bdaf87ac6c92b31e8f0dfb83a9de0075855152b720140bda177a2 \ + --hash=sha256:2a8d7b50c34578d0d3bf7ad58cde9652b7d683691876f83aedc002862a35dc5e \ + --hash=sha256:2b091aefc05c78d286657cd4db95f2e6313375ff65dcf085e42e4c04d9c8d410 \ + --hash=sha256:2c2b80399a422348ce5de4fe40c418d6299a0fa2803dd61dc0b1a2f28e280fcf \ + --hash=sha256:2f2775843ca49360508d080eaa87f94fa248e2c946bbcd963bb3aae14f333413 \ + --hash=sha256:32655d17905e7ff8ba5c764c43cb124e34a9245e45b83c22e81041e1071aee10 \ + --hash=sha256:3601ffb5375de85a16f407854d11cca8fe3f5febbe3ac78fb2866bb220c74d10 \ + --hash=sha256:3d7d92495f47567a9b1669c51fc8d6d809821849063d168121ef801bbc213846 \ + --hash=sha256:40c86d8046915bb9aeb15d3f3f15b6fd500b8ea4485b30e1bbc799dab3fe29f8 \ + --hash=sha256:4161d87f85fa831e31469bfd82c186923070fc970b9de75339b68f0c75b51903 \ + --hash=sha256:41aef6f953283291c4e4e6850607bd71502be67779586a61472beacb315c97ec \ + --hash=sha256:453078802f1b9e2b7303fb79222c054cb18e76f7bdc220f7530fdc85d319f99e \ + --hash=sha256:492534a0ab925d1db998defc3c302dae3616a2fc3fe2e08db1472348f096ddf2 \ + 
--hash=sha256:4c5ef43b5c2d4114eb8ea424bb8c9cec01d5d17f242af88b2448f5ee81caadbc \ + --hash=sha256:4c8fcc5793dde01641a35905d6731ee1548f02b956815f8f1cab89e515a5bdf1 \ + --hash=sha256:4def140aa6156bc64ee9912383d4038f3fdd18fee03a6f222abd4de6357ce42a \ + --hash=sha256:5170907244b14303edc5978f522f16c974f32d3aa92109fabc2af52411c9433b \ + --hash=sha256:56a5595d0f892f214609c9f76b41b7428bed439d98dc961efafdd1354d42baae \ + --hash=sha256:57e7d17f59f9ebfa9667e6e5a1c0127b96b87cb9cede8335482451ed00788ba4 \ + --hash=sha256:5ef19071f4ac9f0834793af85bd04a920b4407715624e40cb7a0631a11137cdf \ + --hash=sha256:619843841e220adca114118533a574a9cd183ed8a28b85627d2844c500a2b0db \ + --hash=sha256:621f73a07595d83f28952d7bd1e91e9d1ed7625fb7af0064d3516674ec93a2a2 \ + --hash=sha256:6d220a2517f5893f55daac983bfa9fe998a7dbcaee4f5d27a88500f8b7873788 \ + --hash=sha256:6e42844ad64194fa08d5ccb75fe6a459b9b08e6d7296bd704460168d58a388f3 \ + --hash=sha256:726ea4e727aba21643205edad8f2187ec682d3305d790f73b7a51c7587b64bdd \ + --hash=sha256:74f45d170a21df41508cb67165456538425185baaf686281fa210d7e729abc34 \ + --hash=sha256:7ef7d5d4bd49ec7364315167a4134a015f61e8266c6d446fc116a9ac4456e10d \ + --hash=sha256:8050ba2e3ea1d8731a549e83c18d2f0999fbc99a5f6bd06b4c91449f55291804 \ + --hash=sha256:82345326b1d8d56afbe41d881fdf62f1926d7264b2fc1537f99ae5da9aad7913 \ + --hash=sha256:8355ad842a7c7e9e5e55653eade3b7d1885ba86f124dd8ab1f722f9be6627434 \ + --hash=sha256:86c1077a3cc60d453d4084d5b9649065f3bf1184e22992bd322e1f081d3117fb \ + --hash=sha256:8dd16fba2758db7a3780a051f245539c4451ca20910f5a5e6ea1c08d06d4a76b \ + --hash=sha256:8e32f7896f83774f91499d239e24cebfadbc07639c1494bb7213983842348337 \ + --hash=sha256:91c5036ebb62663a6b3999bdd2e559fd8456d17e2b485bf509784cd31a8b1705 \ + --hash=sha256:9250d087bc92b7d4899ccd5539a1b2334e44eee85d848c4c1aef8e221d3f8c8f \ + --hash=sha256:9479cae874c81bf610d72b85bb681a94c95722c127b55445285fb0e2c82db8e1 \ + --hash=sha256:968c14d4f03e10b2fd960f1d5168c1f0ac969381d3c1fcc973bc45fb06346599 \ + --hash=sha256:99ad739c3686085e614bf77a508e26954ff1b8f14da0e3765ff7abbf7799f952 \ + --hash=sha256:9d787e3310c6a6425eb346be4ff2ccf6eece63017916fd77fe8328c57be83521 \ + --hash=sha256:a1774cd1981cd212506a23a14dba7fdeaee259f5deba2df6229966d9911e767a \ + --hash=sha256:a30a68e89e5a218b8b23a52292924c1f4b245cb0c68d1cce9aec9bbda6e2c160 \ + --hash=sha256:b10e42a6de0e32559a92f2f8dc908478cc0fa02838d7dbe764c44dca3fa13569 \ + --hash=sha256:b2a13dd6a95e95a489ca242319d18fc02e07ceb28fa9ad146385194d95b3c829 \ + --hash=sha256:b30bcbd1e1221783c721483953d9e4f3ab9c5d165aa709693d3f3946747b1aea \ + --hash=sha256:b5a28980a926fa810dbbed059547b02783952e2efd9c636412345232ddb87ff6 \ + --hash=sha256:b5f7d8d2867152cdb625e72a530d2ccb48a3d199159144cbdd63870882fb6f80 \ + --hash=sha256:bfb0d6be01fbae8d6655c8ca21b3b72458606c4aec9bbc932db758d47aba6db1 \ + --hash=sha256:bfd876041a956e6a90ad7cdb3f6a630c07d491280bfeed4544053cd434901681 \ + --hash=sha256:c08c1f3e34338256732bd6938747daa3c0d5b251e04b6e43b5813e94d503076e \ + --hash=sha256:c243da3436354f4af6c3058a3f81a97d47ea52c9bd874b52fd30274853a1d5df \ + --hash=sha256:c32bef3e7aeee75746748643667668ef941d28b003bfc89994ecf09a10f7a1b5 \ + --hash=sha256:c661fc820cfb33e166bf2450d3dadbda47c8d8981898adb9b6fe24e5e582ba60 \ + --hash=sha256:c6c4dcdfff2c08509faa15d36ba7e5ef5fcfab25f1e8f85a0c8f45bc3a30725d \ + --hash=sha256:c6c565d9a6e1a8d783c1948937ffc377dd5771e83bd56de8317c450a954d2056 \ + --hash=sha256:c8a154cf6537ebbc110e24dabe53095e714245c272da9c1be05734bdad4a61aa \ + 
--hash=sha256:c9c08c2fbc6120e70abff5d7f28ffb4d969e14294fb2143b4b5c7d20e46d1714 \ + --hash=sha256:ca89c5e596fc05b015f27561b3793dc2fa0917ea0d7507eebb448efd35274a70 \ + --hash=sha256:cf8ff04c642716a7f2048713ddc6278c5fd41faa3b9cab12607c7abecd012c22 \ + --hash=sha256:cfecdaa4b19f9ca534746eb3b55a5195d5c95b88cac32a205e981ec0a22b7d31 \ + --hash=sha256:d426616dae0967ca225ab12c22274eb816558f2f99ccb4a1d52ca92e8baf180f \ + --hash=sha256:d5eaa4a4c5b1906bd0d2508d68927f15b81821f85092e06f1a34a4254b0e1af3 \ + --hash=sha256:d639a750223132afbfb8f429c60d9d318aeba03281a5f1ab49f877456448dcf1 \ + --hash=sha256:d920392a6b1f353f4aa54328c867fec3320fa50657e25f64abf17af054fc97ac \ + --hash=sha256:d991483606f3dbec93287b9f35596f41aa2e92b7c2ebbb935b63f409e243c9af \ + --hash=sha256:d9ea2604370efc9a174c1b5dcc81784fb040044232150f7f33756049edfc9026 \ + --hash=sha256:dca3582bca82596609959ac39e12b7dad98385b4fefccb1151b937383cec547d \ + --hash=sha256:e43a55f378df1e7a4fa3547c88d9a5a9b7113f653a66821bcea4718fe6c58763 \ + --hash=sha256:e69d0deeb977ffe7ed3d2e4439360089f9c3f217ada608f0f88ebd67afb6385e \ + --hash=sha256:e90b8db97f6f2c97eb045b51a6b2c5ed69cedd8392459e0642d4199b94fabd7e \ + --hash=sha256:e9bf3f0bbdb56633c07d7116ae60a576f846efdd86a8848f8d62b749e1209ca7 \ + --hash=sha256:ec94c04149b6a7b8120f9f44565722c7ae31b7a6d2275569d2eefa76b83da3be \ + --hash=sha256:eddf73f41225942c1f994914742afa53dc0d01a6e20fe14b878a1b1edc74151f \ + --hash=sha256:ee6854c9000a10938c79238de2379bea30c82e4925a371711af45387df35cab8 \ + --hash=sha256:ef71d476caa6692eea743ae5ea23cde3260677f70122c4d258ca952e5c2d4e84 \ + --hash=sha256:f1862739a1ffb50615c0fde6bae6569b5efbe08d98e59ce009f68a336f64da75 \ + --hash=sha256:f192a831d9575271a22d804ff1a5355355723f94f31d9eef25f0d45a152fdc1a \ + --hash=sha256:f82110ab962a541737bd0ce87978d4c658f06e7591ba899192e2712a517badbb \ + --hash=sha256:f9ca1cbdc0fbfe5e6e6f8221ef2309988db5bcede52443aeaee9a4ad555e0dac \ + --hash=sha256:fe2fda4110a3d0bc163c2e0664be44657431440722c5c5315c65155cab92f9e5 \ + --hash=sha256:febd38857b09867d3ed3f4f1af7d241c5c50362e25ef43034995b77a50df494e + # via tiktoken +requests==2.32.5 \ + --hash=sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6 \ + --hash=sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf + # via + # azure-core + # azure-kusto-data + # google-api-core + # google-cloud-bigquery + # jupyterlab-server + # msal + # tiktoken + # yfinance +rfc3339-validator==0.1.4 \ + --hash=sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b \ + --hash=sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa + # via + # jsonschema + # jupyter-events +rfc3986-validator==0.1.1 \ + --hash=sha256:2f235c432ef459970b4306369336b9d5dbdda31b510ca1e327636e01f528bfa9 \ + --hash=sha256:3d44bde7921b3b9ec3ae4e3adca370438eccebc676456449b145d533b240d055 + # via + # jsonschema + # jupyter-events +rfc3987-syntax==1.1.0 \ + --hash=sha256:6c3d97604e4c5ce9f714898e05401a0445a641cfa276432b0a648c80856f6a3f \ + --hash=sha256:717a62cbf33cffdd16dfa3a497d81ce48a660ea691b1ddd7be710c22f00b4a0d + # via jsonschema +rpds-py==0.30.0 \ + --hash=sha256:07ae8a593e1c3c6b82ca3292efbe73c30b61332fd612e05abee07c79359f292f \ + --hash=sha256:0a59119fc6e3f460315fe9d08149f8102aa322299deaa5cab5b40092345c2136 \ + --hash=sha256:0d08f00679177226c4cb8c5265012eea897c8ca3b93f429e546600c971bcbae7 \ + --hash=sha256:0ed177ed9bded28f8deb6ab40c183cd1192aa0de40c12f38be4d59cd33cb5c65 \ + --hash=sha256:12f90dd7557b6bd57f40abe7747e81e0c0b119bef015ea7726e69fe550e394a4 \ + 
--hash=sha256:1ab5b83dbcf55acc8b08fc62b796ef672c457b17dbd7820a11d6c52c06839bdf \ + --hash=sha256:1b151685b23929ab7beec71080a8889d4d6d9fa9a983d213f07121205d48e2c4 \ + --hash=sha256:1f3587eb9b17f3789ad50824084fa6f81921bbf9a795826570bda82cb3ed91f2 \ + --hash=sha256:250fa00e9543ac9b97ac258bd37367ff5256666122c2d0f2bc97577c60a1818c \ + --hash=sha256:2771c6c15973347f50fece41fc447c054b7ac2ae0502388ce3b6738cd366e3d4 \ + --hash=sha256:27f4b0e92de5bfbc6f86e43959e6edd1425c33b5e69aab0984a72047f2bcf1e3 \ + --hash=sha256:2e6ecb5a5bcacf59c3f912155044479af1d0b6681280048b338b28e364aca1f6 \ + --hash=sha256:33f559f3104504506a44bb666b93a33f5d33133765b0c216a5bf2f1e1503af89 \ + --hash=sha256:3896fa1be39912cf0757753826bc8bdc8ca331a28a7c4ae46b7a21280b06bb85 \ + --hash=sha256:39c02563fc592411c2c61d26b6c5fe1e51eaa44a75aa2c8735ca88b0d9599daa \ + --hash=sha256:3adbb8179ce342d235c31ab8ec511e66c73faa27a47e076ccc92421add53e2bb \ + --hash=sha256:3d4a69de7a3e50ffc214ae16d79d8fbb0922972da0356dcf4d0fdca2878559c6 \ + --hash=sha256:3e62880792319dbeb7eb866547f2e35973289e7d5696c6e295476448f5b63c87 \ + --hash=sha256:3e8eeb0544f2eb0d2581774be4c3410356eba189529a6b3e36bbbf9696175856 \ + --hash=sha256:422c3cb9856d80b09d30d2eb255d0754b23e090034e1deb4083f8004bd0761e4 \ + --hash=sha256:4559c972db3a360808309e06a74628b95eaccbf961c335c8fe0d590cf587456f \ + --hash=sha256:46e83c697b1f1c72b50e5ee5adb4353eef7406fb3f2043d64c33f20ad1c2fc53 \ + --hash=sha256:47b0ef6231c58f506ef0b74d44e330405caa8428e770fec25329ed2cb971a229 \ + --hash=sha256:47e77dc9822d3ad616c3d5759ea5631a75e5809d5a28707744ef79d7a1bcfcad \ + --hash=sha256:47f236970bccb2233267d89173d3ad2703cd36a0e2a6e92d0560d333871a3d23 \ + --hash=sha256:47f9a91efc418b54fb8190a6b4aa7813a23fb79c51f4bb84e418f5476c38b8db \ + --hash=sha256:495aeca4b93d465efde585977365187149e75383ad2684f81519f504f5c13038 \ + --hash=sha256:4c5f36a861bc4b7da6516dbdf302c55313afa09b81931e8280361a4f6c9a2d27 \ + --hash=sha256:4e7fc54e0900ab35d041b0601431b0a0eb495f0851a0639b6ef90f7741b39a18 \ + --hash=sha256:51a1234d8febafdfd33a42d97da7a43f5dcb120c1060e352a3fbc0c6d36e2083 \ + --hash=sha256:55f66022632205940f1827effeff17c4fa7ae1953d2b74a8581baaefb7d16f8c \ + --hash=sha256:58edca431fb9b29950807e301826586e5bbf24163677732429770a697ffe6738 \ + --hash=sha256:5965af57d5848192c13534f90f9dd16464f3c37aaf166cc1da1cae1fd5a34898 \ + --hash=sha256:5ba103fb455be00f3b1c2076c9d4264bfcb037c976167a6047ed82f23153f02e \ + --hash=sha256:5d4c2aa7c50ad4728a094ebd5eb46c452e9cb7edbfdb18f9e1221f597a73e1e7 \ + --hash=sha256:61046904275472a76c8c90c9ccee9013d70a6d0f73eecefd38c1ae7c39045a08 \ + --hash=sha256:613aa4771c99f03346e54c3f038e4cc574ac09a3ddfb0e8878487335e96dead6 \ + --hash=sha256:626a7433c34566535b6e56a1b39a7b17ba961e97ce3b80ec62e6f1312c025551 \ + --hash=sha256:669b1805bd639dd2989b281be2cfd951c6121b65e729d9b843e9639ef1fd555e \ + --hash=sha256:68f19c879420aa08f61203801423f6cd5ac5f0ac4ac82a2368a9fcd6a9a075e0 \ + --hash=sha256:692bef75a5525db97318e8cd061542b5a79812d711ea03dbc1f6f8dbb0c5f0d2 \ + --hash=sha256:6abc8880d9d036ecaafe709079969f56e876fcf107f7a8e9920ba6d5a3878d05 \ + --hash=sha256:6bdfdb946967d816e6adf9a3d8201bfad269c67efe6cefd7093ef959683c8de0 \ + --hash=sha256:73c67f2db7bc334e518d097c6d1e6fed021bbc9b7d678d6cc433478365d1d5f5 \ + --hash=sha256:74a3243a411126362712ee1524dfc90c650a503502f135d54d1b352bd01f2404 \ + --hash=sha256:76fec018282b4ead0364022e3c54b60bf368b9d926877957a8624b58419169b7 \ + --hash=sha256:7cee9c752c0364588353e627da8a7e808a66873672bcb5f52890c33fd965b394 \ + 
--hash=sha256:7e6ecfcb62edfd632e56983964e6884851786443739dbfe3582947e87274f7cb \ + --hash=sha256:806f36b1b605e2d6a72716f321f20036b9489d29c51c91f4dd29a3e3afb73b15 \ + --hash=sha256:8d6d1cc13664ec13c1b84241204ff3b12f9bb82464b8ad6e7a5d3486975c2eed \ + --hash=sha256:9027da1ce107104c50c81383cae773ef5c24d296dd11c99e2629dbd7967a20c6 \ + --hash=sha256:922e10f31f303c7c920da8981051ff6d8c1a56207dbdf330d9047f6d30b70e5e \ + --hash=sha256:945dccface01af02675628334f7cf49c2af4c1c904748efc5cf7bbdf0b579f95 \ + --hash=sha256:946fe926af6e44f3697abbc305ea168c2c31d3e3ef1058cf68f379bf0335a78d \ + --hash=sha256:95f0802447ac2d10bcc69f6dc28fe95fdf17940367b21d34e34c737870758950 \ + --hash=sha256:9854cf4f488b3d57b9aaeb105f06d78e5529d3145b1e4a41750167e8c213c6d3 \ + --hash=sha256:993914b8e560023bc0a8bf742c5f303551992dcb85e247b1e5c7f4a7d145bda5 \ + --hash=sha256:99b47d6ad9a6da00bec6aabe5a6279ecd3c06a329d4aa4771034a21e335c3a97 \ + --hash=sha256:9a4e86e34e9ab6b667c27f3211ca48f73dba7cd3d90f8d5b11be56e5dbc3fb4e \ + --hash=sha256:9cf69cdda1f5968a30a359aba2f7f9aa648a9ce4b580d6826437f2b291cfc86e \ + --hash=sha256:a090322ca841abd453d43456ac34db46e8b05fd9b3b4ac0c78bcde8b089f959b \ + --hash=sha256:a1010ed9524c73b94d15919ca4d41d8780980e1765babf85f9a2f90d247153dd \ + --hash=sha256:a161f20d9a43006833cd7068375a94d035714d73a172b681d8881820600abfad \ + --hash=sha256:a1d0bc22a7cdc173fedebb73ef81e07faef93692b8c1ad3733b67e31e1b6e1b8 \ + --hash=sha256:a2bffea6a4ca9f01b3f8e548302470306689684e61602aa3d141e34da06cf425 \ + --hash=sha256:a4796a717bf12b9da9d3ad002519a86063dcac8988b030e405704ef7d74d2d9d \ + --hash=sha256:a51033ff701fca756439d641c0ad09a41d9242fa69121c7d8769604a0a629825 \ + --hash=sha256:a8fa71a2e078c527c3e9dc9fc5a98c9db40bcc8a92b4e8858e36d329f8684b51 \ + --hash=sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e \ + --hash=sha256:ac98b175585ecf4c0348fd7b29c3864bda53b805c773cbf7bfdaffc8070c976f \ + --hash=sha256:acd7eb3f4471577b9b5a41baf02a978e8bdeb08b4b355273994f8b87032000a8 \ + --hash=sha256:ad1fa8db769b76ea911cb4e10f049d80bf518c104f15b3edb2371cc65375c46f \ + --hash=sha256:b40fb160a2db369a194cb27943582b38f79fc4887291417685f3ad693c5a1d5d \ + --hash=sha256:b4dc1a6ff022ff85ecafef7979a2c6eb423430e05f1165d6688234e62ba99a07 \ + --hash=sha256:ba3af48635eb83d03f6c9735dfb21785303e73d22ad03d489e88adae6eab8877 \ + --hash=sha256:ba81a9203d07805435eb06f536d95a266c21e5b2dfbf6517748ca40c98d19e31 \ + --hash=sha256:c2262bdba0ad4fc6fb5545660673925c2d2a5d9e2e0fb603aad545427be0fc58 \ + --hash=sha256:c77afbd5f5250bf27bf516c7c4a016813eb2d3e116139aed0096940c5982da94 \ + --hash=sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28 \ + --hash=sha256:cdc62c8286ba9bf7f47befdcea13ea0e26bf294bda99758fd90535cbaf408000 \ + --hash=sha256:d948b135c4693daff7bc2dcfc4ec57237a29bd37e60c2fabf5aff2bbacf3e2f1 \ + --hash=sha256:d96c2086587c7c30d44f31f42eae4eac89b60dabbac18c7669be3700f13c3ce1 \ + --hash=sha256:d9a0ca5da0386dee0655b4ccdf46119df60e0f10da268d04fe7cc87886872ba7 \ + --hash=sha256:dbd936cde57abfee19ab3213cf9c26be06d60750e60a8e4dd85d1ab12c8b1f40 \ + --hash=sha256:dc4f992dfe1e2bc3ebc7444f6c7051b4bc13cd8e33e43511e8ffd13bf407010d \ + --hash=sha256:dc824125c72246d924f7f796b4f63c1e9dc810c7d9e2355864b3c3a73d59ade0 \ + --hash=sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84 \ + --hash=sha256:dea5b552272a944763b34394d04577cf0f9bd013207bc32323b5a89a53cf9c2f \ + --hash=sha256:dff13836529b921e22f15cb099751209a60009731a68519630a24d61f0b1b30a \ + 
--hash=sha256:e5d3e6b26f2c785d65cc25ef1e5267ccbe1b069c5c21b8cc724efee290554419 \ + --hash=sha256:e7536cd91353c5273434b4e003cbda89034d67e7710eab8761fd918ec6c69cf8 \ + --hash=sha256:eb0b93f2e5c2189ee831ee43f156ed34e2a89a78a66b98cadad955972548be5a \ + --hash=sha256:eb2c4071ab598733724c08221091e8d80e89064cd472819285a9ab0f24bcedb9 \ + --hash=sha256:ec7c4490c672c1a0389d319b3a9cfcd098dcdc4783991553c332a15acf7249be \ + --hash=sha256:ee454b2a007d57363c2dfd5b6ca4a5d7e2c518938f8ed3b706e37e5d470801ed \ + --hash=sha256:ee6af14263f25eedc3bb918a3c04245106a42dfd4f5c2285ea6f997b1fc3f89a \ + --hash=sha256:f14fc5df50a716f7ece6a80b6c78bb35ea2ca47c499e422aa4463455dd96d56d \ + --hash=sha256:f251c812357a3fed308d684a5079ddfb9d933860fc6de89f2b7ab00da481e65f \ + --hash=sha256:f83424d738204d9770830d35290ff3273fbb02b41f919870479fab14b9d303b2 \ + --hash=sha256:f8d1736cfb49381ba528cd5baa46f82fdc65c06e843dab24dd70b63d09121b3f \ + --hash=sha256:fe5fa731a1fa8a0a56b0977413f8cacac1768dad38d16b3a296712709476fbd5 + # via + # jsonschema + # referencing +rsa==4.9.1 \ + --hash=sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762 \ + --hash=sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75 + # via google-auth +s3transfer==0.16.0 \ + --hash=sha256:18e25d66fed509e3868dc1572b3f427ff947dd2c56f844a5bf09481ad3f3b2fe \ + --hash=sha256:8e990f13268025792229cd52fa10cb7163744bf56e719e0b9cb925ab79abf920 + # via boto3 +scikit-learn==1.8.0 \ + --hash=sha256:00d6f1d66fbcf4eba6e356e1420d33cc06c70a45bb1363cd6f6a8e4ebbbdece2 \ + --hash=sha256:0d6ae97234d5d7079dc0040990a6f7aeb97cb7fa7e8945f1999a429b23569e0a \ + --hash=sha256:146b4d36f800c013d267b29168813f7a03a43ecd2895d04861f1240b564421da \ + --hash=sha256:15fc3b5d19cc2be65404786857f2e13c70c83dd4782676dd6814e3b89dc8f5b9 \ + --hash=sha256:2838551e011a64e3053ad7618dda9310175f7515f1742fa2d756f7c874c05961 \ + --hash=sha256:29ffc74089f3d5e87dfca4c2c8450f88bdc61b0fc6ed5d267f3988f19a1309f6 \ + --hash=sha256:2de443b9373b3b615aec1bb57f9baa6bb3a9bd093f1269ba95c17d870422b271 \ + --hash=sha256:35c007dedb2ffe38fe3ee7d201ebac4a2deccd2408e8621d53067733e3c74809 \ + --hash=sha256:3bad7565bc9cf37ce19a7c0d107742b320c1285df7aab1a6e2d28780df167242 \ + --hash=sha256:4496bb2cf7a43ce1a2d7524a79e40bc5da45cf598dbf9545b7e8316ccba47bb4 \ + --hash=sha256:4511be56637e46c25721e83d1a9cea9614e7badc7040c4d573d75fbe257d6fd7 \ + --hash=sha256:5025ce924beccb28298246e589c691fe1b8c1c96507e6d27d12c5fadd85bfd76 \ + --hash=sha256:56079a99c20d230e873ea40753102102734c5953366972a71d5cb39a32bc40c6 \ + --hash=sha256:5e30adb87f0cc81c7690a84f7932dd66be5bac57cfe16b91cb9151683a4a2d3b \ + --hash=sha256:5fb63362b5a7ddab88e52b6dbb47dac3fd7dafeee740dc6c8d8a446ddedade8e \ + --hash=sha256:6b595b07a03069a2b1740dc08c2299993850ea81cce4fe19b2421e0c970de6b7 \ + --hash=sha256:72358cce49465d140cc4e7792015bb1f0296a9742d5622c67e31399b75468b9e \ + --hash=sha256:74b66d8689d52ed04c271e1329f0c61635bcaf5b926db9b12d58914cdc01fe57 \ + --hash=sha256:7cc267b6108f0a1499a734167282c00c4ebf61328566b55ef262d48e9849c735 \ + --hash=sha256:80832434a6cc114f5219211eec13dcbc16c2bac0e31ef64c6d346cde3cf054cb \ + --hash=sha256:8c497fff237d7b4e07e9ef1a640887fa4fb765647f86fbe00f969ff6280ce2bb \ + --hash=sha256:8fdf95767f989b0cfedb85f7ed8ca215d4be728031f56ff5a519ee1e3276dc2e \ + --hash=sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd \ + --hash=sha256:a0bcfe4d0d14aec44921545fd2af2338c7471de9cb701f1da4c9d85906ab847a \ + --hash=sha256:a69525355a641bf8ef136a7fa447672fb54fe8d60cab5538d9eb7c6438543fb9 \ + 
--hash=sha256:ada8121bcb4dac28d930febc791a69f7cb1673c8495e5eee274190b73a4559c1 \ + --hash=sha256:bf97c10a3f5a7543f9b88cbf488d33d175e9146115a451ae34568597ba33dcde \ + --hash=sha256:c22a2da7a198c28dd1a6e1136f19c830beab7fdca5b3e5c8bba8394f8a5c45b3 \ + --hash=sha256:c2656924ec73e5939c76ac4c8b026fc203b83d8900362eb2599d8aee80e4880f \ + --hash=sha256:c57b1b610bd1f40ba43970e11ce62821c2e6569e4d74023db19c6b26f246cb3b \ + --hash=sha256:eddde82a035681427cbedded4e6eff5e57fa59216c2e3e90b10b19ab1d0a65c3 \ + --hash=sha256:edec98c5e7c128328124a029bceb09eda2d526997780fef8d65e9a69eead963e \ + --hash=sha256:ee787491dbfe082d9c3013f01f5991658b0f38aa8177e4cd4bf434c58f551702 \ + --hash=sha256:f28dd15c6bb0b66ba09728cf09fd8736c304be29409bd8445a080c1280619e8c \ + --hash=sha256:f984ca4b14914e6b4094c5d52a32ea16b49832c03bd17a110f004db3c223e8e1 \ + --hash=sha256:fb65db5d7531bccf3a4f6bec3462223bea71384e2cda41da0f10b7c292b9e7c4 \ + --hash=sha256:fe1c011a640a9f0791146011dfd3c7d9669785f9fed2b2a5f9e207536cf5c2fd + # via data-formulator +scipy==1.17.0 \ + --hash=sha256:00fb5f8ec8398ad90215008d8b6009c9db9fa924fd4c7d6be307c6f945f9cd73 \ + --hash=sha256:031121914e295d9791319a1875444d55079885bbae5bdc9c5e0f2ee5f09d34ff \ + --hash=sha256:0937a0b0d8d593a198cededd4c439a0ea216a3f36653901ea1f3e4be949056f8 \ + --hash=sha256:0cf46c8013fec9d3694dc572f0b54100c28405d55d3e2cb15e2895b25057996e \ + --hash=sha256:0d5018a57c24cb1dd828bcf51d7b10e65986d549f52ef5adb6b4d1ded3e32a57 \ + --hash=sha256:130d12926ae34399d157de777472bf82e9061c60cc081372b3118edacafe1d00 \ + --hash=sha256:13c4096ac6bc31d706018f06a49abe0485f96499deb82066b94d19b02f664209 \ + --hash=sha256:13e861634a2c480bd237deb69333ac79ea1941b94568d4b0efa5db5e263d4fd1 \ + --hash=sha256:1f9586a58039d7229ce77b52f8472c972448cded5736eaf102d5658bbac4c269 \ + --hash=sha256:1ff269abf702f6c7e67a4b7aad981d42871a11b9dd83c58d2d2ea624efbd1088 \ + --hash=sha256:255c0da161bd7b32a6c898e7891509e8a9289f0b1c6c7d96142ee0d2b114c2ea \ + --hash=sha256:2591060c8e648d8b96439e111ac41fd8342fdeff1876be2e19dea3fe8930454e \ + --hash=sha256:272a9f16d6bb4667e8b50d25d71eddcc2158a214df1b566319298de0939d2ab7 \ + --hash=sha256:2abd71643797bd8a106dff97894ff7869eeeb0af0f7a5ce02e4227c6a2e9d6fd \ + --hash=sha256:2b531f57e09c946f56ad0b4a3b2abee778789097871fc541e267d2eca081cff1 \ + --hash=sha256:30509da9dbec1c2ed8f168b8d8aa853bc6723fede1dbc23c7d43a56f5ab72a67 \ + --hash=sha256:33af70d040e8af9d5e7a38b5ed3b772adddd281e3062ff23fec49e49681c38cf \ + --hash=sha256:357ca001c6e37601066092e7c89cca2f1ce74e2a520ca78d063a6d2201101df2 \ + --hash=sha256:3625c631a7acd7cfd929e4e31d2582cf00f42fcf06011f59281271746d77e061 \ + --hash=sha256:363ad4ae2853d88ebcde3ae6ec46ccca903ea9835ee8ba543f12f575e7b07e4e \ + --hash=sha256:40052543f7bbe921df4408f46003d6f01c6af109b9e2c8a66dd1cf6cf57f7d5d \ + --hash=sha256:423ca1f6584fc03936972b5f7c06961670dbba9f234e71676a7c7ccf938a0d61 \ + --hash=sha256:474da16199f6af66601a01546144922ce402cb17362e07d82f5a6cf8f963e449 \ + --hash=sha256:4e00562e519c09da34c31685f6acc3aa384d4d50604db0f245c14e1b4488bfa2 \ + --hash=sha256:5194c445d0a1c7a6c1a4a4681b6b7c71baad98ff66d96b949097e7513c9d6742 \ + --hash=sha256:5fb10d17e649e1446410895639f3385fd2bf4c3c7dfc9bea937bddcbc3d7b9ba \ + --hash=sha256:65ec32f3d32dfc48c72df4291345dae4f048749bc8d5203ee0a3f347f96c5ce6 \ + --hash=sha256:6680f2dfd4f6182e7d6db161344537da644d1cf85cf293f015c60a17ecf08752 \ + --hash=sha256:6e886000eb4919eae3a44f035e63f0fd8b651234117e8f6f29bad1cd26e7bc45 \ + --hash=sha256:7204fddcbec2fe6598f1c5fdf027e9f259106d05202a959a9f1aecf036adc9f6 \ + 
--hash=sha256:819fc26862b4b3c73a60d486dbb919202f3d6d98c87cf20c223511429f2d1a97 \ + --hash=sha256:8547e7c57f932e7354a2319fab613981cde910631979f74c9b542bb167a8b9db \ + --hash=sha256:85b0ac3ad17fa3be50abd7e69d583d98792d7edc08367e01445a1e2076005379 \ + --hash=sha256:87b411e42b425b84777718cc41516b8a7e0795abfa8e8e1d573bf0ef014f0812 \ + --hash=sha256:88c22af9e5d5a4f9e027e26772cc7b5922fab8bcc839edb3ae33de404feebd9e \ + --hash=sha256:9244608d27eafe02b20558523ba57f15c689357c85bdcfe920b1828750aa26eb \ + --hash=sha256:979c3a0ff8e5ba254d45d59ebd38cde48fce4f10b5125c680c7a4bfe177aab07 \ + --hash=sha256:9eeb9b5f5997f75507814ed9d298ab23f62cf79f5a3ef90031b1ee2506abdb5b \ + --hash=sha256:9fad7d3578c877d606b1150135c2639e9de9cecd3705caa37b66862977cc3e72 \ + --hash=sha256:a38c3337e00be6fd8a95b4ed66b5d988bac4ec888fd922c2ea9fe5fb1603dd67 \ + --hash=sha256:aabf057c632798832f071a8dde013c2e26284043934f53b00489f1773b33527e \ + --hash=sha256:c17514d11b78be8f7e6331b983a65a7f5ca1fd037b95e27b280921fe5606286a \ + --hash=sha256:c5e8647f60679790c2f5c76be17e2e9247dc6b98ad0d3b065861e082c56e078d \ + --hash=sha256:cacbaddd91fcffde703934897c5cd2c7cb0371fac195d383f4e1f1c5d3f3bd04 \ + --hash=sha256:d7425fcafbc09a03731e1bc05581f5fad988e48c6a861f441b7ab729a49a55ea \ + --hash=sha256:dac97a27520d66c12a34fd90a4fe65f43766c18c0d6e1c0a80f114d2260080e4 \ + --hash=sha256:dbf133ced83889583156566d2bdf7a07ff89228fe0c0cb727f777de92092ec6b \ + --hash=sha256:e8c0b331c2c1f531eb51f1b4fc9ba709521a712cce58f1aa627bc007421a5306 \ + --hash=sha256:eb2651271135154aa24f6481cbae5cc8af1f0dd46e6533fb7b56aa9727b6a232 \ + --hash=sha256:ebb7446a39b3ae0fe8f416a9a3fdc6fba3f11c634f680f16a239c5187bc487c0 \ + --hash=sha256:ec0827aa4d36cb79ff1b81de898e948a51ac0b9b1c43e4a372c0508c38c0f9a3 \ + --hash=sha256:edce1a1cf66298cccdc48a1bdf8fb10a3bf58e8b58d6c3883dd1530e103f87c0 \ + --hash=sha256:eec3842ec9ac9de5917899b277428886042a93db0b227ebbe3a333b64ec7643d \ + --hash=sha256:ef28d815f4d2686503e5f4f00edc387ae58dfd7a2f42e348bb53359538f01558 \ + --hash=sha256:f2a4942b0f5f7c23c7cd641a0ca1955e2ae83dedcff537e3a0259096635e186b \ + --hash=sha256:f3cd947f20fe17013d401b64e857c6b2da83cae567adbb75b9dcba865abc66d8 \ + --hash=sha256:f603d8a5518c7426414d1d8f82e253e454471de682ce5e39c29adb0df1efb86b \ + --hash=sha256:f7df7941d71314e60a481e02d5ebcb3f0185b8d799c70d03d8258f6c80f3d467 \ + --hash=sha256:f9eb55bb97d00f8b7ab95cb64f873eb0bf54d9446264d9f3609130381233483f \ + --hash=sha256:fc02c37a5639ee67d8fb646ffded6d793c06c5622d36b35cfa8fe5ececb8f042 \ + --hash=sha256:fe508b5690e9eaaa9467fc047f833af58f1152ae51a0d0aed67aa5801f4dd7d6 + # via scikit-learn +send2trash==2.1.0 \ + --hash=sha256:0da2f112e6d6bb22de6aa6daa7e144831a4febf2a87261451c4ad849fe9a873c \ + --hash=sha256:1c72b39f09457db3c05ce1d19158c2cbef4c32b8bedd02c155e49282b7ea7459 + # via jupyter-server +setuptools==80.10.2 \ + --hash=sha256:8b0e9d10c784bf7d262c4e5ec5d4ec94127ce206e8738f29a437945fbc219b70 \ + --hash=sha256:95b30ddfb717250edb492926c92b5221f7ef3fbcc2b07579bcd4a27da21d0173 + # via jupyterlab +shellingham==1.5.4 \ + --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ + --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de + # via huggingface-hub +six==1.17.0 \ + --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \ + --hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81 + # via + # python-dateutil + # rfc3339-validator +sniffio==1.3.1 \ + --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ + 
--hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc + # via openai +soupsieve==2.8.3 \ + --hash=sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349 \ + --hash=sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95 + # via beautifulsoup4 +stack-data==0.6.3 \ + --hash=sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9 \ + --hash=sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695 + # via ipython +terminado==0.18.1 \ + --hash=sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0 \ + --hash=sha256:de09f2c4b85de4765f7714688fff57d3e75bad1f909b589fde880460c753fd2e + # via + # jupyter-server + # jupyter-server-terminals +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via scikit-learn +tiktoken==0.12.0 \ + --hash=sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa \ + --hash=sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e \ + --hash=sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb \ + --hash=sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25 \ + --hash=sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff \ + --hash=sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b \ + --hash=sha256:35a2f8ddd3824608b3d650a000c1ef71f730d0c56486845705a8248da00f9fe5 \ + --hash=sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3 \ + --hash=sha256:3e68e3e593637b53e56f7237be560f7a394451cb8c11079755e80ae64b9e6def \ + --hash=sha256:47a5bc270b8c3db00bb46ece01ef34ad050e364b51d406b6f9730b64ac28eded \ + --hash=sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be \ + --hash=sha256:508fa71810c0efdcd1b898fda574889ee62852989f7c1667414736bcb2b9a4bd \ + --hash=sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a \ + --hash=sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0 \ + --hash=sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0 \ + --hash=sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b \ + --hash=sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37 \ + --hash=sha256:6e227c7f96925003487c33b1b32265fad2fbcec2b7cf4817afb76d416f40f6bb \ + --hash=sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3 \ + --hash=sha256:83d16643edb7fa2c99eff2ab7733508aae1eebb03d5dfc46f5565862810f24e3 \ + --hash=sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b \ + --hash=sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a \ + --hash=sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3 \ + --hash=sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160 \ + --hash=sha256:a1af81a6c44f008cba48494089dd98cccb8b313f55e961a52f5b222d1e507967 \ + --hash=sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646 \ + --hash=sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931 \ + --hash=sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a \ + --hash=sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697 \ + --hash=sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8 \ + --hash=sha256:c06cf0fcc24c2cb2adb5e185c7082a82cba29c17575e828518c2f11a01f445aa \ + 
--hash=sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365 \ + --hash=sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e \ + --hash=sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830 \ + --hash=sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16 \ + --hash=sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88 \ + --hash=sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f \ + --hash=sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63 \ + --hash=sha256:edde1ec917dfd21c1f2f8046b86348b0f54a2c0547f68149d8600859598769ad \ + --hash=sha256:f18f249b041851954217e9fd8e5c00b024ab2315ffda5ed77665a05fa91f42dc \ + --hash=sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71 \ + --hash=sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27 \ + --hash=sha256:ffc5288f34a8bc02e1ea7047b8d041104791d2ddbf42d1e5fa07822cbffe16bd + # via litellm +tinycss2==1.4.0 \ + --hash=sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7 \ + --hash=sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289 + # via bleach +tokenizers==0.22.2 \ + --hash=sha256:1c774b1276f71e1ef716e5486f21e76333464f47bece56bbd554485982a9e03e \ + --hash=sha256:1e418a55456beedca4621dbab65a318981467a2b188e982a23e117f115ce5001 \ + --hash=sha256:2249487018adec45d6e3554c71d46eb39fa8ea67156c640f7513eb26f318cec7 \ + --hash=sha256:25b85325d0815e86e0bac263506dd114578953b7b53d7de09a6485e4a160a7dd \ + --hash=sha256:29c30b83d8dcd061078b05ae0cb94d3c710555fbb44861139f9f83dcca3dc3e4 \ + --hash=sha256:369cc9fc8cc10cb24143873a0d95438bb8ee257bb80c71989e3ee290e8d72c67 \ + --hash=sha256:37ae80a28c1d3265bb1f22464c856bd23c02a05bb211e56d0c5301a435be6c1a \ + --hash=sha256:38337540fbbddff8e999d59970f3c6f35a82de10053206a7562f1ea02d046fa5 \ + --hash=sha256:473b83b915e547aa366d1eee11806deaf419e17be16310ac0a14077f1e28f917 \ + --hash=sha256:544dd704ae7238755d790de45ba8da072e9af3eea688f698b137915ae959281c \ + --hash=sha256:791135ee325f2336f498590eb2f11dc5c295232f288e75c99a36c5dbce63088a \ + --hash=sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc \ + --hash=sha256:a6bf3f88c554a2b653af81f3204491c818ae2ac6fbc09e76ef4773351292bc92 \ + --hash=sha256:bfb88f22a209ff7b40a576d5324bf8286b519d7358663db21d6246fb17eea2d5 \ + --hash=sha256:c9ea31edff2968b44a88f97d784c2f16dc0729b8b143ed004699ebca91f05c48 \ + --hash=sha256:df6c4265b289083bf710dff49bc51ef252f9d5be33a45ee2bed151114a56207b + # via litellm +tornado==6.5.4 \ + --hash=sha256:053e6e16701eb6cbe641f308f4c1a9541f91b6261991160391bfc342e8a551a1 \ + --hash=sha256:1768110f2411d5cd281bac0a090f707223ce77fd110424361092859e089b38d1 \ + --hash=sha256:2d50f63dda1d2cac3ae1fa23d254e16b5e38153758470e9956cbc3d813d40843 \ + --hash=sha256:50ff0a58b0dc97939d29da29cd624da010e7f804746621c78d14b80238669335 \ + --hash=sha256:6076d5dda368c9328ff41ab5d9dd3608e695e8225d1cd0fd1e006f05da3635a8 \ + --hash=sha256:6eb82872335a53dd063a4f10917b3efd28270b56a33db69009606a0312660a6f \ + --hash=sha256:9c86b1643b33a4cd415f8d0fe53045f913bf07b4a3ef646b735a6a86047dda84 \ + --hash=sha256:a22fa9047405d03260b483980635f0b041989d8bcc9a313f8fe18b411d84b1d7 \ + --hash=sha256:d1cf66105dc6acb5af613c054955b8137e34a03698aa53272dbda4afe252be17 \ + --hash=sha256:d6241c1a16b1c9e4cc28148b1cda97dd1c6cb4fb7068ac1bedc610768dff0ba9 \ + --hash=sha256:e5fb5e04efa54cf0baabdd10061eb4148e0be137166146fff835745f59ab9f7f \ + 
--hash=sha256:fa07d31e0cd85c60713f2b995da613588aa03e1303d75705dca6af8babc18ddc + # via + # ipykernel + # jupyter-client + # jupyter-server + # jupyterlab + # notebook + # terminado +tqdm==4.67.2 \ + --hash=sha256:649aac53964b2cb8dec76a14b405a4c0d13612cb8933aae547dd144eacc99653 \ + --hash=sha256:9a12abcbbff58b6036b2167d9d3853042b9d436fe7330f06ae047867f2f8e0a7 + # via + # huggingface-hub + # openai +traitlets==5.14.3 \ + --hash=sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7 \ + --hash=sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f + # via + # ipykernel + # ipython + # ipywidgets + # jupyter-client + # jupyter-console + # jupyter-core + # jupyter-events + # jupyter-server + # jupyterlab + # matplotlib-inline + # nbclient + # nbconvert + # nbformat +typer-slim==0.21.1 \ + --hash=sha256:6e6c31047f171ac93cc5a973c9e617dbc5ab2bddc4d0a3135dc161b4e2020e0d \ + --hash=sha256:73495dd08c2d0940d611c5a8c04e91c2a0a98600cbd4ee19192255a233b6dbfd + # via huggingface-hub +typing-extensions==4.15.0 \ + --hash=sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466 \ + --hash=sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548 + # via + # aiosignal + # anyio + # azure-core + # azure-identity + # azure-keyvault-secrets + # azure-storage-blob + # beautifulsoup4 + # flask-limiter + # grpcio + # huggingface-hub + # ipython + # limits + # openai + # pydantic + # pydantic-core + # referencing + # typer-slim + # typing-inspection +typing-inspection==0.4.2 \ + --hash=sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7 \ + --hash=sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464 + # via pydantic +tzdata==2025.3 \ + --hash=sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1 \ + --hash=sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7 + # via + # arrow + # pandas +uri-template==1.3.0 \ + --hash=sha256:0e00f8eb65e18c7de20d595a14336e9f337ead580c70934141624b6d1ffdacc7 \ + --hash=sha256:a44a133ea12d44a0c0f06d7d42a52d71282e77e2f937d8abd5655b8d56fc1363 + # via jsonschema +urllib3==2.6.3 \ + --hash=sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed \ + --hash=sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4 + # via + # botocore + # requests +vega-datasets==0.9.0 \ + --hash=sha256:3d7c63917be6ca9b154b565f4779a31fedce57b01b5b9d99d8a34a7608062a1d \ + --hash=sha256:9dbe9834208e8ec32ab44970df315de9102861e4cda13d8e143aab7a80d93fc0 + # via data-formulator +vl-convert-python==1.9.0.post1 \ + --hash=sha256:3c1558fa0055e88c465bd3d71760cde9fa2c94a95f776a0ef9178252fd820b1f \ + --hash=sha256:43e9515f65bbcd317d1ef328787fd7bf0344c2fde9292eb7a0e64d5d3d29fccb \ + --hash=sha256:7e263269ac0d304640ca842b44dfe430ed863accd9edecff42e279bfc48ce940 \ + --hash=sha256:a5b06b3128037519001166f5341ec7831e19fbd7f3a5f78f73d557ac2d5859ef \ + --hash=sha256:b0e7a3245f32addec7e7abeb1badf72b1513ed71ba1dba7aca853901217b3f4e \ + --hash=sha256:e6ecfe4b7e2ea9e8c30fd6d6eaea3ef85475be1ad249407d9796dce4ecdb5b32 + # via data-formulator +wcwidth==0.5.3 \ + --hash=sha256:53123b7af053c74e9fe2e92ac810301f6139e64379031f7124574212fb3b4091 \ + --hash=sha256:d584eff31cd4753e1e5ff6c12e1edfdb324c995713f75d26c29807bb84bf649e + # via prompt-toolkit +webcolors==25.10.0 \ + --hash=sha256:032c727334856fc0b968f63daa252a1ac93d33db2f5267756623c210e57a4f1d \ + --hash=sha256:62abae86504f66d0f6364c2a8520de4a0c47b80c03fc3a5f1815fedbef7c19bf + # via jsonschema +webencodings==0.5.1 \ + 
--hash=sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78 \ + --hash=sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923 + # via + # bleach + # tinycss2 +websocket-client==1.9.0 \ + --hash=sha256:9e813624b6eb619999a97dc7958469217c3176312b3a16a4bd1bc7e08a46ec98 \ + --hash=sha256:af248a825037ef591efbf6ed20cc5faa03d3b47b9e5a2230a529eeee1c1fc3ef + # via jupyter-server +websockets==16.0 \ + --hash=sha256:0298d07ee155e2e9fda5be8a9042200dd2e3bb0b8a38482156576f863a9d457c \ + --hash=sha256:08d7af67b64d29823fed316505a89b86705f2b7981c07848fb5e3ea3020c1abe \ + --hash=sha256:152284a83a00c59b759697b7f9e9cddf4e3c7861dd0d964b472b70f78f89e80e \ + --hash=sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec \ + --hash=sha256:19c4dc84098e523fd63711e563077d39e90ec6702aff4b5d9e344a60cb3c0cb1 \ + --hash=sha256:1c1b30e4f497b0b354057f3467f56244c603a79c0d1dafce1d16c283c25f6e64 \ + --hash=sha256:31a52addea25187bde0797a97d6fc3d2f92b6f72a9370792d65a6e84615ac8a8 \ + --hash=sha256:32da954ffa2814258030e5a57bc73a3635463238e797c7375dc8091327434206 \ + --hash=sha256:3425ac5cf448801335d6fdc7ae1eb22072055417a96cc6b31b3861f455fbc156 \ + --hash=sha256:349f83cd6c9a415428ee1005cadb5c2c56f4389bc06a9af16103c3bc3dcc8b7d \ + --hash=sha256:417b28978cdccab24f46400586d128366313e8a96312e4b9362a4af504f3bbad \ + --hash=sha256:485c49116d0af10ac698623c513c1cc01c9446c058a4e61e3bf6c19dff7335a2 \ + --hash=sha256:4a1aba3340a8dca8db6eb5a7986157f52eb9e436b74813764241981ca4888f03 \ + --hash=sha256:50f23cdd8343b984957e4077839841146f67a3d31ab0d00e6b824e74c5b2f6e8 \ + --hash=sha256:52a0fec0e6c8d9a784c2c78276a48a2bdf099e4ccc2a4cad53b27718dbfd0230 \ + --hash=sha256:52ac480f44d32970d66763115edea932f1c5b1312de36df06d6b219f6741eed8 \ + --hash=sha256:5569417dc80977fc8c2d43a86f78e0a5a22fee17565d78621b6bb264a115d4ea \ + --hash=sha256:569d01a4e7fba956c5ae4fc988f0d4e187900f5497ce46339c996dbf24f17641 \ + --hash=sha256:5a4b4cc550cb665dd8a47f868c8d04c8230f857363ad3c9caf7a0c3bf8c61ca6 \ + --hash=sha256:5f451484aeb5cafee1ccf789b1b66f535409d038c56966d6101740c1614b86c6 \ + --hash=sha256:5f6261a5e56e8d5c42a4497b364ea24d94d9563e8fbd44e78ac40879c60179b5 \ + --hash=sha256:6e5a82b677f8f6f59e8dfc34ec06ca6b5b48bc4fcda346acd093694cc2c24d8f \ + --hash=sha256:71c989cbf3254fbd5e84d3bff31e4da39c43f884e64f2551d14bb3c186230f00 \ + --hash=sha256:781caf5e8eee67f663126490c2f96f40906594cb86b408a703630f95550a8c3e \ + --hash=sha256:7be95cfb0a4dae143eaed2bcba8ac23f4892d8971311f1b06f3c6b78952ee70b \ + --hash=sha256:86890e837d61574c92a97496d590968b23c2ef0aeb8a9bc9421d174cd378ae39 \ + --hash=sha256:878b336ac47938b474c8f982ac2f7266a540adc3fa4ad74ae96fea9823a02cc9 \ + --hash=sha256:8b6e209ffee39ff1b6d0fa7bfef6de950c60dfb91b8fcead17da4ee539121a79 \ + --hash=sha256:8cc451a50f2aee53042ac52d2d053d08bf89bcb31ae799cb4487587661c038a0 \ + --hash=sha256:8d7f0659570eefb578dacde98e24fb60af35350193e4f56e11190787bee77dac \ + --hash=sha256:95724e638f0f9c350bb1c2b0a7ad0e83d9cc0c9259f3ea94e40d7b02a2179ae5 \ + --hash=sha256:9b5aca38b67492ef518a8ab76851862488a478602229112c4b0d58d63a7a4d5c \ + --hash=sha256:a069d734c4a043182729edd3e9f247c3b2a4035415a9172fd0f1b71658a320a8 \ + --hash=sha256:a0b31e0b424cc6b5a04b8838bbaec1688834b2383256688cf47eb97412531da1 \ + --hash=sha256:a35539cacc3febb22b8f4d4a99cc79b104226a756aa7400adc722e83b0d03244 \ + --hash=sha256:a5e18a238a2b2249c9a9235466b90e96ae4795672598a58772dd806edc7ac6d3 \ + --hash=sha256:a653aea902e0324b52f1613332ddf50b00c06fdaf7e92624fbf8c77c78fa5767 \ + 
--hash=sha256:abf050a199613f64c886ea10f38b47770a65154dc37181bfaff70c160f45315a \ + --hash=sha256:af80d74d4edfa3cb9ed973a0a5ba2b2a549371f8a741e0800cb07becdd20f23d \ + --hash=sha256:b14dc141ed6d2dde437cddb216004bcac6a1df0935d79656387bd41632ba0bbd \ + --hash=sha256:b784ca5de850f4ce93ec85d3269d24d4c82f22b7212023c974c401d4980ebc5e \ + --hash=sha256:bc59589ab64b0022385f429b94697348a6a234e8ce22544e3681b2e9331b5944 \ + --hash=sha256:c0204dc62a89dc9d50d682412c10b3542d748260d743500a85c13cd1ee4bde82 \ + --hash=sha256:c0ee0e63f23914732c6d7e0cce24915c48f3f1512ec1d079ed01fc629dab269d \ + --hash=sha256:caab51a72c51973ca21fa8a18bd8165e1a0183f1ac7066a182ff27107b71e1a4 \ + --hash=sha256:d6297ce39ce5c2e6feb13c1a996a2ded3b6832155fcfc920265c76f24c7cceb5 \ + --hash=sha256:daa3b6ff70a9241cf6c7fc9e949d41232d9d7d26fd3522b1ad2b4d62487e9904 \ + --hash=sha256:e0334872c0a37b606418ac52f6ab9cfd17317ac26365f7f65e203e2d0d0d359f \ + --hash=sha256:e6578ed5b6981005df1860a56e3617f14a6c307e6a71b4fff8c48fdc50f3ed2c \ + --hash=sha256:eaded469f5e5b7294e2bdca0ab06becb6756ea86894a47806456089298813c89 \ + --hash=sha256:f4a32d1bd841d4bcbffdcb3d2ce50c09c3909fbead375ab28d0181af89fd04da \ + --hash=sha256:fd3cb4adb94a2a6e2b7c0d8d05cb94e6f1c81a0cf9dc2694fb65c7e8d94c42e4 + # via yfinance +werkzeug==3.1.5 \ + --hash=sha256:5111e36e91086ece91f93268bb39b4a35c1e6f1feac762c9c822ded0a4e322dc \ + --hash=sha256:6a548b0e88955dd07ccb25539d7d0cc97417ee9e179677d22c7041c8f078ce67 + # via + # flask + # flask-cors +widgetsnbextension==4.0.15 \ + --hash=sha256:8156704e4346a571d9ce73b84bee86a29906c9abfd7223b7228a28899ccf3366 \ + --hash=sha256:de8610639996f1567952d763a5a41af8af37f2575a41f9852a38f947eb82a3b9 + # via ipywidgets +wrapt==2.1.0 \ + --hash=sha256:01559d2961c29edc6263849fd9d32b29a20737da67648c7fd752a67bd96208c7 \ + --hash=sha256:0b660be1c9cdfb4c711baab4ccbd0e9d1b65a0480d38729ec8cdbf3b29cb7f15 \ + --hash=sha256:0e9129d1b582c55ad0dfb9e29e221daa0e02b18c67d8642bc8d08dd7038b3aed \ + --hash=sha256:0fa64a9a07df7f85b352adc42b43e7f44085fb11191b8f5b9b77219f7aaf7e17 \ + --hash=sha256:0ff9797e6e0b82b330ef80b0cdba7fcd0ca056d4c7af2ca44e3d05fd47929ede \ + --hash=sha256:12687e6271df7ae5706bee44cc1f77fecb7805976ec9f14f58381b30ae2aceb5 \ + --hash=sha256:2893498fe898719ac8fb6b4fe36ca86892bec1e2480d94e3bd1bc592c00527ad \ + --hash=sha256:2ccc89cd504fc29c32f0b24046e8edf3ef0fcbc5d5efe8c91b303c099863d2c8 \ + --hash=sha256:2cd647097df1df78f027ac7d5d663f05daa1a117b69cf7f476cb299f90557747 \ + --hash=sha256:355779ff720c11a2a5cffd03332dbce1005cb4747dca65b0fc8cdd5f8bf1037e \ + --hash=sha256:38bbe336ee32f67eb99f886bd4f040d91310b7e660061bb03b9083d26e8cf915 \ + --hash=sha256:38de19e30e266c15d542ceb0603e657db4e82c53e7f47fd70674ae5da2b41180 \ + --hash=sha256:3e2e156fe2d41700b837be9b1d8d80ebab44e9891589bc7c41578ef110184e29 \ + --hash=sha256:46583aae3c807aa76f96355c4943031225785ed160c84052612bba0e9d456639 \ + --hash=sha256:4b0a29509ef7b501abe47b693a3c91d1f21c9a948711f6ce7afa81eb274c7eae \ + --hash=sha256:52bb58b3207ace156b6134235fd43140994597704fd07d148cbcfb474ee084ea \ + --hash=sha256:5509d9150ed01c4149e40020fa68e917d5c4bb77d311e79535565c2a0418afcb \ + --hash=sha256:57df799e67b011847ef7ac64b05ed4633e56b64e7e7cab5eb83dc9689dbe0acf \ + --hash=sha256:5bacf063143fa86f15b00a21259a81c95c527a18d504b8c820835366d361c879 \ + --hash=sha256:6653bf30dbbafd55cb4553195cc60b94920b6711a8835866c0e02aa9f22c5598 \ + --hash=sha256:66f588c8b3a44863156cfaccb516f946a64b3b03a6880822ab0b878135ca1f5c \ + --hash=sha256:7112cbf72fc4035afe1e3314a311654c41dd92c2932021ef76f5ca87583917b3 \ + 
--hash=sha256:737e1e491473047cb66944b8b8fd23f3f542019afd6cf0569d1356d18a7ea6d5 \ + --hash=sha256:73e742368b52f9cf0921e1d2bcb8a6a44ede2e372e33df6e77caa136a942099f \ + --hash=sha256:757ff1de7e1d8db1839846672aaecf4978af433cc57e808255b83980e9651914 \ + --hash=sha256:771ec962fe3ccb078177c9b8f3529e204ffcbb11d62d509e0a438e6a83f7ca68 \ + --hash=sha256:7a0471df3fb4e85a9ff62f7142cdb169e31172467cdb79a713f9b1319c555903 \ + --hash=sha256:7c06653908a23a85c4b2455b9d37c085f9756c09058df87b4a2fce2b2f8d58c2 \ + --hash=sha256:7f7bf95bae7ac5f2bbcb307464b3b0ff70569dd3b036a87b1cf7efb2c76e66e5 \ + --hash=sha256:875a10a6f3b667f90a39010af26acf684ba831d9b18a86b242899d57c74550fa \ + --hash=sha256:9b2da9c8f1723994b335dbf9f496fbfabc76bcdd001f73772b8eb2118a714cea \ + --hash=sha256:9e971000347f61271725e801ef44fa5d01b52720e59737f0d96280bffb98c5d1 \ + --hash=sha256:9f1e9bac6a6c1ba65e0ac50e32c575266734a07b6c17e718c4babd91e2faa69b \ + --hash=sha256:a64c0fb29c89810973f312a04c067b63523e7303b9a2653820cbf16474c2e5cf \ + --hash=sha256:a7b158558438874e5fd5cb505b5a635bd08c84857bc937973d9e12e1166cdf3b \ + --hash=sha256:ad3aa174d06a14b4758d5a1678b9adde8b8e657c6695de9a3d4c223f4fcbbcce \ + --hash=sha256:bc7d496b6e16bd2f77e37e8969b21a7b58d6954e46c6689986fb67b9078100e5 \ + --hash=sha256:be2f541a242818829526e5d08c716b6730970ed0dc1b76ba962a546947d0f005 \ + --hash=sha256:bffa584240d41bc3127510e07a752f94223d73bb1283ac2e99ac44235762efd2 \ + --hash=sha256:c0fc3e388a14ef8101c685dc80b4d2932924a639a03e5c44b5ffabbda2f1f2dc \ + --hash=sha256:c70b4829c6f2f4af4cdaa16442032fcaf882063304160555e4a19b43fd2c6c9d \ + --hash=sha256:c87cd4f61a3b7cd65113e74006e1cd6352b74807fcc65d440e8342f001f8de5e \ + --hash=sha256:cbc07f101f5f1e7c23ec06a07e45715f459de992108eeb381b21b76d94dbaf4f \ + --hash=sha256:cc9e37bfe67f6ea738851dd606640a87692ff81bcc76df313fb75d08e05e855f \ + --hash=sha256:ce0cf4c79c19904aaf2e822af280d7b3c23ad902f57e31c5a19433bc86e5d36d \ + --hash=sha256:d3dd4f8c2256fcde1a85037a1837afc52e8d32d086fd669ae469455fd9a988d6 \ + --hash=sha256:d61238a072501ed071a9f4b9567d10c2eb3d2f1a0258ae79b47160871d8f29c3 \ + --hash=sha256:d7fd4c4ee51ebdf245549d54a7c2181a4f39caac97c9dc8a050b5ba814067a29 \ + --hash=sha256:d877003dbc601e1365bd03f6a980965a20d585f90c056f33e1fc241b63a6f0e7 \ + --hash=sha256:da379cbdf3b7d97ace33a69a391b7a7e2130b1aca94dc447246217994233974c \ + --hash=sha256:e00f8559ceac0fb45091daad5f15d37f2c22bdc28ed71521d47ff01aad8fff3d \ + --hash=sha256:e035693a0d25ea5bf5826df3e203dff7d091b0d5442aaefec9ca8f2bab38417f \ + --hash=sha256:e3958ba70aef2895d8c62c2d31f51ced188f60451212294677b92f4b32c12978 \ + --hash=sha256:e45f54903da38fc4f6f66397fd550fc0dac6164b4c5e721c1b4eb05664181821 \ + --hash=sha256:e90656b433808a0ab68e95aaf9f588aea5c8c7a514e180849dfc638ba00ec449 \ + --hash=sha256:eabe95ea5fbe1524a53c0f3fc535c99f2aa376ec1451b0b79d943d2240d80e36 + # via deprecated +yarl==1.22.0 \ + --hash=sha256:01e73b85a5434f89fc4fe27dcda2aff08ddf35e4d47bbbea3bdcd25321af538a \ + --hash=sha256:078278b9b0b11568937d9509b589ee83ef98ed6d561dfe2020e24a9fd08eaa2b \ + --hash=sha256:078a8aefd263f4d4f923a9677b942b445a2be970ca24548a8102689a3a8ab8da \ + --hash=sha256:0b5bcc1a9c4839e7e30b7b30dd47fe5e7e44fb7054ec29b5bb8d526aa1041093 \ + --hash=sha256:0d6e6885777af0f110b0e5d7e5dda8b704efed3894da26220b7f3d887b839a79 \ + --hash=sha256:0dd9a702591ca2e543631c2a017e4a547e38a5c0f29eece37d9097e04a7ac683 \ + --hash=sha256:131a085a53bfe839a477c0845acf21efc77457ba2bcf5899618136d64f3303a2 \ + --hash=sha256:1380560bdba02b6b6c90de54133c81c9f2a453dee9912fe58c1dcced1edb7cff \ + 
--hash=sha256:139718f35149ff544caba20fce6e8a2f71f1e39b92c700d8438a0b1d2a631a02 \ + --hash=sha256:1834bb90991cc2999f10f97f5f01317f99b143284766d197e43cd5b45eb18d03 \ + --hash=sha256:1ab72135b1f2db3fed3997d7e7dc1b80573c67138023852b6efb336a5eae6511 \ + --hash=sha256:1e7ce67c34138a058fd092f67d07a72b8e31ff0c9236e751957465a24b28910c \ + --hash=sha256:1e8fbaa7cec507aa24ea27a01456e8dd4b6fab829059b69844bd348f2d467124 \ + --hash=sha256:22965c2af250d20c873cdbee8ff958fb809940aeb2e74ba5f20aaf6b7ac8c70c \ + --hash=sha256:22b029f2881599e2f1b06f8f1db2ee63bd309e2293ba2d566e008ba12778b8da \ + --hash=sha256:243dda95d901c733f5b59214d28b0120893d91777cb8aa043e6ef059d3cddfe2 \ + --hash=sha256:2ca6fd72a8cd803be290d42f2dec5cdcd5299eeb93c2d929bf060ad9efaf5de0 \ + --hash=sha256:31f0b53913220599446872d757257be5898019c85e7971599065bc55065dc99d \ + --hash=sha256:334b8721303e61b00019474cc103bdac3d7b1f65e91f0bfedeec2d56dfe74b53 \ + --hash=sha256:33e32a0dd0c8205efa8e83d04fc9f19313772b78522d1bdc7d9aed706bfd6138 \ + --hash=sha256:34b36c2c57124530884d89d50ed2c1478697ad7473efd59cfd479945c95650e4 \ + --hash=sha256:3b06bcadaac49c70f4c88af4ffcfbe3dc155aab3163e75777818092478bcbbe7 \ + --hash=sha256:3b7c88eeef021579d600e50363e0b6ee4f7f6f728cd3486b9d0f3ee7b946398d \ + --hash=sha256:3e2daa88dc91870215961e96a039ec73e4937da13cf77ce17f9cad0c18df3503 \ + --hash=sha256:3ea66b1c11c9150f1372f69afb6b8116f2dd7286f38e14ea71a44eee9ec51b9d \ + --hash=sha256:42188e6a615c1a75bcaa6e150c3fe8f3e8680471a6b10150c5f7e83f47cc34d2 \ + --hash=sha256:433885ab5431bc3d3d4f2f9bd15bfa1614c522b0f1405d62c4f926ccd69d04fa \ + --hash=sha256:4398557cbf484207df000309235979c79c4356518fd5c99158c7d38203c4da4f \ + --hash=sha256:45c2842ff0e0d1b35a6bf1cd6c690939dacb617a70827f715232b2e0494d55d1 \ + --hash=sha256:47743b82b76d89a1d20b83e60d5c20314cbd5ba2befc9cda8f28300c4a08ed4d \ + --hash=sha256:4792b262d585ff0dff6bcb787f8492e40698443ec982a3568c2096433660c694 \ + --hash=sha256:47d8a5c446df1c4db9d21b49619ffdba90e77c89ec6e283f453856c74b50b9e3 \ + --hash=sha256:47fdb18187e2a4e18fda2c25c05d8251a9e4a521edaed757fef033e7d8498d9a \ + --hash=sha256:4c52a6e78aef5cf47a98ef8e934755abf53953379b7d53e68b15ff4420e6683d \ + --hash=sha256:50678a3b71c751d58d7908edc96d332af328839eea883bb554a43f539101277a \ + --hash=sha256:51af598701f5299012b8416486b40fceef8c26fc87dc6d7d1f6fc30609ea0aa6 \ + --hash=sha256:594fcab1032e2d2cc3321bb2e51271e7cd2b516c7d9aee780ece81b07ff8244b \ + --hash=sha256:59c189e3e99a59cf8d83cbb31d4db02d66cda5a1a4374e8a012b51255341abf5 \ + --hash=sha256:5a3bf7f62a289fa90f1990422dc8dff5a458469ea71d1624585ec3a4c8d6960f \ + --hash=sha256:5c401e05ad47a75869c3ab3e35137f8468b846770587e70d71e11de797d113df \ + --hash=sha256:5d0fcda9608875f7d052eff120c7a5da474a6796fe4d83e152e0e4d42f6d1a9b \ + --hash=sha256:669930400e375570189492dc8d8341301578e8493aec04aebc20d4717f899dd6 \ + --hash=sha256:68986a61557d37bb90d3051a45b91fa3d5c516d177dfc6dd6f2f436a07ff2b6b \ + --hash=sha256:6944b2dc72c4d7f7052683487e3677456050ff77fcf5e6204e98caf785ad1967 \ + --hash=sha256:6a635ea45ba4ea8238463b4f7d0e721bad669f80878b7bfd1f89266e2ae63da2 \ + --hash=sha256:6c5010a52015e7c70f86eb967db0f37f3c8bd503a695a49f8d45700144667708 \ + --hash=sha256:70dfd4f241c04bd9239d53b17f11e6ab672b9f1420364af63e8531198e3f5fe8 \ + --hash=sha256:719ae08b6972befcba4310e49edb1161a88cdd331e3a694b84466bd938a6ab10 \ + --hash=sha256:7861058d0582b847bc4e3a4a4c46828a410bca738673f35a29ba3ca5db0b473b \ + --hash=sha256:792a2af6d58177ef7c19cbf0097aba92ca1b9cb3ffdd9c7470e156c8f9b5e028 \ + 
--hash=sha256:8009b3173bcd637be650922ac455946197d858b3630b6d8787aa9e5c4564533e \ + --hash=sha256:8218f4e98d3c10d683584cb40f0424f4b9fd6e95610232dd75e13743b070ee33 \ + --hash=sha256:852863707010316c973162e703bddabec35e8757e67fcb8ad58829de1ebc8590 \ + --hash=sha256:8884d8b332a5e9b88e23f60bb166890009429391864c685e17bd73a9eda9105c \ + --hash=sha256:8dee9c25c74997f6a750cd317b8ca63545169c098faee42c84aa5e506c819b53 \ + --hash=sha256:939fe60db294c786f6b7c2d2e121576628468f65453d86b0fe36cb52f987bd74 \ + --hash=sha256:9d7672ecf7557476642c88497c2f8d8542f8e36596e928e9bcba0e42e1e7d71f \ + --hash=sha256:9f6d73c1436b934e3f01df1e1b21ff765cd1d28c77dfb9ace207f746d4610ee1 \ + --hash=sha256:9fb17ea16e972c63d25d4a97f016d235c78dd2344820eb35bc034bc32012ee27 \ + --hash=sha256:a49370e8f711daec68d09b821a34e1167792ee2d24d405cbc2387be4f158b520 \ + --hash=sha256:a9b1ba5610a4e20f655258d5a1fdc7ebe3d837bb0e45b581398b99eb98b1f5ca \ + --hash=sha256:b0748275abb8c1e1e09301ee3cf90c8a99678a4e92e4373705f2a2570d581273 \ + --hash=sha256:b266bd01fedeffeeac01a79ae181719ff848a5a13ce10075adbefc8f1daee70e \ + --hash=sha256:b4f15793aa49793ec8d1c708ab7f9eded1aa72edc5174cae703651555ed1b601 \ + --hash=sha256:b6a6f620cfe13ccec221fa312139135166e47ae169f8253f72a0abc0dae94376 \ + --hash=sha256:b790b39c7e9a4192dc2e201a282109ed2985a1ddbd5ac08dc56d0e121400a8f7 \ + --hash=sha256:b8a0588521a26bf92a57a1705b77b8b59044cdceccac7151bd8d229e66b8dedb \ + --hash=sha256:ba440ae430c00eee41509353628600212112cd5018d5def7e9b05ea7ac34eb65 \ + --hash=sha256:bca03b91c323036913993ff5c738d0842fc9c60c4648e5c8d98331526df89784 \ + --hash=sha256:bebf8557577d4401ba8bd9ff33906f1376c877aa78d1fe216ad01b4d6745af71 \ + --hash=sha256:bec03d0d388060058f5d291a813f21c011041938a441c593374da6077fe21b1b \ + --hash=sha256:bf4a21e58b9cde0e401e683ebd00f6ed30a06d14e93f7c8fd059f8b6e8f87b6a \ + --hash=sha256:c0232bce2170103ec23c454e54a57008a9a72b5d1c3105dc2496750da8cfa47c \ + --hash=sha256:c4647674b6150d2cae088fc07de2738a84b8bcedebef29802cf0b0a82ab6face \ + --hash=sha256:c7044802eec4524fde550afc28edda0dd5784c4c45f0be151a2d3ba017daca7d \ + --hash=sha256:ca1f59c4e1ab6e72f0a23c13fca5430f889634166be85dbf1013683e49e3278e \ + --hash=sha256:cfebc0ac8333520d2d0423cbbe43ae43c8838862ddb898f5ca68565e395516e9 \ + --hash=sha256:d3e32536234a95f513bd374e93d717cf6b2231a791758de6c509e3653f234c95 \ + --hash=sha256:d5372ca1df0f91a86b047d1277c2aaf1edb32d78bbcefffc81b40ffd18f027ed \ + --hash=sha256:d77e1b2c6d04711478cb1c4ab90db07f1609ccf06a287d5607fcd90dc9863acf \ + --hash=sha256:d947071e6ebcf2e2bee8fce76e10faca8f7a14808ca36a910263acaacef08eca \ + --hash=sha256:de6b9a04c606978fdfe72666fa216ffcf2d1a9f6a381058d4378f8d7b1e5de62 \ + --hash=sha256:e1651bf8e0398574646744c1885a41198eba53dc8a9312b954073f845c90a8df \ + --hash=sha256:e1b51bebd221006d3d2f95fbe124b22b247136647ae5dcc8c7acafba66e5ee67 \ + --hash=sha256:e340382d1afa5d32b892b3ff062436d592ec3d692aeea3bef3a5cfe11bbf8c6f \ + --hash=sha256:e4b582bab49ac33c8deb97e058cd67c2c50dac0dd134874106d9c774fd272529 \ + --hash=sha256:e51ac5435758ba97ad69617e13233da53908beccc6cfcd6c34bbed8dcbede486 \ + --hash=sha256:e5542339dcf2747135c5c85f68680353d5cb9ffd741c0f2e8d832d054d41f35a \ + --hash=sha256:e6438cc8f23a9c1478633d216b16104a586b9761db62bfacb6425bac0a36679e \ + --hash=sha256:ea70f61a47f3cc93bdf8b2f368ed359ef02a01ca6393916bc8ff877427181e74 \ + --hash=sha256:ebd4549b108d732dba1d4ace67614b9545b21ece30937a63a65dd34efa19732d \ + --hash=sha256:efb07073be061c8f79d03d04139a80ba33cbd390ca8f0297aae9cce6411e4c6b \ + 
--hash=sha256:f1e09112a2c31ffe8d80be1b0988fa6a18c5d5cad92a9ffbb1c04c91bfe52ad2 \ + --hash=sha256:f3d7a87a78d46a2e3d5b72587ac14b4c16952dd0887dbb051451eceac774411e \ + --hash=sha256:f4afb5c34f2c6fecdcc182dfcfc6af6cccf1aa923eed4d6a12e9d96904e1a0d8 \ + --hash=sha256:f6d2cb59377d99718913ad9a151030d6f83ef420a2b8f521d94609ecc106ee82 \ + --hash=sha256:f87ac53513d22240c7d59203f25cc3beac1e574c6cd681bbfd321987b69f95fd \ + --hash=sha256:ff86011bd159a9d2dfc89c34cfd8aff12875980e3bd6a39ff097887520e60249 + # via aiohttp +yfinance==1.1.0 \ + --hash=sha256:1e852ce10a5d6679200efa2b09ed3f3b01dbe84505e2c1c139be7d2b3a597b10 \ + --hash=sha256:e610fec1d2b052e3b8f2cf44bdcff014bcf15458a9b072d5a3e02507e20d69d2 + # via data-formulator +zipp==3.23.0 \ + --hash=sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e \ + --hash=sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166 + # via importlib-metadata diff --git a/src/app/App.tsx b/src/app/App.tsx index 209f3ac5..78155545 100644 --- a/src/app/App.tsx +++ b/src/app/App.tsx @@ -10,8 +10,8 @@ import { dfActions, dfSelectors, fetchAvailableModels, - getSessionId, } from './dfSlice' +import { getBrowserId } from './identity'; import { red, purple, blue, brown, yellow, orange, } from '@mui/material/colors'; @@ -60,6 +60,7 @@ import { } from "react-router-dom"; import { About } from '../views/About'; import { MessageSnackbar } from '../views/MessageSnackbar'; +import { ChartRenderService } from '../views/ChartRenderService'; import { DictTable } from '../components/ComponentType'; import { AppDispatch } from './store'; import dfLogo from '../assets/df-logo.png'; @@ -68,10 +69,9 @@ import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown'; import UploadFileIcon from '@mui/icons-material/UploadFile'; import DownloadIcon from '@mui/icons-material/Download'; import { handleDBDownload } from '../views/DBTableManager'; -import { getUrls } from './utils'; +import { getUrls, fetchWithIdentity } from './utils'; import { UnifiedDataUploadDialog } from '../views/UnifiedDataUploadDialog'; import ChatIcon from '@mui/icons-material/Chat'; -import { AgentRulesDialog } from '../views/AgentRulesDialog'; import ArticleIcon from '@mui/icons-material/Article'; import EditIcon from '@mui/icons-material/Edit'; import ArrowBackIcon from '@mui/icons-material/ArrowBack'; @@ -100,6 +100,12 @@ const AppBar = styled(MuiAppBar)(({ theme }) => ({ })); declare module '@mui/material/styles' { + interface PaletteColor { + bgcolor?: string; + } + interface SimplePaletteColorOptions { + bgcolor?: string; + } interface Palette { derived: Palette['primary']; custom: Palette['primary']; @@ -159,7 +165,7 @@ export const ImportStateButton: React.FC<{}> = ({ }) => { } export const ExportStateButton: React.FC<{}> = ({ }) => { - const sessionId = useSelector((state: DataFormulatorState) => state.sessionId); + const identity = useSelector((state: DataFormulatorState) => state.identity); const tables = useSelector((state: DataFormulatorState) => state.tables); const fullStateJson = useSelector((state: DataFormulatorState) => { // Fields to exclude from serialization @@ -168,7 +174,7 @@ export const ExportStateButton: React.FC<{}> = ({ }) => { 'selectedModelId', 'testedModels', 'dataLoaderConnectParams', - 'sessionId', + 'identity', 'agentRules', 'serverConfig', ]); @@ -197,7 +203,7 @@ export const ExportStateButton: React.FC<{}> = ({ }) => { a.click(); } let firstTableName = tables.length > 0 ? 
tables[0].id: ''; - download(fullStateJson, `df_state_${firstTableName}_${sessionId?.slice(0, 4)}.json`, 'text/plain'); + download(fullStateJson, `df_state_${firstTableName}_${identity.id.slice(0, 4)}.json`, 'text/plain'); }} startIcon={} > @@ -241,7 +247,7 @@ const TableMenu: React.FC = () => { const SessionMenu: React.FC = () => { const [anchorEl, setAnchorEl] = useState(null); const open = Boolean(anchorEl); - const sessionId = useSelector((state: DataFormulatorState) => state.sessionId); + const identity = useSelector((state: DataFormulatorState) => state.identity); const tables = useSelector((state: DataFormulatorState) => state.tables); const theme = useTheme(); @@ -274,12 +280,12 @@ const SessionMenu: React.FC = () => { database file - {sessionId && tables.some(t => t.virtual) && + {tables.some(t => t.virtual) && This session contains data stored in the database, export and reload the database to resume the session later. } - t.virtual)} onClick={() => { - handleDBDownload(sessionId ?? ''); + t.virtual)} onClick={() => { + handleDBDownload(identity.id); }}> {}}> - - - ) : ( + let actionButtons = ( - - - ); - } - let tableSelectionPanel = - {/* Recent Data Loaders */} - - - External Data Loaders - - ( + - ); - })} - - {/* Derived Views Section */} - {dbTables.filter(t => t.view_source !== null).length > 0 && ( - - - - - t.view_source !== null && t.view_source !== undefined && !tables.some(t2 => t2.id === t.name)).length === 0} - sx={{ - padding: 0.5, - mr: 0.5, - '&:hover': { - backgroundColor: alpha(theme.palette.primary.main, 0.08), - } - }} - > - - - - - - - {dbTables.filter(t => t.view_source !== null).map((t, i) => { - return ( - - ); - })} - - - - )} + {dataLoaderType} + + ))} - let dataConnectorView = + let dataConnectorView = - - {/* File upload */} - {selectedDataLoader === 'file upload' && ( - - {uploadFileButton({isUploading ? 'uploading...' : 'upload a csv/tsv file to the local database'})} + {/* Empty state when no loader selected */} + {selectedDataLoader === '' && ( + + + Select a data loader from the left panel + )} {/* Data loader forms */} {dataLoaderMetadata && Object.entries(dataLoaderMetadata).map(([dataLoaderType, metadata]) => ( selectedDataLoader === dataLoaderType && ( - + { setIsUploading(false); - fetchTables().then(() => { - // Switch back to tables view after import - setSelectedDataLoader(""); - // Navigate to the first imported table after tables are fetched - if (status === "success" && importedTables && importedTables.length > 0) { - setSelectedTabKey(importedTables[0]); - } - }); - if (status === "error") { + if (status === "success") { + onClose?.(); + } else { setSystemMessage(message, "error"); } }} @@ -890,172 +339,11 @@ export const DBManagerPane: React.FC<{ ))} ; - let tableView = - {/* Empty state */} - {selectedTabKey === '' && ( - - The database is empty, refresh the table list or import some data to get started. - - )} - - {/* Table content */} - {dbTables.map((t, i) => { - if (selectedTabKey !== t.name) return null; - - const currentTable = t; - - return ( - - - - {currentTable.view_source ? : } - - {currentTable.name} - - - {currentTable.source_metadata && `imported from ${currentTable.source_metadata.data_loader_type}.${currentTable.source_metadata.source_table_name}`} - - - - handleDropTable(currentTable.name)} - title="Drop Table" - > - - - - - - - { - return Object.fromEntries( - currentTable.columns.map((col) => [col.name, String(row[col.name] ?? 
'')]) - ); - })} - columnDefs={currentTable.columns.map((col) => ({ - id: col.name, - label: col.name, - minWidth: 80 - }))} - rowsPerPageNum={-1} - compact={false} - maxCellWidth={80} - isIncompleteTable={currentTable.row_count > 10} - maxHeight={340} - /> - - {currentTable.row_count > 10 && ( - - - Showing first 9 rows of {currentTable.row_count} total rows - - - )} - - {tables.some(t => t.id === currentTable.name) ? ( - - - - Loaded - - - ) : ( - - {/* Watch settings - only show for tables that can be refreshed */} - {currentTable.source_metadata && ( - - - setWatchEnabled(e.target.checked)} - size="small" - /> - } - label={ - - Watch Mode - - } - /> - {watchEnabled ? ( - - - check for updates every - - {[ - { seconds: 10, label: '10s' }, - { seconds: 30, label: '30s' }, - { seconds: 60, label: '1m' }, - { seconds: 300, label: '5m' }, - { seconds: 600, label: '10m' }, - { seconds: 1800, label: '30m' }, - { seconds: 3600, label: '1h' }, - { seconds: 86400, label: '24h' }, - ].map((opt) => ( - setWatchInterval(opt.seconds)} - sx={{ - cursor: 'pointer', - fontSize: '0.7rem', - height: 24, - }} - /> - ))} - - ) : - automatically check and refresh data from the database at regular intervals - } - - - )} - - - )} - - ); - })} - ; - let mainContent = {/* Button navigation - similar to TableSelectionView */} - + {/* Available Tables Section - always visible */} {tableSelectionPanel} - {/* Reset Confirmation Popover */} - setResetAnchorEl(null)} - anchorOrigin={{ - vertical: 'bottom', - horizontal: 'left', - }} - transformOrigin={{ - vertical: 'top', - horizontal: 'left', - }} - > - - - Reset backend database and delete all tables? This cannot be undone. - - - - - - - - {/* Content area - show connector view if a connector is selected, otherwise show table view */} + {/* Content area - show selected data loader form */} - {selectedDataLoader !== "" ? dataConnectorView : tableView} + {dataConnectorView} @@ -1153,9 +400,10 @@ export const DataLoaderForm: React.FC<{ onFinish: (status: "success" | "error", message: string, importedTables?: string[]) => void }> = ({dataLoaderType, paramDefs, authInstructions, onImport, onFinish}) => { - const dispatch = useDispatch(); + const dispatch = useDispatch(); const theme = useTheme(); const params = useSelector((state: DataFormulatorState) => state.dataLoaderConnectParams[dataLoaderType] ?? {}); + const frontendRowLimit = useSelector((state: DataFormulatorState) => state.config?.frontendRowLimit ?? 10000); const [tableMetadata, setTableMetadata] = useState>({}); let [displaySamples, setDisplaySamples] = useState>({}); @@ -1163,6 +411,9 @@ export const DataLoaderForm: React.FC<{ const [tableImportConfigs, setTableImportConfigs] = useState>({}); const [subsetConfigAnchor, setSubsetConfigAnchor] = useState<{element: HTMLElement, tableName: string} | null>(null); + // Store on server toggle for data loader imports + const [importStoreOnServer, setImportStoreOnServer] = useState(true); + // Helper to get import config for a table (defaults to 'none') const getTableConfig = (tableName: string): TableImportConfig => { return tableImportConfigs[tableName] ?? { mode: 'none' }; @@ -1185,7 +436,7 @@ export const DataLoaderForm: React.FC<{ } let tableMetadataBox = [ - + @@ -1492,69 +743,95 @@ export const DataLoaderForm: React.FC<{ ); })()} , - Object.keys(tableMetadata).length > 0 && + Object.keys(tableMetadata).length > 0 && + + setImportStoreOnServer(e.target.checked)} + size="small" + /> + } + label={ + + {importStoreOnServer ? 
'Store on server' : `Local only (≤${frontendRowLimit.toLocaleString()} rows)`} + + } + /> + ] @@ -1562,7 +839,7 @@ export const DataLoaderForm: React.FC<{ const isConnected = Object.keys(tableMetadata).length > 0; return ( - + Import tables from {dataLoaderType} @@ -1575,111 +852,94 @@ export const DataLoaderForm: React.FC<{ } {isConnected ? ( // Connected state: show connection parameters and disconnect button - - - - - {paramDefs.filter((paramDef) => params[paramDef.name]).map((paramDef, index) => ( - - - {paramDef.name}: - - - {params[paramDef.name] || '(empty)'} - - {index < paramDefs.filter((paramDef) => params[paramDef.name]).length - 1 && ( - - • - - )} - - ))} - - - - - - - - table filter + + + + {paramDefs.filter((paramDef) => params[paramDef.name]).map((paramDef, index) => ( + + + {paramDef.name}: - + + {params[paramDef.name] || '(empty)'} + + {index < paramDefs.filter((p) => params[p.name]).length - 1 && ( + · + )} + + ))} + + + + + table filter + setTableFilter(event.target.value)} /> - - - - + + @@ -1688,33 +948,40 @@ export const DataLoaderForm: React.FC<{ ) : ( // Not connected: show connection forms <> - + {paramDefs.map((paramDef) => ( - + {paramDef.name} - {paramDef.required && *} + {paramDef.required && *} { dispatch(dfActions.updateDataLoaderConnectParam({ dataLoaderType, paramName: paramDef.name, @@ -1723,35 +990,26 @@ export const DataLoaderForm: React.FC<{ /> ))} - - - - - table filter - - + + + + + table filter + setTableFilter(event.target.value)} /> @@ -1760,10 +1018,11 @@ export const DataLoaderForm: React.FC<{ } - - - - - {authInstructions.trim()} - - + {authInstructions.trim() && ( + + {authInstructions.trim()} + + )} )} ); diff --git a/src/views/DataFormulator.tsx b/src/views/DataFormulator.tsx index f9cea9d5..714077ef 100644 --- a/src/views/DataFormulator.tsx +++ b/src/views/DataFormulator.tsx @@ -26,6 +26,7 @@ import { useTheme, alpha, } from '@mui/material'; +import { borderColor, shadow, radius } from '../app/tokens'; import { FreeDataViewFC } from './DataView'; import { VisualizationViewFC } from './VisualizationView'; @@ -35,11 +36,12 @@ import { DndProvider } from 'react-dnd' import { HTML5Backend } from 'react-dnd-html5-backend' import { toolName } from '../app/App'; import { DataThread } from './DataThread'; +import { ChartRecBox } from './ChartRecBox'; import dfLogo from '../assets/df-logo.png'; import exampleImageTable from "../assets/example-image-table.png"; import { ModelSelectionButton } from './ModelSelectionDialog'; -import { getUrls } from '../app/utils'; +import { getUrls, fetchWithIdentity } from '../app/utils'; import { UnifiedDataUploadDialog, UploadTabType, DataLoadMenu } from './UnifiedDataUploadDialog'; import { ReportView } from './ReportView'; import { ExampleSession, exampleSessions, ExampleSessionCard } from './ExampleSessions'; @@ -48,6 +50,7 @@ import { useDataRefresh, useDerivedTableRefresh } from '../app/useDataRefresh'; export const DataFormulatorFC = ({ }) => { const tables = useSelector((state: DataFormulatorState) => state.tables); + const focusedTableId = useSelector((state: DataFormulatorState) => state.focusedTableId); const models = useSelector((state: DataFormulatorState) => state.models); const selectedModelId = useSelector((state: DataFormulatorState) => state.selectedModelId); const viewMode = useSelector((state: DataFormulatorState) => state.viewMode); @@ -146,7 +149,7 @@ export const DataFormulatorFC = ({ }) => { body: JSON.stringify({ model }), }; try { - const response = await fetch(getUrls().TEST_MODEL, 
{...message }); + const response = await fetchWithIdentity(getUrls().TEST_MODEL, {...message }); const data = await response.json(); const status = data["status"] || 'error'; return {model, status, message: data["message"] || ""}; @@ -199,8 +202,8 @@ export const DataFormulatorFC = ({ }) => { ); let borderBoxStyle = { - border: '1px solid rgba(0,0,0,0.1)', - borderRadius: '16px', + border: `1px solid ${borderColor.view}`, + borderRadius: radius.pill, //boxShadow: '0 0 5px rgba(0,0,0,0.1)', } @@ -209,7 +212,8 @@ export const DataFormulatorFC = ({ }) => { + display: 'flex', height: '100%', width: 'fit-content', flexDirection: 'column', + position: 'relative'}}> {tables.length > 0 ? { overflow: 'hidden', alignContent: 'flex-start', height: '100%', - }}/> : ""} + }}/> : ""} + {/* Floating chat chip for exploration — overlays bottom of DataThread + {tables.length > 0 && ( + + + + )} */} ): strin return uniqueName; }; -export const DataLoadingChat: React.FC = () => { +export const DataLoadingChat: React.FC<{storeOnServer?: boolean}> = ({storeOnServer = true}) => { const theme = useTheme(); const dispatch = useDispatch(); const inputBoxRef = useRef<(() => void) | null>(null); @@ -50,13 +52,13 @@ export const DataLoadingChat: React.FC = () => { let threadsComponent = dataCleanBlocksThread.map((thread, i) => { return }) @@ -82,8 +84,8 @@ export const DataLoadingChat: React.FC = () => { const unique = getUniqueTableName(base, existingNames); const table = createTableFromText(unique, selectedTable.content.value, selectedTable.context); if (table) { - dispatch(dfActions.loadTable(table)); - dispatch(fetchFieldSemanticType(table)); + const tableWithSource = { ...table, source: { type: 'extract' as const } }; + dispatch(loadTable({ table: tableWithSource, storeOnServer })); } }; @@ -189,8 +191,7 @@ export const DataLoadingChat: React.FC = () => { maxWidth: 240, overflow: 'hidden', height: '100%', - borderRight: '1px solid', - borderColor: 'divider' + borderRight: `1px solid ${borderColor.view}` }}> { )} { flexDirection: 'row', alignItems: 'center', gap: 1, - borderTop: '1px solid', - borderColor: 'divider', + borderTop: `1px solid ${borderColor.divider}`, '& .MuiButton-root': { textTransform: 'none' } }}> + + + + + ); -}; +}); -// Compact view for thread0 - displays table cards with charts in a simple grid -// Reuses SingleThreadGroupView with compact mode -let CompactThread0View: FC<{ - scrollRef: any, - leafTables: DictTable[]; - chartElements: { tableId: string, chartId: string, element: any }[]; - sx?: SxProps -}> = function ({ - scrollRef, - leafTables, - chartElements, - sx -}) { +const WorkspacePanel: FC<{ + tables: DictTable[], + chartElements: { tableId: string, chartId: string, element: any }[], + sx?: SxProps, +}> = function ({ tables, chartElements, sx }) { const theme = useTheme(); - + const dispatch = useDispatch(); + const focusedTableId = useSelector((state: DataFormulatorState) => state.focusedTableId); + const focusedChartId = useSelector((state: DataFormulatorState) => state.focusedChartId); + const [uploadDialogOpen, setUploadDialogOpen] = useState(false); + + const fileItemSx = (isActive: boolean) => ({ + display: 'flex', + alignItems: 'center', + gap: 0.75, + px: 1, + py: '3px', + borderRadius: '4px', + cursor: 'pointer', + fontSize: 11, + transition: transition.fast, + backgroundColor: isActive ? alpha(theme.palette.primary.main, 0.08) : 'transparent', + '&:hover': { + backgroundColor: isActive ? 
alpha(theme.palette.primary.main, 0.12) : 'rgba(0,0,0,0.04)', + }, + }); + + const getTableIcon = (table: DictTable) => { + const isStreaming = (table.source?.type === 'stream' || table.source?.type === 'database') && table.source?.autoRefresh; + const iconSx = { fontSize: 14, color: 'text.secondary', flexShrink: 0 }; + if (isStreaming) return ; + if (table.virtual) return ; + return ; + }; + return ( - - - - - workspace + + + workspace + + + + {tables.map((table) => { + const isActive = focusedTableId === table.id; + const tableCharts = chartElements.filter(ce => ce.tableId === table.id); + const handleTableClick = () => { + dispatch(dfActions.setFocusedTable(table.id)); + if (tableCharts.length === 0) { + // No charts yet — create a placeholder to enter chart creation view + dispatch(dfActions.setFocusedChart(null)); + } else { + // Has charts — focus the first one if not already viewing one from this table + const alreadyFocused = tableCharts.some(ce => ce.chartId === focusedChartId); + if (!alreadyFocused) { + dispatch(dfActions.setFocusedChart(tableCharts[0].chartId)); + } + } + }; + return ( + + {getTableIcon(table)} + + {table.displayId || table.id} + + {table.attachedMetadata && ( + + )} + + ); + })} + setUploadDialogOpen(true)} + > + + + add data... - - - - + + + setUploadDialogOpen(false)} + initialTab="menu" + /> ); -} +}; let SingleThreadGroupView: FC<{ scrollRef: any, threadIdx: number, + threadLabel?: string, // Custom label like "thread 1.1" for split sub-threads + isSplitThread?: boolean, // When true, truncate used tables to immediate parent + "..." + hideLabel?: boolean, // When true, hide the thread label divider leafTables: DictTable[]; chartElements: { tableId: string, chartId: string, element: any }[]; usedIntermediateTableIds: string[], - compact?: boolean, // When true, only show table cards in a simple column (for thread0) + globalHighlightedTableIds: string[], + focusedThreadLeafId?: string, // The leaf table ID of the thread containing the focused table sx?: SxProps }> = function ({ scrollRef, threadIdx, + threadLabel, + isSplitThread = false, + hideLabel = false, leafTables, chartElements, usedIntermediateTableIds, // tables that have been used - compact = false, + globalHighlightedTableIds, + focusedThreadLeafId, sx }) { let tables = useSelector((state: DataFormulatorState) => state.tables); const { manualRefresh } = useDataRefresh(); + const tableById = useMemo(() => new Map(tables.map(t => [t.id, t])), [tables]); let leafTableIds = leafTables.map(lt => lt.id); + // Thread is highlighted only if this thread's leaf tables include the focused thread's leaf + const threadHighlighted = focusedThreadLeafId + ? leafTableIds.includes(focusedThreadLeafId) + : false; + // Ancestor thread: not the focused thread, but *owns* some highlighted tables + // (tables that only appear as used/shared references don't count) + const isAncestorThread = !threadHighlighted && globalHighlightedTableIds.length > 0 + && leafTables.some(lt => { + const trigs = getTriggers(lt, tables); + const chainIds = [...trigs.map(t => t.tableId), lt.id]; + const ownedIds = chainIds.filter(id => !usedIntermediateTableIds.includes(id)); + return ownedIds.some(id => globalHighlightedTableIds.includes(id)); + }); + const shouldHighlightThread = threadHighlighted || isAncestorThread; let parentTableId = leafTables[0].derive?.trigger.tableId || undefined; - let parentTable = tables.find(t => t.id == parentTableId) as DictTable; + let parentTable = (parentTableId ? 
tableById.get(parentTableId) : undefined) as DictTable; let charts = useSelector(dfSelectors.getAllCharts); let focusedChartId = useSelector((state: DataFormulatorState) => state.focusedChartId); let focusedTableId = useSelector((state: DataFormulatorState) => state.focusedTableId); let agentActions = useSelector((state: DataFormulatorState) => state.agentActions); + // Pre-index running agent table IDs for O(1) lookup + const runningAgentTableIds = useMemo(() => { + const ids = new Set(); + for (const a of agentActions) { + if (!a.hidden && a.status === 'running') ids.add(a.tableId); + } + return ids; + }, [agentActions]); + // Metadata popup state const [metadataPopupOpen, setMetadataPopupOpen] = useState(false); const [selectedTableForMetadata, setSelectedTableForMetadata] = useState(null); @@ -739,6 +622,29 @@ let SingleThreadGroupView: FC<{ })); } + // Rename popup state + const [renamePopupOpen, setRenamePopupOpen] = useState(false); + const [selectedTableForRename, setSelectedTableForRename] = useState(null); + const [renameAnchorEl, setRenameAnchorEl] = useState(null); + + const handleOpenRenamePopup = (table: DictTable, anchorEl: HTMLElement) => { + setSelectedTableForRename(table); + setRenameAnchorEl(anchorEl); + setRenamePopupOpen(true); + }; + + const handleCloseRenamePopup = () => { + setRenamePopupOpen(false); + setSelectedTableForRename(null); + setRenameAnchorEl(null); + }; + + const handleSaveRename = (newName: string) => { + if (selectedTableForRename) { + handleUpdateTableDisplayId(selectedTableForRename.id, newName); + } + }; + const handleOpenMetadataPopup = (table: DictTable, anchorEl: HTMLElement) => { setSelectedTableForMetadata(table); setMetadataAnchorEl(anchorEl); @@ -819,8 +725,10 @@ let SingleThreadGroupView: FC<{ if (sourceTable) { // Use the new rows if this is the table being refreshed const rows = sourceId === sourceTableId ? newRows : sourceTable.rows; + // Use workspace table name for virtual tables so sandbox code can find the parquet + const tableName = sourceTable.virtual?.tableId || sourceTable.id.replace(/\.[^/.]+$/, ""); return { - name: sourceTable.id, + name: tableName, rows: rows }; } @@ -829,13 +737,19 @@ let SingleThreadGroupView: FC<{ if (parentTableData.length > 0) { try { - const response = await fetch(getUrls().REFRESH_DERIVED_DATA, { + // Build request body with required output_variable and virtual flag + const requestBody: any = { + input_tables: parentTableData, + code: derivedTable.derive.code, + output_variable: derivedTable.derive.outputVariable || 'result_df', + virtual: !!derivedTable.virtual?.tableId, + output_table_name: derivedTable.virtual?.tableId + }; + + const response = await fetchWithIdentity(getUrls().REFRESH_DERIVED_DATA, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - input_tables: parentTableData, - code: derivedTable.derive.code - }) + body: JSON.stringify(requestBody) }); const result = await response.json(); @@ -904,298 +818,6 @@ let SingleThreadGroupView: FC<{ } }; - let buildTriggerCard = (trigger: Trigger) => { - let selectedClassName = trigger.chart?.id == focusedChartId ? 'selected-card' : ''; - - let triggerCard =
- - - -
; - - return - {triggerCard} - - - - ; - } - - let buildTableCard = (tableId: string, compact = false) => { - - if (parentTable && tableId == parentTable.id && parentTable.anchored && tableIdList.length > 1) { - let table = tables.find(t => t.id == tableId); - return - { - event.stopPropagation(); - dispatch(dfActions.setFocusedTable(tableId)); - - // Find and set the first chart associated with this table - let firstRelatedChart = charts.find((c: Chart) => c.tableRef == tableId && c.source != "trigger"); - - if (firstRelatedChart) { - dispatch(dfActions.setFocusedChart(firstRelatedChart.id)); - } - }} - > - - - - {table?.displayId || tableId} - - - - - } - - // filter charts relavent to this - let relevantCharts = chartElements.filter(ce => ce.tableId == tableId && !usedIntermediateTableIds.includes(tableId)); - - let table = tables.find(t => t.id == tableId); - - let selectedClassName = tableId == focusedTableId ? 'selected-card' : ''; - - let collapsedProps = collapsed ? { width: '50%', "& canvas": { width: 60, maxHeight: 50 } } : { width: '100%' } - - let releventChartElements = relevantCharts.map((ce, j) => - - {buildChartCard(ce, focusedChartId, charts.find(c => c.id == ce.chartId)?.unread)} - ) - - // only charts without dependency can be deleted - let tableDeleteEnabled = !tables.some(t => t.derive?.trigger.tableId == tableId); - - const iconColor = tableId === focusedTableId ? theme.palette.primary.main : 'rgba(0,0,0,0.6)'; - const iconOpacity = table?.anchored ? 1 : 0.5; - - let tableCardIcon = table?.virtual ? ( - - ) : ( - - ) - - let regularTableBox = c.chartId == focusedChartId) ? scrollRef : null} - sx={{ padding: '0px' }}> - { - dispatch(dfActions.setFocusedTable(tableId)); - if (focusedChart?.tableRef != tableId) { - let firstRelatedChart = charts.find((c: Chart) => c.tableRef == tableId && c.source != 'trigger'); - if (firstRelatedChart) { - dispatch(dfActions.setFocusedChart(firstRelatedChart.id)); - } else { - //dispatch(dfActions.createNewChart({ tableId: tableId, chartType: '?' })); - } - } - }}> - - - {/* For non-derived tables: icon opens menu; for derived tables: icon toggles anchored */} - {table?.derive == undefined ? ( - - { - event.stopPropagation(); - handleOpenTableMenu(table!, event.currentTarget); - }}> - {tableCardIcon} - - - ) : ( - t.derive?.trigger.tableId == tableId)} - onClick={(event) => { - event.stopPropagation(); - dispatch(dfActions.updateTableAnchored({tableId: tableId, anchored: !table?.anchored})); - }}> - {tableCardIcon} - - )} - - {/* Only show streaming icon when actively watching for updates */} - {(table?.source?.type === 'stream' || table?.source?.type === 'database') && table?.source?.autoRefresh ? ( - - { - event.stopPropagation(); - handleOpenStreamingSettingsPopup(table!, event.currentTarget); - }} - sx={{ - padding: 0.25, - '&:hover': { - transform: 'scale(1.2)', - transition: 'all 0.1s linear' - } - }} - > - - - - ) : ""} - {focusedTableId == tableId ? : {table?.displayId || tableId}} - - - - - { - event.stopPropagation(); - dispatch(dfActions.setFocusedTable(tableId)); - dispatch(dfActions.setFocusedChart(undefined)); - }} - > - - - - - {/* Delete button - shown for all deletable tables */} - {tableDeleteEnabled && ( - - { - event.stopPropagation(); - dispatch(dfActions.deleteTable(tableId)); - }} - > - - - - )} - - - - - - let chartElementProps = collapsed ? 
{ display: 'flex', flexWrap: 'wrap' } : {} - - let relevantAgentActions = agentActions.filter(a => a.tableId == tableId).filter(a => a.hidden == false); - - let agentActionBox = ( - - ) - - return [ - regularTableBox, - - {!leafTableIds.includes(tableId) && - - } - - {releventChartElements} - {agentActionBox} - - - ] - } - const theme = useTheme(); let focusedChart = useSelector((state: DataFormulatorState) => charts.find(c => c.id == focusedChartId)); @@ -1213,209 +835,320 @@ let SingleThreadGroupView: FC<{ let newTableIds = tableIdList.filter(id => !usedTableIdsInThread.includes(id)); let newTriggers = triggers.filter(tg => newTableIds.includes(tg.resultTableId)); - let highlightedTableIds: string[] = []; - if (focusedTableId && leafTableIds.includes(focusedTableId)) { - highlightedTableIds = [...tableIdList, focusedTableId]; - } else if (focusedTableId && newTableIds.includes(focusedTableId)) { - highlightedTableIds = tableIdList.slice(0, tableIdList.indexOf(focusedTableId) + 1); + // Use the global highlighted table IDs (computed at DataThread level from the focused table's full ancestor chain) + let highlightedTableIds = globalHighlightedTableIds; + + let _buildTriggerCard = (trigger: Trigger, highlighted: boolean = false) => { + return buildTriggerCard(trigger, focusedChartId, highlighted); } - let tableElementList = newTableIds.map((tableId, i) => buildTableCard(tableId)); - let triggerCards = newTriggers.map((trigger) => buildTriggerCard(trigger)); + // Shared props for buildTableCard calls + let tableCardProps: Omit = { + tables, charts, chartElements, usedIntermediateTableIds, + highlightedTableIds, agentActions, focusedTableId, focusedChartId, focusedChart, + parentTable, tableIdList, collapsed, scrollRef, dispatch, + handleOpenTableMenu, primaryBgColor: theme.palette.primary.bgcolor, + }; - let leafTableComp = leafTables.length > 1 ? leafTables.map((lt, i) => { + let _buildTableCard = (tableId: string) => { + return buildTableCard({ tableId, ...tableCardProps }); + } - let leafTrigger = lt.derive?.trigger; + let tableElementList = newTableIds.map((tableId, i) => _buildTableCard(tableId)); + let triggerCards = newTriggers.map((trigger) => { + const triggerTableId = trigger.resultTableId; + const isHL = triggerTableId ? highlightedTableIds.includes(triggerTableId) : false; + return _buildTriggerCard(trigger, isHL); + }); - let leftBorder = i == leafTables.length - 1 ? `none` : `1px dashed rgba(0, 0, 0, 0.3)`; - let stackML = '8px'; - let spaceBox = + // Build a flat sequence of timeline items: [trigger, table, charts, trigger, table, charts, ...] + let timelineItems: { key: string; element: React.ReactNode; type: 'used-table' | 'trigger' | 'table' | 'chart' | 'leaf-trigger' | 'leaf-table'; highlighted: boolean; tableId?: string; isRunning?: boolean }[] = []; + + // Add used (shared) tables at the top + // Only show the immediate parent + "..." for further ancestors + let displayedUsedTableIds = usedTableIdsInThread; + if (usedTableIdsInThread.length > 1) { + // Keep only the last (immediate parent), prepend "..." 
placeholder + displayedUsedTableIds = usedTableIdsInThread.slice(-1); + timelineItems.push({ + key: 'used-table-ellipsis', + type: 'used-table', + highlighted: false, + element: ( + + … + + ), + }); + } + displayedUsedTableIds.forEach((tableId, i) => { + let table = tableById.get(tableId) as DictTable; + timelineItems.push({ + key: `used-table-${tableId}-${i}`, + type: 'used-table', + tableId: tableId, + highlighted: highlightedTableIds.includes(tableId), + element: ( + { dispatch(dfActions.setFocusedTable(tableId)) }}> + {table.displayId || tableId} + + ), + }); + }); - if (focusedTableId && leafTableIds.indexOf(focusedTableId) > i) { - leftBorder = `3px solid ${theme.palette.primary.light}`; - stackML = '7px'; + // Interleave triggers and tables for the main thread body + newTableIds.forEach((tableId, i) => { + const trigger = newTriggers.find(t => t.resultTableId === tableId); + const isHighlighted = highlightedTableIds.includes(tableId); + + // Add trigger card if exists + if (trigger) { + const triggerCard = triggerCards[newTriggers.indexOf(trigger)]; + if (triggerCard) { + timelineItems.push({ + key: triggerCard?.key || `woven-trigger-${tableId}`, + type: 'trigger', + highlighted: isHighlighted, + element: triggerCard, + }); + } } - if (focusedTableId && lt.id == focusedTableId) { - spaceBox = + // Add table card and its charts + const tableCard = tableElementList[i]; + if (Array.isArray(tableCard)) { + tableCard.forEach((subItem: any, j: number) => { + if (!subItem) return; + const subKey = subItem?.key || `woven-${tableId}-${j}`; + const isChart = subKey.includes('chart') || subKey.includes('agent'); + const isAgent = subKey.includes('agent'); + const isAgentRunning = isAgent && runningAgentTableIds.has(tableId); + timelineItems.push({ + key: subKey, + type: isChart ? 'chart' : 'table', + tableId: isChart ? undefined : tableId, + highlighted: isHighlighted, + element: subItem, + ...(isAgentRunning ? { isRunning: true } : {}), + }); + }); } + }); - return - {spaceBox} - - {leafTrigger && buildTriggerCard(leafTrigger)} - {buildTableCard(lt.id)} - - ; - }) : leafTables.map((lt, i) => { - return - - {lt.derive?.trigger && buildTriggerCard(lt.derive.trigger)} - {buildTableCard(lt.id)} - - ; + // Add leaf table components + leafTables.forEach((lt, i) => { + let leafTrigger = lt.derive?.trigger; + if (leafTrigger) { + timelineItems.push({ + key: `leaf-trigger-${lt.id}`, + type: 'leaf-trigger', + highlighted: highlightedTableIds.includes(lt.id), + element: _buildTriggerCard(leafTrigger, highlightedTableIds.includes(lt.id)), + }); + } + let leafCards = _buildTableCard(lt.id); + if (Array.isArray(leafCards)) { + leafCards.forEach((subItem: any, j: number) => { + if (!subItem) return; + const subKey = subItem?.key || `leaf-card-${lt.id}-${j}`; + const isChart = subKey.includes('chart') || subKey.includes('agent'); + const isAgent = subKey.includes('agent'); + const isAgentRunning = isAgent && runningAgentTableIds.has(lt.id); + timelineItems.push({ + key: subKey, + type: isChart ? 'chart' : 'leaf-table', + tableId: isChart ? undefined : lt.id, + highlighted: highlightedTableIds.includes(lt.id), + element: subItem, + ...(isAgentRunning ? 
{ isRunning: true } : {}), + }); + }); + } }); - // Compact mode: just show leaf table cards in a simple column - if (compact) { - // For compact mode, ensure highlightedTableIds includes focused table if it's a leaf - if (focusedTableId && leafTableIds.includes(focusedTableId)) { - highlightedTableIds = [focusedTableId]; + // Timeline rendering helper + const TIMELINE_WIDTH = 16; + const DOT_SIZE = 6; + const CARD_PY = '4px'; // vertical padding for each timeline row + + const getTimelineDot = (item: typeof timelineItems[0]) => { + const isTable = item.type === 'table' || item.type === 'leaf-table' || item.type === 'used-table'; + const color = item.highlighted + ? theme.palette.primary.main + : 'rgba(0,0,0,0.4)'; + + // For running agent items, show a spinner instead of a dot + if (item.isRunning) { + return ; } - + + // For table items, show a type-specific icon instead of a dot + if (isTable && item.tableId) { + const tableForDot = tableById.get(item.tableId); + const iconSx = { fontSize: 14, color }; + const isStreaming = tableForDot && (tableForDot.source?.type === 'stream' || tableForDot.source?.type === 'database') && tableForDot.source?.autoRefresh; + + if (isStreaming) { + return ; + } + if (tableForDot?.virtual) { + return ; + } + return ; + } + + return ; + }; + + const renderTimelineItem = (item: typeof timelineItems[0], index: number, isLast: boolean) => { + const isTrigger = item.type === 'trigger' || item.type === 'leaf-trigger'; + const isTable = item.type === 'table' || item.type === 'leaf-table' || item.type === 'used-table'; + const isChart = item.type === 'chart'; + const dashedColor = item.highlighted ? theme.palette.primary.main : 'rgba(0,0,0,0.4)'; + const dashedWidth = item.highlighted ? '2px' : '1px'; + const dashedStyle = item.highlighted ? 'solid' : 'dashed'; + const triggerColor = item.highlighted + ? alpha(theme.palette.custom.main, 0.5) + : 'rgba(0,0,0,0.12)'; + const rowHighlightSx = {}; + + // Triggers: thick solid bar with a dot in the middle and a horizontal tick to the card + if (isTrigger) { + return ( + + + {/* Dashed connector from previous element */} + + {/* Thick solid bar — top half */} + + {/* Horizontal tick to the right */} + + + + {/* Thick solid bar — bottom half */} + + {/* Dashed connector to next element */} + + + + {item.element} + + + ); + } + + // Charts/agents: dot on the timeline with a horizontal tick line to the chart + if (isChart) { + return ( + + + + + {getTimelineDot(item)} + + + {!isLast && } + {isLast && } + + + {item.element} + + + ); + } + + // Tables (primary nodes): settings icon on the timeline, more vertical spacing + const tableForItem = item.tableId ? tableById.get(item.tableId) : undefined; return ( - - {leafTables.map((table) => { - const tableCardResult = buildTableCard(table.id, compact); - // buildTableCard returns an array [regularTableBox, chartBox] - // In compact mode, we want to show them stacked - return ( - - {tableCardResult} - - ); - })} - - e.stopPropagation()} - > - { - e.stopPropagation(); - if (selectedTableForMenu) { - handleOpenMetadataPopup(selectedTableForMenu, tableMenuAnchorEl!); - } - handleCloseTableMenu(); - }} - sx={{ fontSize: '12px', display: 'flex', alignItems: 'center', gap: 1 }} - > - - {selectedTableForMenu?.attachedMetadata ? 
"Edit metadata" : "Attach metadata"} - - {/* Watch for updates option - only shown when table has stream/database source but not actively watching */} - {selectedTableForMenu && - (selectedTableForMenu.source?.type === 'stream' || selectedTableForMenu.source?.type === 'database') && - ( - { - e.stopPropagation(); - if (selectedTableForMenu) { - handleOpenStreamingSettingsPopup(selectedTableForMenu, tableMenuAnchorEl!); - } - handleCloseTableMenu(); - }} - sx={{ fontSize: '12px', display: 'flex', alignItems: 'center', gap: 1 }} - > - - Watch for updates - + + + {index > 0 && ( + )} - {/* Refresh data - hidden for database tables */} - {selectedTableForMenu?.source?.type !== 'database' && ( - { - e.stopPropagation(); - if (selectedTableForMenu) { - handleOpenRefreshDialog(selectedTableForMenu); - } - }} - sx={{ fontSize: '12px', display: 'flex', alignItems: 'center', gap: 1 }} - > - - Refresh data - + {index === 0 && } + + {getTimelineDot(item)} + + {!isLast && ( + )} - - {selectedTableForRefresh && ( - - )} - {selectedTableForStreamingSettings && ( - manualRefresh(selectedTableForStreamingSettings.id)} - /> - )} + {isLast && } + + + {item.element} + ); - } + }; + return - - - - {threadIdx === -1 ? 'thread0' : `thread - ${threadIdx + 1}`} - - - -
- {usedTableIdsInThread.map((tableId, i) => { - let table = tables.find(t => t.id === tableId) as DictTable; - return [ - { dispatch(dfActions.setFocusedTable(tableId)) }}> - {table.displayId || tableId} - , - - - ] - })} - - {tableElementList.length > triggerCards.length ? - w(tableElementList, triggerCards, "") : w(triggerCards, tableElementList, "")} + > + {!hideLabel && ( + + + + {threadLabel || (threadIdx === -1 ? 'thread0' : `thread - ${threadIdx + 1}`)} + + - {leafTableComp} + )} +
+ {timelineItems.map((item, index) => renderTimelineItem(item, index, index === timelineItems.length - 1))}
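The timeline construction above can be pictured with a small, self-contained sketch. The `TimelineRow` type, `flattenThread`, and the step shape below are simplifications made up for illustration, not the component's actual `timelineItems` shape: each thread becomes an ordered list of typed rows, and the renderer only needs each row's type and highlight flag to draw the right dot and connector.

```typescript
// Simplified sketch of the flattening idea: one row per trigger, table, or chart.
type TimelineRow =
  | { type: 'trigger'; id: string; highlighted: boolean }
  | { type: 'table'; id: string; highlighted: boolean }
  | { type: 'chart'; id: string; highlighted: boolean };

function flattenThread(
  steps: { tableId: string; triggerId?: string; chartIds: string[] }[],
  highlightedTableIds: string[],
): TimelineRow[] {
  const rows: TimelineRow[] = [];
  for (const step of steps) {
    const highlighted = highlightedTableIds.includes(step.tableId);
    if (step.triggerId) rows.push({ type: 'trigger', id: step.triggerId, highlighted });
    rows.push({ type: 'table', id: step.tableId, highlighted });
    for (const chartId of step.chartIds) rows.push({ type: 'chart', id: chartId, highlighted });
  }
  return rows;
}

const rowTypes = flattenThread(
  [{ tableId: 't1', chartIds: ['c1'] }, { tableId: 't2', triggerId: 'tr2', chartIds: [] }],
  ['t2'],
).map(r => r.type);
console.log(rowTypes); // ['table', 'chart', 'trigger', 'table']
```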
+ - {/* Table actions menu for non-derived, non-virtual tables */} + {/* Table actions menu */} { e.stopPropagation(); if (selectedTableForMenu) { - handleOpenMetadataPopup(selectedTableForMenu, tableMenuAnchorEl!); + handleOpenRenamePopup(selectedTableForMenu, tableMenuAnchorEl!); } handleCloseTableMenu(); }} sx={{ fontSize: '12px', display: 'flex', alignItems: 'center', gap: 1 }} > - - {selectedTableForMenu?.attachedMetadata ? "Edit metadata" : "Attach metadata"} + + Rename + {/* Pin option - only for derived tables */} + {selectedTableForMenu?.derive != undefined && ( + { + e.stopPropagation(); + if (selectedTableForMenu) { + dispatch(dfActions.updateTableAnchored({tableId: selectedTableForMenu.id, anchored: !selectedTableForMenu.anchored})); + } + handleCloseTableMenu(); + }} + sx={{ fontSize: '12px', display: 'flex', alignItems: 'center', gap: 1 }} + > + + {selectedTableForMenu?.anchored ? "Unpin table" : "Pin table"} + + )} + {/* Non-derived table options */} + {selectedTableForMenu?.derive == undefined && ( + { + e.stopPropagation(); + if (selectedTableForMenu) { + handleOpenMetadataPopup(selectedTableForMenu, tableMenuAnchorEl!); + } + handleCloseTableMenu(); + }} + sx={{ fontSize: '12px', display: 'flex', alignItems: 'center', gap: 1 }} + > + + {selectedTableForMenu?.attachedMetadata ? "Edit metadata" : "Attach metadata"} + + )} {/* Watch for updates option - only shown when table has stream/database source but not actively watching */} {selectedTableForMenu && + selectedTableForMenu.derive == undefined && (selectedTableForMenu.source?.type === 'stream' || selectedTableForMenu.source?.type === 'database') && !selectedTableForMenu.source?.autoRefresh && ( )} - {/* Refresh data - hidden for database tables */} - {selectedTableForMenu?.source?.type !== 'database' && ( + {/* Refresh data - hidden for database tables and derived tables */} + {selectedTableForMenu?.derive == undefined && selectedTableForMenu?.source?.type !== 'database' && ( { e.stopPropagation(); @@ -1522,81 +1296,14 @@ let SingleThreadGroupView: FC<{ } -const VegaLiteChartElement = memo<{ - chart: Chart, - assembledSpec: any, - table: any, - status: 'available' | 'pending' | 'unavailable', - isSaved?: boolean, - onChartClick: (chartId: string, tableId: string) => void, - onDelete: (chartId: string) => void -}>(({ chart, assembledSpec, table, status, isSaved, onChartClick, onDelete }) => { - const id = `data-thread-chart-Element-${chart.id}`; - return ( - onChartClick(chart.id, table.id)} - className="vega-thumbnail-box" - style={{ width: "100%", position: "relative", cursor: "pointer !important" }} - > - - {isSaved && - - } - {status == 'pending' && - - } - - - { - event.stopPropagation(); - onDelete(chart.id); - }} - > - - - - - - - - - - ); -}); - -const MemoizedChartObject = memo<{ +/** Lightweight chart thumbnail — shows cached PNG, skeleton, or status icon. 
*/ +const ChartThumbnail: FC<{ chart: Chart; table: DictTable; - conceptShelfItems: FieldItem[]; status: 'available' | 'pending' | 'unavailable'; onChartClick: (chartId: string, tableId: string) => void; onDelete: (chartId: string) => void; -}>(({ chart, table, conceptShelfItems, status, onChartClick, onDelete }) => { - - let visTableRows: any[] = []; - if (table.rows.length > 1000) { - visTableRows = structuredClone(_.sampleSize(table.rows, 1000)); - } else { - visTableRows = structuredClone(table.rows); - } - - // Preprocess the data for aggregations (same as VisualizationView) - visTableRows = prepVisTable(visTableRows, conceptShelfItems, chart.encodingMap); +}> = ({ chart, table, status, onChartClick, onDelete }) => { let deleteButton = @@ -1608,39 +1315,31 @@ const MemoizedChartObject = memo<{ + const pendingOverlay = status == 'pending' ? + + : null; + if (['Auto', '?'].includes(chart.chartType)) { - let element = onChartClick(chart.id, table.id)} sx={{ width: "100%", color: 'text.secondary', height: 48, display: "flex", backgroundColor: "white", position: 'relative', flexDirection: "column" }}> - {status == 'pending' ? - - : ''} + {pendingOverlay} {deleteButton} - - return element; + ; } if (status == 'unavailable' || chart.chartType == "Table") { let chartTemplate = getChartTemplate(chart.chartType); - - let element = onChartClick(chart.id, table.id)} - sx={{ - display: "flex", backgroundColor: "white", position: 'relative', - flexDirection: "column" - }}> - {status == 'pending' ? - - : ''} + sx={{ display: "flex", backgroundColor: "white", position: 'relative', flexDirection: "column" }}> + {pendingOverlay} {generateChartSkeleton(chartTemplate?.icon, 32, 32, chart.chartType == 'Table' ? 1 : 0.5)} @@ -1648,72 +1347,245 @@ const MemoizedChartObject = memo<{ {deleteButton} ; - return element; } - // prepare the chart to be rendered - let assembledChart = assembleVegaChart(chart.chartType, chart.encodingMap, conceptShelfItems, visTableRows, table.metadata, 20, true); - assembledChart["background"] = "transparent"; + // ---- Thumbnail path: use cached PNG from ChartRenderService ---- + if (chart.thumbnail) { + return ( + onChartClick(chart.id, table.id)} + className="vega-thumbnail-box" + style={{ width: "100%", position: "relative", cursor: "pointer" }} + > + + {chart.saved && + + } + {pendingOverlay} + {deleteButton} + + {`${chart.chartType} + + + + ); + } - // Temporary fix, down sample the dataset - if (assembledChart["data"]["values"].length > 5000) { - let values = assembledChart["data"]["values"]; - assembledChart = (({ data, ...o }) => o)(assembledChart); + // ---- Fallback: skeleton while ChartRenderService is still processing ---- + let chartTemplate = getChartTemplate(chart.chartType); + return ( + onChartClick(chart.id, table.id)} + className="vega-thumbnail-box" + style={{ width: "100%", position: "relative", cursor: "pointer" }} + > + + {chart.saved && + + } + {pendingOverlay} + {deleteButton} + + {generateChartSkeleton(chartTemplate?.icon, 48, 48, 0.3)} + + + + ); +}; + +// Height estimation constants (px) – per-type heights + py:4px (8px) gap per row +const LAYOUT_TABLE_HEIGHT = 28 + 8; // table card + row padding +const LAYOUT_TRIGGER_HEIGHT = 43 + 8; // trigger card (2 lines) + row padding +const LAYOUT_CHART_HEIGHT = 90 + 8; // chart card (~70-110) + row padding +const LAYOUT_MESSAGE_HEIGHT = 80 + 8; // agent message (~60-120) + row padding +const LAYOUT_THREAD_OVERHEAD = 52; // header divider + thread padding +const LAYOUT_THREAD_GAP = 8; // my: 0.5 = 4px top + 4px 
bottom between threads + +function estimateThreadHeight( + tableCount: number, triggerCount: number, chartCount: number, messageCount: number +): number { + return LAYOUT_THREAD_OVERHEAD + + tableCount * LAYOUT_TABLE_HEIGHT + + triggerCount * LAYOUT_TRIGGER_HEIGHT + + chartCount * LAYOUT_CHART_HEIGHT + + messageCount * LAYOUT_MESSAGE_HEIGHT; +} - let getRandom = (seed: number) => { - let x = Math.sin(seed++) * 10000; - return x - Math.floor(x); +/** + * Compute a balanced column layout for threads. + * + * @param heights – Estimated pixel height for each thread (in display order). + * @param numColumns – Maximum number of columns to distribute into. + * @param flexOrder – When true, threads may be reordered across columns for + * better balance (LPT heuristic). When false, the original + * order is preserved (optimal contiguous partitioning via + * binary-search on the maximum column height). + * @returns An array of columns, where each column is an array of original + * thread indices. Empty columns are omitted. + */ +function computeThreadColumnLayout( + heights: number[], + numColumns: number, + flexOrder: boolean = false, +): number[][] { + if (heights.length === 0) return []; + if (heights.length === 1) return [[0]]; + + const cols = Math.min(numColumns, heights.length); + if (cols <= 1) return [heights.map((_, i) => i)]; + + return flexOrder + ? layoutFlexOrder(heights, cols) + : layoutPreserveOrder(heights, cols); +} + +/** + * Balanced layout *with* reordering (LPT – Longest Processing Time first). + * Assigns the tallest unplaced thread to whichever column is currently shortest. + */ +function layoutFlexOrder(heights: number[], numColumns: number): number[][] { + const indexed = heights.map((h, i) => ({ idx: i, h })); + indexed.sort((a, b) => b.h - a.h); // tallest first + + const columns: number[][] = Array.from({ length: numColumns }, () => []); + const colH: number[] = new Array(numColumns).fill(0); + + for (const item of indexed) { + let minCol = 0; + for (let c = 1; c < numColumns; c++) { + if (colH[c] < colH[minCol]) minCol = c; } - let getRandomSubarray = (arr: any[], size: number) => { - let shuffled = arr.slice(0), i = arr.length, temp, index; - while (i--) { - index = Math.floor((i + 1) * getRandom(233 * i + 888)); - temp = shuffled[index]; - shuffled[index] = shuffled[i]; - shuffled[i] = temp; + columns[minCol].push(item.idx); + colH[minCol] += item.h; + } + + return columns.filter(c => c.length > 0); +} + +/** + * Balanced layout *preserving* thread order. + * + * Uses binary-search on the maximum column height to find the tightest + * contiguous partitioning of threads into ≤ numColumns groups. + */ +function layoutPreserveOrder(heights: number[], numColumns: number): number[][] { + const maxH = Math.max(...heights); + const totalH = heights.reduce((s, h) => s + h, 0); + + // Can we fit all threads into `numColumns` columns with no column > target? 
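For illustration, here is a standalone restatement of the contiguous-partition check and the binary search on the maximum column height, using made-up thread heights (in practice the heights come from `estimateThreadHeight`). It is a sketch of the same idea, not the module's own helper:

```typescript
// Sketch only: a standalone version of the feasibility check used by
// layoutPreserveOrder. Returns true if `heights`, kept in order, can be split
// into at most `numColumns` contiguous groups with no group taller than `target`.
function canPartitionInto(heights: number[], numColumns: number, target: number): boolean {
  let cols = 1, cur = 0;
  for (const h of heights) {
    if (cur + h > target && cur > 0) {
      cols++;          // start a new column
      cur = h;
      if (cols > numColumns) return false;
    } else {
      cur += h;
    }
  }
  return true;
}

// Hypothetical thread heights (px), to be split into at most 2 ordered columns.
const heights = [300, 120, 500, 200];
let lo = Math.max(...heights);                 // 500: a column must fit the tallest thread
let hi = heights.reduce((s, h) => s + h, 0);   // 1120: everything in one column
while (lo < hi) {
  const mid = Math.floor((lo + hi) / 2);
  if (canPartitionInto(heights, 2, mid)) hi = mid; else lo = mid + 1;
}
console.log(lo); // 700, i.e. columns [300, 120] (420 px) and [500, 200] (700 px)
```

The binary search converges on the smallest feasible maximum column height, so the resulting contiguous split is optimal without ever reordering threads.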
+ const canPartition = (target: number): boolean => { + let cols = 1, cur = 0; + for (const h of heights) { + if (cur + h > target && cur > 0) { + cols++; + cur = h; + if (cols > numColumns) return false; + } else { + cur += h; } - return shuffled.slice(0, size); } - assembledChart["data"] = { "values": getRandomSubarray(values, 5000) }; + return true; + }; + + // Binary-search for the minimum feasible max-column height + let lo = maxH, hi = totalH; + while (lo < hi) { + const mid = Math.floor((lo + hi) / 2); + if (canPartition(mid)) hi = mid; else lo = mid + 1; } - assembledChart['config'] = { - "axis": { "labelLimit": 30 } + // Build the actual partition with the optimal target + const target = lo; + const columns: number[][] = [[]]; + let cur = 0; + for (let i = 0; i < heights.length; i++) { + if (cur + heights[i] > target && columns[columns.length - 1].length > 0) { + columns.push([]); + cur = 0; + } + columns[columns.length - 1].push(i); + cur += heights[i]; } - const element = onChartClick(chart.id, table.id)} - onDelete={() => onDelete(chart.id)} - />; + return columns; +} - return element; -}, (prevProps, nextProps) => { - // Custom comparison function for memoization - // Only re-render if the chart or its dependencies have changed +/** + * Choose the best column layout that balances scroll burden vs whitespace. + * + * 1. If a single column fits within SCROLL_TOLERANCE × viewportHeight, + * use one column — the small scroll is preferable to the whitespace + * of an extra column (e.g. one long thread + one tiny thread). + * 2. Otherwise, evaluate layouts for 1 … maxColumns and pick the smallest + * column count whose tallest column fits within viewportHeight. + * 3. If nothing eliminates scrolling, pick the layout that minimises the + * tallest column (least scrolling). + */ +const SCROLL_TOLERANCE = 1.5; // allow up to 50% overflow before adding columns + +function chooseBestColumnLayout( + heights: number[], + maxColumns: number, + viewportHeight: number, + flexOrder: boolean = false, + minColumns: number = 1, +): number[][] { + if (heights.length === 0) return []; + + const cap = Math.min(maxColumns, heights.length); + const start = Math.min(Math.max(minColumns, 1), cap); + const tolerantHeight = viewportHeight * SCROLL_TOLERANCE; + + // Compute effective column height including gaps between threads + const columnEffectiveHeight = (col: number[]) => { + const contentH = col.reduce((sum, idx) => sum + heights[idx], 0); + const gapH = Math.max(0, col.length - 1) * LAYOUT_THREAD_GAP; + return contentH + gapH; + }; - // when conceptShelfItems change, we only need to re-render the chart if the conceptShelfItems depended by the chart have changed - let nextReferredConcepts = Object.values(nextProps.chart.encodingMap).filter(e => e.fieldID || e.aggregate).map(e => `${e.fieldID}:${e.aggregate}`); + // Evaluate every candidate column count (start … cap). + // Pick the smallest n whose tallest column fits within tolerance. + // If none fits, pick the one with the shortest tallest column. 
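As a worked example with illustrative numbers (not taken from the code): three threads estimated at 260 px, 840 px and 180 px, with a measured panel height of 600 px, give a tolerant height of 900 px (600 × 1.5). A two-column ordered layout groups the last two threads, so its tallest column is 840 + 180 plus 8 px of inter-thread gap, or 1028 px, which exceeds the tolerance; with three columns the tallest column is 840 px, which fits, so the three-column layout is returned as the smallest column count that avoids noticeable scrolling.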
+ let bestLayout: number[][] = []; + let bestMaxH = Infinity; - return ( - prevProps.chart.id === nextProps.chart.id && - prevProps.chart.chartType === nextProps.chart.chartType && - prevProps.chart.saved === nextProps.chart.saved && - prevProps.status === nextProps.status && - _.isEqual(prevProps.chart.encodingMap, nextProps.chart.encodingMap) && - // Only check tables/charts that this specific chart depends on - _.isEqual(prevProps.table, nextProps.table) && - _.isEqual(prevProps.table.attachedMetadata, nextProps.table.attachedMetadata) && - // Check if conceptShelfItems have changed - _.isEqual( - prevProps.conceptShelfItems.filter(c => nextReferredConcepts.includes(c.id)), - nextProps.conceptShelfItems.filter(c => nextReferredConcepts.includes(c.id))) - ); -}); + for (let n = start; n <= cap; n++) { + const layout = computeThreadColumnLayout(heights, n, flexOrder); + const maxH = Math.max(...layout.map(columnEffectiveHeight)); + + // Smallest n that fits within tolerance → least whitespace + if (maxH <= tolerantHeight) { + return layout; + } + + // Otherwise track the layout with the shortest tallest column + if (maxH < bestMaxH) { + bestMaxH = maxH; + bestLayout = layout; + } + } + + return bestLayout; +} export const DataThread: FC<{sx?: SxProps}> = function ({ sx }) { @@ -1724,10 +1596,10 @@ export const DataThread: FC<{sx?: SxProps}> = function ({ sx }) { let chartSynthesisInProgress = useSelector((state: DataFormulatorState) => state.chartSynthesisInProgress); const conceptShelfItems = useSelector((state: DataFormulatorState) => state.conceptShelfItems); - - let [threadDrawerOpen, setThreadDrawerOpen] = useState(false); + const agentActions = useSelector((state: DataFormulatorState) => state.agentActions); const scrollRef = useRef(null) + const containerRef = useRef(null) const executeScroll = (smooth: boolean = true) => { if (scrollRef.current != null) { @@ -1737,16 +1609,9 @@ export const DataThread: FC<{sx?: SxProps}> = function ({ sx }) { }) } } - // run this function from an event handler or an effect to execute scroll - const dispatch = useDispatch(); - useEffect(() => { - // make it smooth when drawer from open -> close, otherwise just jump - executeScroll(!threadDrawerOpen); - }, [threadDrawerOpen]) - useEffect(() => { // load the example datasets if (focusedTableId) { @@ -1754,16 +1619,27 @@ export const DataThread: FC<{sx?: SxProps}> = function ({ sx }) { } }, [focusedTableId]); + // O(1) table lookup by ID + const tableById = useMemo(() => new Map(tables.map(t => [t.id, t])), [tables]); + + // Cached getTriggers — avoids repeated chain walks within a single render + const _tCache = new Map(); + const getCachedTriggers = (lt: DictTable): Trigger[] => { + if (_tCache.has(lt.id)) return _tCache.get(lt.id)!; + const triggers = getTriggers(lt, tables); + _tCache.set(lt.id, triggers); + return triggers; + }; + // Now use useMemo to memoize the chartElements array let chartElements = useMemo(() => { return charts.filter(c => c.source == "user").map((chart) => { const table = getDataTable(chart, tables, charts, conceptShelfItems); let status: 'available' | 'pending' | 'unavailable' = chartSynthesisInProgress.includes(chart.id) ? 'pending' : checkChartAvailability(chart, conceptShelfItems, table.rows) ? 
'available' : 'unavailable'; - let element = { dispatch(dfActions.setFocusedChart(chart.id)); @@ -1785,13 +1661,50 @@ export const DataThread: FC<{sx?: SxProps}> = function ({ sx }) { return false; } let leafTables = [ ...tables.filter(t => isLeafTable(t)) ]; - + + // Split long derivation chains by promoting intermediate tables as additional "leaves". + // If a chain has more than MAX_CHAIN_TABLES tables, we add a split point every + // MAX_CHAIN_TABLES steps. The sort (shorter chains first) ensures the intermediate + // leaf is processed before the real leaf, so its tables get claimed — making the + // real leaf's thread show only the remaining (new) tables. + // When counting chain length, exclude "used" tables (already claimed by an earlier + // chain) so that shared ancestors don't inflate the count. The first chain to + // contain a table still counts it as owned. + const MAX_CHAIN_TABLES = 5; + + // Process leaves in order, tracking claimed tables to simulate the later claim loop. + // A table is "used" for a chain only if a *previous* chain already claimed it. + const claimedForSplit = new Set(); + const extraLeaves: DictTable[] = []; + for (const lt of leafTables) { + const triggers = getCachedTriggers(lt); + const allChainIds = [lt.id, ...triggers.map(t => t.tableId)]; + // Tables not yet claimed by an earlier chain count as owned + const ownedIds = allChainIds.filter(id => !claimedForSplit.has(id)); + if (ownedIds.length > MAX_CHAIN_TABLES) { + // Walk only owned (unclaimed) triggers for split positions + const ownedTriggers = triggers.filter(t => !claimedForSplit.has(t.tableId)); + for (let pos = MAX_CHAIN_TABLES - 1; pos < ownedTriggers.length; pos += MAX_CHAIN_TABLES) { + const midId = ownedTriggers[pos].tableId; + const midTable = tableById.get(midId); + if (midTable && !leafTables.includes(midTable) && !extraLeaves.includes(midTable)) { + extraLeaves.push(midTable); + } + } + } + // Claim all tables in this chain for subsequent chains + allChainIds.forEach(id => claimedForSplit.add(id)); + } + if (extraLeaves.length > 0) { + leafTables.push(...extraLeaves); + } + // we want to sort the leaf tables by the order of their ancestors // for example if ancestor of list a is [0, 3] and the ancestor of list b is [0, 2] then b should come before a // when tables are anchored, we want to give them a higher order (so that they are displayed after their peers) let tableOrder = Object.fromEntries(tables.map((table, index) => [table.id, index + (table.anchored ? 1 : 0) * tables.length])); let getAncestorOrders = (leafTable: DictTable) => { - let triggers = getTriggers(leafTable, tables); + let triggers = getCachedTriggers(leafTable); return [...triggers.map(t => tableOrder[t.tableId]), tableOrder[leafTable.id]]; } @@ -1810,273 +1723,307 @@ export const DataThread: FC<{sx?: SxProps}> = function ({ sx }) { return aOrders.length - bOrders.length; }); - // Identify hanging tables (tables with no descendants or parents) - let isHangingTable = (table: DictTable) => { - // A table is hanging if: - // 1. It has no derive.source (no parent) - // 2. 
No other table derives from it (no descendants) - const hasNoParent = table.derive == undefined; - const hasNoDescendants = !tables.some(t => t.derive?.trigger.tableId == table.id); - return hasNoParent && hasNoDescendants; - }; + // Compute global highlighted table IDs from the focused table's full ancestor chain + let globalHighlightedTableIds: string[] = useMemo(() => { + if (!focusedTableId) return []; + let focusedTable = tableById.get(focusedTableId); + if (!focusedTable) return []; + // Walk up the trigger chain from the focused table to collect all ancestor IDs + let ids: string[] = [focusedTableId]; + let current = focusedTable; + while (current.derive && !current.anchored) { + let parentId = current.derive.trigger.tableId; + ids.unshift(parentId); + let parent = tableById.get(parentId); + if (!parent) break; + current = parent; + } + return ids; + }, [focusedTableId, tableById]); + + // Determine which leaf table's thread the focused table belongs to + let focusedThreadLeafId: string | undefined = useMemo(() => { + if (!focusedTableId) return undefined; + // Check if focused table IS a leaf table + let directLeaf = leafTables.find(lt => lt.id === focusedTableId); + if (directLeaf) return directLeaf.id; + // Otherwise, find the leaf table whose ancestor chain includes the focused table + for (const lt of leafTables) { + const triggers = getCachedTriggers(lt); + const chainIds = [...triggers.map(t => t.tableId), lt.id]; + if (chainIds.includes(focusedTableId)) { + return lt.id; + } + } + return undefined; + }, [focusedTableId, leafTables, tables]); - // Separate hanging tables from regular leaf tables - let hangingTables = leafTables.filter(t => isHangingTable(t)); - let regularLeafTables = leafTables.filter(t => !isHangingTable(t)); + let hasContent = leafTables.length > 0 || tables.some(t => !t.derive); - // Build groups for regular leaf tables (excluding hanging tables) - let leafTableGroups = regularLeafTables.reduce((groups: { [groupId: string]: DictTable[] }, leafTable) => { - // Get the immediate parent table ID (first trigger in the chain) - const triggers = getTriggers(leafTable, tables); - const immediateParentTableId = triggers.length > 0 ? triggers[triggers.length - 1].tableId : 'root'; - - let groupId = immediateParentTableId + (leafTable.anchored ? ('-' + leafTable.id) : ''); + // Collect all base (non-derived) tables for the workspace panel. 
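The highlight chain computed just above can be exercised in isolation. The sketch below uses hypothetical table ids and a simplified `MiniTable` shape (the real code walks `DictTable` entries through `tableById`), but mirrors the same walk up `derive.trigger.tableId` that includes, and stops at, the first anchored ancestor:

```typescript
// Hypothetical mini-model of the ancestor walk used for highlighting.
type MiniTable = { id: string; anchored?: boolean; derive?: { trigger: { tableId: string } } };

const byId = new Map<string, MiniTable>([
  ['t0', { id: 't0' }],
  ['t1', { id: 't1', derive: { trigger: { tableId: 't0' } } }],
  ['t2', { id: 't2', anchored: true, derive: { trigger: { tableId: 't1' } } }],
  ['t3', { id: 't3', derive: { trigger: { tableId: 't2' } } }],
]);

function highlightChain(focusedId: string): string[] {
  const ids = [focusedId];
  let current = byId.get(focusedId);
  // Walk upward while the current table is derived and not anchored.
  while (current?.derive && !current.anchored) {
    const parentId = current.derive.trigger.tableId;
    ids.unshift(parentId);
    current = byId.get(parentId);
  }
  return ids;
}

console.log(highlightChain('t3')); // ['t2', 't3']: the walk includes anchored t2 but goes no further
console.log(highlightChain('t1')); // ['t0', 't1']: the base table t0 ends the chain
```

Anchored tables therefore act as chain boundaries: focusing a table below a pinned intermediate highlights only the segment from that pinned table downward.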
+ let baseTables = tables.filter(t => !t.derive); + // Threaded tables: leaf tables that have a derivation chain + let threadedTables = leafTables.filter(lt => { + const triggers = getTriggers(lt, tables); + return triggers.length + 1 > 1; + }); - let subgroupIdCount = 0; - while (groups[groupId] && groups[groupId].length >= 4) { - groupId = groupId + '-' + subgroupIdCount; - subgroupIdCount++; - } + // Build thread entries and their estimated heights for layout + type ThreadEntry = { key: string; groupId: string; leafTables: DictTable[]; threadIdx: number; threadLabel?: string; isSplitThread?: boolean; usedTableIds?: string[]; hideLabel?: boolean }; + let allThreadEntries: ThreadEntry[] = []; + let allThreadHeights: number[] = []; - // Initialize group if it doesn't exist - if (!groups[groupId]) { - groups[groupId] = []; - } - - // Add leaf table to its group - groups[groupId].push(leafTable); - - return groups; - }, {}); - - // Filter threads to only include those with length > 1 - let filteredLeafTableGroups: { [groupId: string]: DictTable[] } = {}; - Object.entries(leafTableGroups).forEach(([groupId, groupTables]) => { - // Calculate thread length: count all tables in the thread chain - const threadLength = groupTables.reduce((maxLength, leafTable) => { - const triggers = getTriggers(leafTable, tables); - // Thread length = number of triggers + 1 (the leaf table itself) - return Math.max(maxLength, triggers.length + 1); - }, 0); - - // Only include threads with length > 1 - if (threadLength > 1) { - filteredLeafTableGroups[groupId] = groupTables; - } else { - // Add single-table threads to hanging tables (they go to thread0) - groupTables.forEach(table => { - if (!hangingTables.includes(table)) { - hangingTables.push(table); - } - }); - } - }); + // Track which leaf tables are promoted (split) vs real leaves + const extraLeafIds = new Set(extraLeaves.map(t => t.id)); - // Create thread0 group for hanging tables - let thread0Group: { [groupId: string]: DictTable[] } = {}; + // Track which table IDs have been claimed by earlier threads + let claimedTableIds = new Set(); + + // Hanging tables: source tables with no children — displayed as a group before thread 1 + let hangingTables = leafTables.filter(lt => !lt.derive); if (hangingTables.length > 0) { - thread0Group['thread0'] = hangingTables; + hangingTables.forEach(lt => claimedTableIds.add(lt.id)); + let hangingChartCount = hangingTables.reduce((sum, lt) => sum + chartElements.filter(ce => ce.tableId === lt.id).length, 0); + let hangingMessageCount = hangingTables.reduce((sum, lt) => sum + agentActions.filter(a => a.tableId === lt.id && !a.hidden).length, 0); + allThreadEntries.push({ + key: 'hanging-tables', + groupId: 'hanging-tables', + leafTables: hangingTables, + threadIdx: -1, + hideLabel: true, + }); + allThreadHeights.push(estimateThreadHeight(hangingTables.length, 0, hangingChartCount, hangingMessageCount)); } - let drawerOpen = threadDrawerOpen && (Object.keys(filteredLeafTableGroups).length > 0 || hangingTables.length > 0); - let allGroupsForWidth = { ...filteredLeafTableGroups, ...thread0Group }; - let collaposedViewWidth = Math.max(...Object.values(allGroupsForWidth).map(x => x.length)) > 1 ? 248 : 232 + // Regular threads: one per threaded leaf table + // Assign sub-thread numbering: split (promoted) threads get the main index (1, 2, ...), + // real leaf tables whose chain was split get a sub-index (1.1, 1.2, ...) 
+ let realThreadIdx = 0; // counter for main threads + // Pre-scan: find which real leaf each extra leaf belongs to + let extraLeafToRealLeaf = new Map(); + // Also build reverse: real leaf -> list of extra leaves in its chain + let realLeafToExtraLeaves = new Map(); + for (const lt of threadedTables) { + if (!extraLeafIds.has(lt.id)) { + // This is a real leaf — find all extra leaves that are ancestors of it + const triggers = getCachedTriggers(lt); + const chainIds = triggers.map(t => t.tableId); + const myExtras: string[] = []; + for (const extraId of extraLeafIds) { + if (chainIds.includes(extraId)) { + if (!extraLeafToRealLeaf.has(extraId)) { + extraLeafToRealLeaf.set(extraId, lt.id); + } + myExtras.push(extraId); + } + } + if (myExtras.length > 0) { + realLeafToExtraLeaves.set(lt.id, myExtras); + } + } + } + // Map from extra leaf id -> its assigned main thread index + let extraLeafToThreadIdx = new Map(); + // Track sub-index counters per chain (keyed by first extra leaf's thread idx) + let subThreadCounters = new Map(); + + threadedTables.forEach((lt, i) => { + const triggers = getCachedTriggers(lt); + + // Collect all table IDs in this thread's chain + let threadTableIds = new Set(); + triggers.forEach(t => threadTableIds.add(t.tableId)); + threadTableIds.add(lt.id); + + // Only new (unclaimed) tables contribute to this thread's height + let newTableIds = [...threadTableIds].filter(id => !claimedTableIds.has(id)); + + let newTriggerCount = triggers.filter(t => newTableIds.includes(t.resultTableId)).length; + let chartCount = newTableIds.reduce((sum, tid) => sum + chartElements.filter(ce => ce.tableId === tid).length, 0); + let messageCount = newTableIds.reduce((sum, tid) => sum + agentActions.filter(a => a.tableId === tid && !a.hidden).length, 0); + + // +1 table and +1 trigger for the leaf table itself + let totalTables = newTableIds.length + 1; + let totalTriggers = newTriggerCount + 1; + + // Claim this thread's tables for subsequent threads + threadTableIds.forEach(id => claimedTableIds.add(id)); + + // Determine thread label and whether this is a split sub-thread + const isSplit = extraLeafIds.has(lt.id); + // A real leaf is a "continuation" if it has extra leaves in its chain + const isContinuation = !isSplit && realLeafToExtraLeaves.has(lt.id); + let threadLabel: string; + let threadIdxForEntry: number; + + if (isSplit) { + // Promoted intermediate — gets a main thread index + realThreadIdx++; + extraLeafToThreadIdx.set(lt.id, realThreadIdx); + threadLabel = `thread - ${realThreadIdx}`; + threadIdxForEntry = realThreadIdx - 1; + } else if (isContinuation) { + // Real leaf whose chain was split — gets sub-index under the last extra leaf's index + const myExtras = realLeafToExtraLeaves.get(lt.id) || []; + // Use the last extra leaf's thread index (the one closest to this leaf in the chain) + const lastExtra = myExtras[myExtras.length - 1]; + const parentIdx = extraLeafToThreadIdx.get(lastExtra) ?? realThreadIdx; + const subIdx = (subThreadCounters.get(parentIdx) || 0) + 1; + subThreadCounters.set(parentIdx, subIdx); + threadLabel = `thread - ${parentIdx}.${subIdx}`; + threadIdxForEntry = i; + } else { + // Normal thread (no splitting involved) + realThreadIdx++; + threadLabel = `thread - ${realThreadIdx}`; + threadIdxForEntry = realThreadIdx - 1; + } - let view = - {/* Render thread0 (hanging tables) first if it exists - using compact view */} - {Object.entries(thread0Group).map(([groupId, leafTables], i) => { - return 1 ? 
'216px' : '200px', - transition: 'all 0.3s ease', - }} /> - })} - {/* Render regular threads (length > 1) */} - {Object.entries(filteredLeafTableGroups).map(([groupId, leafTables], i) => { - // Calculate used tables from thread0 and previous threads - let usedIntermediateTableIds = Object.values(thread0Group).flat() - .map(x => [ ...getTriggers(x, tables).map(y => y.tableId) || []]).flat(); - let usedLeafTableIds = Object.values(thread0Group).flat().map(x => x.id); - - // Add tables from previous regular threads - const previousThreadGroups = Object.values(filteredLeafTableGroups).slice(0, i); - usedIntermediateTableIds = [...usedIntermediateTableIds, ...previousThreadGroups.flat() - .map(x => [ ...getTriggers(x, tables).map(y => y.tableId) || []]).flat()]; - usedLeafTableIds = [...usedLeafTableIds, ...previousThreadGroups.flat().map(x => x.id)]; - - return 1 ? '216px' : '200px', - transition: 'all 0.3s ease', - }} /> - })} - + allThreadEntries.push({ + key: `thread-${lt.id}-${i}`, + groupId: lt.id, + leafTables: [lt], + threadIdx: threadIdxForEntry, + threadLabel, + isSplitThread: isContinuation, + }); + allThreadHeights.push(estimateThreadHeight(totalTables, totalTriggers, chartCount, messageCount)); + }); - // Calculate total thread count (thread0 + regular threads) - let totalThreadCount = Object.keys(filteredLeafTableGroups).length + (Object.keys(thread0Group).length > 0 ? 1 : 0); - let threadIndices: number[] = []; - if (Object.keys(thread0Group).length > 0) { - threadIndices.push(-1); // thread0 + // Pre-compute usedTableIds for each entry (avoids quadratic recomputation in renderThreadEntry) + { + let accumulated: string[] = []; + for (const entry of allThreadEntries) { + entry.usedTableIds = [...accumulated]; + for (const lt of entry.leafTables) { + const triggers = getCachedTriggers(lt); + accumulated.push(...triggers.map(t => t.tableId), lt.id); + } + } } - threadIndices.push(...Array.from({length: Object.keys(filteredLeafTableGroups).length}, (_, i) => i)); - - let jumpButtonsDrawerOpen = - {_.chunk(threadIndices, 3).map((group, groupIdx) => { - const getLabel = (idx: number) => idx === -1 ? '0' : String(idx + 1); - const startNum = getLabel(group[0]); - const endNum = getLabel(group[group.length - 1]); - const label = startNum === endNum ? startNum : `${startNum}-${endNum}`; - - return ( - - { - setTimeout(() => { - // Get currently most visible thread index - const viewportCenter = window.innerWidth / 2; - const currentIndex = Array.from(document.querySelectorAll('[data-thread-index]')).reduce((closest, element) => { - const rect = element.getBoundingClientRect(); - const distance = Math.abs(rect.left + rect.width/2 - viewportCenter); - const idx = parseInt(element.getAttribute('data-thread-index') || '0'); - if (!closest || distance < closest.distance) { - return { index: idx, distance }; - } - return closest; - }, null as { index: number, distance: number } | null)?.index || 0; - - // If moving from larger to smaller numbers (scrolling left), target first element - // If moving from smaller to larger numbers (scrolling right), target last element - const targetIndex = currentIndex > group[0] ? group[0] : group[group.length - 1]; - - const targetElement = document.querySelector(`[data-thread-index="${targetIndex}"]`); - if (targetElement) { - targetElement.scrollIntoView({ - behavior: 'smooth', - block: 'nearest', // Don't change vertical scroll - inline: currentIndex > group[group.length - 1] ? 
'start' : 'end' - }); - } - }, 100); - }} - > - {label} - - - ); - })} - - let jumpButtonDrawerClosed = - {threadIndices.map((threadIdx) => { - const label = threadIdx === -1 ? '0' : String(threadIdx + 1); - return ( - - { - const threadElement = document.querySelector(`[data-thread-index="${threadIdx}"]`); - threadElement?.scrollIntoView({ behavior: 'smooth' }); - }} - > - {label} - - - ); - })} - + // Pick the best column layout: balances scroll burden vs whitespace. + // Measure actual panel height from the DOM (accounts for browser zoom, panel resizing, etc.) + const availableHeight = containerRef.current?.clientHeight ?? 600; + const MAX_COLUMNS = 3; + const hasMultipleThreads = allThreadEntries.length > 1; + const columnLayout: number[][] = chooseBestColumnLayout( + allThreadHeights, MAX_COLUMNS, availableHeight, /* flexOrder */ false, + /* minColumns */ hasMultipleThreads ? 2 : 1 + ); + const actualColumns = columnLayout.length || 1; + + const CARD_WIDTH = hasMultipleThreads ? 220 : 220; + const CARD_GAP = 12; // padding + spacing between cards in a column + + let renderThreadEntry = (entry: ThreadEntry) => { + let usedTableIds = entry.usedTableIds || []; + + return ; + }; - let jumpButtons = drawerOpen ? jumpButtonsDrawerOpen : jumpButtonDrawerClosed; + // Column-based panel width: each column = CARD_WIDTH + CARD_GAP + const COLUMN_WIDTH = CARD_WIDTH + CARD_GAP; + const MIN_PANEL_WIDTH = 0; // ensure enough room for floating chat chip + const panelWidth = Math.max(actualColumns * COLUMN_WIDTH + 16, MIN_PANEL_WIDTH); - let carousel = ( - - 0 ? ( + + ) : null; + + let view = hasContent ? ( + + {/* First column: workspace panel + first batch of threads */} + - - - Data Threads - - {jumpButtons} - - - - - - { setThreadDrawerOpen(false); }}> - - - - - - - { - setThreadDrawerOpen(true); - }}> - - - - - + {workspacePanel} + {(columnLayout[0] || []).map((idx: number) => { + const entry = allThreadEntries[idx]; + return entry ? renderThreadEntry(entry) : null; + })} + {/* Remaining columns */} + {columnLayout.slice(1).map((columnIndices: number[], colIdx: number) => ( + + {columnIndices.map((idx: number) => { + const entry = allThreadEntries[idx]; + return entry ? renderThreadEntry(entry) : null; + })} + + ))} + + ) : null; - + {view} ); - - return carousel; } diff --git a/src/views/DataThreadCards.tsx b/src/views/DataThreadCards.tsx new file mode 100644 index 00000000..78c5e00a --- /dev/null +++ b/src/views/DataThreadCards.tsx @@ -0,0 +1,388 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +import React, { memo } from 'react'; + +import { + Box, + Divider, + Typography, + Stack, + Card, + IconButton, + Tooltip, + ButtonGroup, + useTheme, +} from '@mui/material'; + +import { dfActions } from '../app/dfSlice'; +import { Chart, DictTable, Trigger } from "../components/ComponentType"; + +import DeleteIcon from '@mui/icons-material/Delete'; +import AddchartIcon from '@mui/icons-material/Addchart'; +import TableRowsIcon from '@mui/icons-material/TableRowsOutlined'; +import AnchorIcon from '@mui/icons-material/Anchor'; +import CloudQueueIcon from '@mui/icons-material/CloudQueue'; +import SettingsIcon from '@mui/icons-material/Settings'; +import CloseIcon from '@mui/icons-material/Close'; +import HelpOutlineIcon from '@mui/icons-material/HelpOutline'; +import CheckCircleOutlineIcon from '@mui/icons-material/CheckCircleOutline'; +import CancelOutlinedIcon from '@mui/icons-material/CancelOutlined'; + +import { TriggerCard } from './EncodingShelfCard'; +import { ThinkingBanner } from './DataThread'; +import { ComponentBorderStyle, shadow, transition } from '../app/tokens'; + + +// ─── Agent Status Box ──────────────────────────────────────────────────────── + +export const AgentStatusBox = memo<{ + tableId: string; + relevantAgentActions: any[]; + dispatch: any; +}>(({ tableId, relevantAgentActions, dispatch }) => { + + let theme = useTheme(); + + let agentStatus = undefined; + + let getAgentStatusColor = (status: string) => { + switch (status) { + case 'running': + return `${theme.palette.text.secondary} !important`; + case 'completed': + return `${theme.palette.success.main} !important`; + case 'failed': + return `${theme.palette.error.main} !important`; + case 'warning': + return `${theme.palette.warning.main} !important`; + default: + return `${theme.palette.text.secondary} !important`; + } + } + + let currentActions = relevantAgentActions; + + if (currentActions.some(a => a.status == 'running')) { + agentStatus = 'running'; + } else if (currentActions.every(a => a.status == 'completed')) { + agentStatus = 'completed'; + } else if (currentActions.every(a => a.status == 'failed')) { + agentStatus = 'failed'; + } else { + agentStatus = 'warning'; + } + + if (currentActions.length === 0) { + return null; + } + + return ( + + {( + + {agentStatus === 'running' && ThinkingBanner('thinking...', { py: 0.5 })} + {agentStatus === 'completed' && } + {agentStatus === 'failed' && } + {agentStatus === 'warning' && } + + {agentStatus === 'warning' && 'hmm...'} + {agentStatus === 'failed' && 'oops...'} + {agentStatus === 'completed' && 'completed'} + {agentStatus === 'running' && ''} + + + { + event.stopPropagation(); + dispatch(dfActions.deleteAgentWorkInProgress(relevantAgentActions[0].actionId)); + }} + > + + + + + )} + {currentActions.map((a, index, array) => { + let descriptions = String(a.description).split('\n'); + return ( + + + {descriptions.map((line: string, lineIndex: number) => ( + + + {line} + + {lineIndex < descriptions.length - 1 && } + + ))} + + {index < array.length - 1 && array.length > 1 && ( + + )} + + ) + })} + + ); +}); + +// ─── Chart Card ────────────────────────────────────────────────────────────── + +export let buildChartCard = ( + chartElement: { tableId: string, chartId: string, element: any }, + focusedChartId?: string, + unread?: boolean +) => { + let selectedClassName = focusedChartId == chartElement.chartId ? 
'selected-card' : ''; + return + {chartElement.element} + +} + +// ─── Trigger Card Wrapper ──────────────────────────────────────────────────── + +export let buildTriggerCard = ( + trigger: Trigger, + focusedChartId: string | undefined, + highlighted: boolean = false, +) => { + let selectedClassName = trigger.chart?.id == focusedChartId ? 'selected-card' : ''; + + let triggerCard =
+ + + +
; + + return + {triggerCard} + ; +} + +// ─── Table Card ────────────────────────────────────────────────────────────── + +export interface BuildTableCardProps { + tableId: string; + tables: DictTable[]; + charts: Chart[]; + chartElements: { tableId: string, chartId: string, element: any }[]; + usedIntermediateTableIds: string[]; + highlightedTableIds: string[]; + agentActions: any[]; + focusedTableId: string | undefined; + focusedChartId: string | undefined; + focusedChart: Chart | undefined; + parentTable: DictTable | undefined; + tableIdList: string[]; + collapsed: boolean; + scrollRef: any; + dispatch: any; + handleOpenTableMenu: (table: DictTable, anchorEl: HTMLElement) => void; + primaryBgColor: string | undefined; +} + +export let buildTableCard = (props: BuildTableCardProps) => { + const { + tableId, tables, charts, chartElements, usedIntermediateTableIds, + highlightedTableIds, agentActions, focusedTableId, focusedChartId, focusedChart, + parentTable, tableIdList, collapsed, scrollRef, dispatch, + handleOpenTableMenu, primaryBgColor, + } = props; + + if (parentTable && tableId == parentTable.id && parentTable.anchored && tableIdList.length > 1) { + let table = tables.find(t => t.id == tableId); + return + { + event.stopPropagation(); + dispatch(dfActions.setFocusedTable(tableId)); + + // Find and set the first chart associated with this table + let firstRelatedChart = charts.find((c: Chart) => c.tableRef == tableId && c.source != "trigger"); + + if (firstRelatedChart) { + dispatch(dfActions.setFocusedChart(firstRelatedChart.id)); + } + }} + > + + + + {table?.displayId || tableId} + + + + + } + + // filter charts relevant to this + let relevantCharts = chartElements.filter(ce => ce.tableId == tableId && !usedIntermediateTableIds.includes(tableId)); + + let table = tables.find(t => t.id == tableId); + + let selectedClassName = tableId == focusedTableId ? 'selected-card' : ''; + + let collapsedProps = collapsed ? { width: '50%', "& canvas": { width: 60, maxHeight: 50 } } : { width: '100%' } + + let releventChartElements = relevantCharts.map((ce, j) => + + {buildChartCard(ce, focusedChartId, charts.find(c => c.id == ce.chartId)?.unread)} + ) + + const isHighlighted = highlightedTableIds.includes(tableId); + + let regularTableBox = c.chartId == focusedChartId) ? scrollRef : null} + sx={{ padding: '0px' }}> + { + dispatch(dfActions.setFocusedTable(tableId)); + if (focusedChart?.tableRef != tableId) { + let firstRelatedChart = charts.find((c: Chart) => c.tableRef == tableId && c.source != 'trigger'); + if (firstRelatedChart) { + dispatch(dfActions.setFocusedChart(firstRelatedChart.id)); + } + } + }}> + + + + {table?.displayId || tableId} + + + + + { + event.stopPropagation(); + handleOpenTableMenu(table!, event.currentTarget); + }} + > + + + + + { + event.stopPropagation(); + dispatch(dfActions.setFocusedTable(tableId)); + dispatch(dfActions.setFocusedChart(undefined)); + }} + > + + + + + + + + + let relevantAgentActions = agentActions.filter(a => a.tableId == tableId).filter(a => a.hidden == false); + + let agentActionBox = ( + + ) + + return [ + regularTableBox, + ...releventChartElements, + ...(relevantAgentActions.length > 0 ? 
[ + + {agentActionBox} + + ] : []) + ] +} diff --git a/src/views/EncodingBox.tsx b/src/views/EncodingBox.tsx index d549a715..03fd500e 100644 --- a/src/views/EncodingBox.tsx +++ b/src/views/EncodingBox.tsx @@ -50,7 +50,7 @@ import _ from 'lodash'; import '../scss/EncodingShelf.scss'; import AnimateHeight from 'react-animate-height'; import { getIconFromDtype, getIconFromType, groupConceptItems } from './ViewUtils'; -import { getUrls } from '../app/utils'; +import { getUrls, fetchWithIdentity } from '../app/utils'; import { Type } from '../data/types'; @@ -327,7 +327,7 @@ export const EncodingBox: FC = function EncodingBox({ channel, }), }; - fetch(getUrls().SORT_DATA_URL, message) + fetchWithIdentity(getUrls().SORT_DATA_URL, message) .then((response) => response.json()) .then((data) => { setAutoSortInferRunning(false); diff --git a/src/views/EncodingShelfCard.tsx b/src/views/EncodingShelfCard.tsx index cba27b61..663277ee 100644 --- a/src/views/EncodingShelfCard.tsx +++ b/src/views/EncodingShelfCard.tsx @@ -28,6 +28,7 @@ import { useTheme, SxProps, Theme, + Slider, CircularProgress, Button, Dialog, @@ -45,7 +46,7 @@ import _ from 'lodash'; import '../scss/EncodingShelf.scss'; import { createDictTable, DictTable } from "../components/ComponentType"; -import { getUrls, resolveChartFields, getTriggers, assembleVegaChart, resolveRecommendedChart } from '../app/utils'; +import { getUrls, resolveChartFields, getTriggers, assembleVegaChart, resolveRecommendedChart, fetchWithIdentity } from '../app/utils'; import { EncodingBox } from './EncodingBox'; import { ChannelGroups, CHART_TEMPLATES, getChartChannels, getChartTemplate } from '../components/ChartTemplates'; @@ -57,6 +58,8 @@ import CheckIcon from '@mui/icons-material/Check'; import { ThinkingBanner } from './DataThread'; import { AppDispatch } from '../app/store'; +import { borderColor, transition, radius } from '../app/tokens'; + import PrecisionManufacturing from '@mui/icons-material/PrecisionManufacturing'; import { Type } from '../data/types'; import DeleteIcon from '@mui/icons-material/Delete'; @@ -116,7 +119,7 @@ export const renderTextWithEmphasis = (text: string, highlightChipSx?: SxProps @@ -133,7 +136,8 @@ export const TriggerCard: FC<{ trigger: Trigger, hideFields?: boolean, mini?: boolean, - sx?: SxProps}> = function ({ className, trigger, hideFields, mini = false, sx }) { + highlighted?: boolean, + sx?: SxProps}> = function ({ className, trigger, hideFields, mini = false, highlighted = false, sx }) { let theme = useTheme(); @@ -175,8 +179,9 @@ export const TriggerCard: FC<{ ] }) @@ -191,62 +196,46 @@ export const TriggerCard: FC<{ // Process the prompt to highlight content in ** ** const processedPrompt = renderTextWithEmphasis(prompt, { - fontSize: mini ? 10 : 12, padding: '1px 4px', - borderRadius: '4px', + fontSize: mini ? 10 : 11, padding: '1px 4px', + borderRadius: radius.sm, background: alpha(theme.palette.custom.main, 0.08), }); if (mini) { return {processedPrompt} {hideFields ? "" : encodingComp} } - return - - {hideFields ? "" : {encodingComp}} - - {prompt.length > 0 && } - {processedPrompt} - - - + + {processedPrompt} + {hideFields ? "" : <>{" "}{encodingComp}} + } @@ -392,7 +381,7 @@ export const EncodingShelfCard: FC = function ({ chartId } let chartAvailable = checkChartAvailability(chart, conceptShelfItems, currentTable.rows); - let currentChartPng = chartAvailable ? 
await vegaLiteSpecToPng(assembleVegaChart(chart.chartType, chart.encodingMap, activeFields, currentTable.rows, currentTable.metadata, 20)) : undefined; + let currentChartPng = chartAvailable ? await vegaLiteSpecToPng(assembleVegaChart(chart.chartType, chart.encodingMap, activeFields, currentTable.rows, currentTable.metadata, 20, false, 100, 80, false, chart.config)) : undefined; let actionTables = actionTableIds.map(id => tables.find(t => t.id == id) as DictTable); @@ -405,7 +394,6 @@ export const EncodingShelfCard: FC = function ({ chartId rows: t.rows, attached_metadata: t.attachedMetadata })), - language: currentTable.virtual ? "sql" : "python", exploration_thread: explorationThread, current_data_sample: currentTable.rows.slice(0, 10), current_chart: currentChartPng, @@ -417,7 +405,7 @@ export const EncodingShelfCard: FC = function ({ chartId const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), 30000); // 30 second timeout - const response = await fetch(engine, { + const response = await fetchWithIdentity(engine, { method: 'POST', headers: { 'Content-Type': 'application/json', @@ -563,9 +551,7 @@ export const EncodingShelfCard: FC = function ({ chartId chart_encodings: mode == 'formulate' ? activeSimpleEncodings : {}, extra_prompt: instruction, model: activeModel, - max_repair_attempts: config.maxRepairAttempts, - agent_coding_rules: agentRules.coding, - language: actionTables.some(t => t.virtual) ? "sql" : "python" + agent_coding_rules: agentRules.coding }) let engine = getUrls().DERIVE_DATA; @@ -598,9 +584,7 @@ export const EncodingShelfCard: FC = function ({ chartId extra_prompt: instruction, model: activeModel, additional_messages: additionalMessages, - max_repair_attempts: config.maxRepairAttempts, - agent_coding_rules: agentRules.coding, - language: actionTables.some(t => t.virtual) ? "sql" : "python" + agent_coding_rules: agentRules.coding }); engine = getUrls().DERIVE_DATA; } else { @@ -619,9 +603,7 @@ export const EncodingShelfCard: FC = function ({ chartId latest_data_sample: currentTable.rows.slice(0, 10), new_instruction: instruction, model: activeModel, - max_repair_attempts: config.maxRepairAttempts, - agent_coding_rules: agentRules.coding, - language: actionTables.some(t => t.virtual) ? "sql" : "python" + agent_coding_rules: agentRules.coding }) engine = getUrls().REFINE_DATA; } @@ -641,7 +623,7 @@ export const EncodingShelfCard: FC = function ({ chartId const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), config.formulateTimeoutSeconds * 1000); - fetch(engine, {...message, signal: controller.signal }) + fetchWithIdentity(engine, {...message, signal: controller.signal }) .then((response: Response) => response.json()) .then((data) => { @@ -715,7 +697,8 @@ export const EncodingShelfCard: FC = function ({ chartId candidateTableId, rows, { - code: code, + code: code, + outputVariable: refinedGoal['output_variable'] || 'result_df', source: actionTableIds, dialog: dialog, trigger: currentTrigger @@ -951,7 +934,7 @@ export const EncodingShelfCard: FC = function ({ chartId alignItems: 'center', gap: 1, padding: '4px 8px', - borderBottom: '1px solid rgba(0, 0, 0, 0.08)', + borderBottom: `1px solid ${borderColor.component}`, backgroundColor: 'rgba(0, 0, 0, 0.02)' }}> = function ({ chartId fontSize: 11, cursor: 'pointer', padding: '2px 6px', - borderRadius: 1, + borderRadius: radius.sm, backgroundColor: ideateMode ? 'rgba(25, 118, 210, 0.08)' : 'transparent', color: ideateMode ? 
'primary.main' : 'text.secondary', fontWeight: ideateMode ? 500 : 400, - transition: 'all 0.2s ease', + transition: transition.fast, '&:hover': { backgroundColor: ideateMode ? 'rgba(25, 118, 210, 0.12)' : 'rgba(0, 0, 0, 0.04)' } @@ -1003,11 +986,11 @@ export const EncodingShelfCard: FC = function ({ chartId fontSize: 11, cursor: 'pointer', padding: '2px 6px', - borderRadius: 1, + borderRadius: radius.sm, backgroundColor: !ideateMode ? 'rgba(25, 118, 210, 0.08)' : 'transparent', color: !ideateMode ? 'primary.main' : 'text.secondary', fontWeight: !ideateMode ? 500 : 400, - transition: 'all 0.2s ease', + transition: transition.fast, '&:hover': { backgroundColor: !ideateMode ? 'rgba(25, 118, 210, 0.12)' : 'rgba(0, 0, 0, 0.04)' } @@ -1110,6 +1093,103 @@ export const EncodingShelfCard: FC = function ({ chartId })} + {/* Template-driven config property selectors */} + {(() => { + const template = getChartTemplate(chart.chartType); + const configProps = template?.configProperties; + if (!configProps || configProps.length === 0) return null; + return ( + + {configProps.map((propDef) => { + if (propDef.type === 'slider') { + const currentValue = chart.config?.[propDef.key] ?? propDef.defaultValue ?? propDef.min ?? 0; + return ( + + + {propDef.label} + + { + dispatch(dfActions.updateChartConfig({chartId, key: propDef.key, value: newValue as number})); + }} + valueLabelDisplay="auto" + sx={{ + flex: 1, height: 4, mx: 0.5, + '& .MuiSlider-thumb': { width: 12, height: 12 }, + '& .MuiSlider-valueLabel': { fontSize: 10, padding: '2px 4px', lineHeight: 1.2 }, + }} + /> + + {currentValue} + + + ); + } + if (propDef.type !== 'select' || !propDef.options) return null; + const currentValue = chart.config?.[propDef.key] ?? propDef.defaultValue; + const options = propDef.options; + // Find the index of the current value in options (deep compare via JSON) + const currentSerialized = JSON.stringify(currentValue); + let selectedIndex = options.findIndex(o => JSON.stringify(o.value) === currentSerialized); + if (selectedIndex < 0) selectedIndex = 0; + return ( + + + {propDef.label} + + + + ); + })} + + ); + })()}
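+                    {/* Encoding shelves for each visual channel are rendered below the template-driven config controls */}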
{encodingBoxGroups} @@ -1123,6 +1203,7 @@ export const EncodingShelfCard: FC = function ({ chartId maxWidth: "400px", display: 'flex', flexDirection: 'column', + borderColor: borderColor.component, backgroundColor: trigger ? "rgba(255, 160, 122, 0.07)" : "" }}> diff --git a/src/views/EncodingShelfThread.tsx b/src/views/EncodingShelfThread.tsx index 415da047..a31e5312 100644 --- a/src/views/EncodingShelfThread.tsx +++ b/src/views/EncodingShelfThread.tsx @@ -3,50 +3,41 @@ import { FC, useState } from 'react' import { useSelector, useDispatch } from 'react-redux' -import { DataFormulatorState, dfActions, dfSelectors, fetchCodeExpl, fetchFieldSemanticType, generateFreshChart } from '../app/dfSlice'; +import { DataFormulatorState, dfActions, dfSelectors } from '../app/dfSlice'; import { Box, Typography, Button, - CircularProgress, - IconButton, Tooltip, Collapse, - Stack, Card, - ListItemIcon, } from '@mui/material'; import React from 'react'; -import { EncodingItem, Chart, Trigger } from "../components/ComponentType"; +import { Chart, Trigger } from "../components/ComponentType"; -import _ from 'lodash'; import '../scss/EncodingShelf.scss'; import { DictTable } from "../components/ComponentType"; import { Type } from '../data/types'; -import embed from 'vega-embed'; -import { getTriggers, assembleVegaChart } from '../app/utils'; +import { getTriggers } from '../app/utils'; import { getChartTemplate } from '../components/ChartTemplates'; import { checkChartAvailability, generateChartSkeleton } from './VisualizationView'; import TableRowsIcon from '@mui/icons-material/TableRowsOutlined'; import InsightsIcon from '@mui/icons-material/Insights'; -import AnchorIcon from '@mui/icons-material/Anchor'; import ChevronLeftIcon from '@mui/icons-material/ChevronLeft'; import ChevronRightIcon from '@mui/icons-material/ChevronRight'; -import ExpandMoreIcon from '@mui/icons-material/ExpandMore'; -import ExpandLessIcon from '@mui/icons-material/ExpandLess'; import { AppDispatch } from '../app/store'; import { EncodingShelfCard, TriggerCard } from './EncodingShelfCard'; -import { useTheme } from '@mui/material/styles'; +import { useTheme, alpha } from '@mui/material/styles'; // Property and state of an encoding shelf export interface EncodingShelfThreadProps { @@ -80,83 +71,27 @@ export let ChartElementFC: FC<{ } - // if (chart.chartType == "Table") { - // return renderTableChart(chart, conceptShelfItems, tableRows); - // } - - // prepare the chart to be rendered - let assembledChart = assembleVegaChart(chart.chartType, chart.encodingMap, conceptShelfItems, tableRows, tableMetadata, 20); - assembledChart["background"] = "transparent"; - // chart["autosize"] = { - // "type": "fit", - // "contains": "padding" - // }; - - const id = `chart-thumbnail-${chart.id}-${(Math.random() + 1).toString(36).substring(7)}`; - const element = ; - - // Temporary fix, down sample the dataset - if (assembledChart["data"]["values"].length > 5000) { - let values = assembledChart["data"]["values"]; - assembledChart = (({ data, ...o }) => o)(assembledChart); - - let getRandom = (seed: number) => { - let x = Math.sin(seed++) * 10000; - return x - Math.floor(x); - } - let getRandomSubarray = (arr: any[], size: number) => { - let shuffled = arr.slice(0), i = arr.length, temp, index; - while (i--) { - index = Math.floor((i + 1) * getRandom(233 * i + 888)); - temp = shuffled[index]; - shuffled[index] = shuffled[i]; - shuffled[i] = temp; - } - return shuffled.slice(0, size); - } - assembledChart["data"] = { "values": 
getRandomSubarray(values, 5000) }; - } - - assembledChart['config'] = { - "axis": {"labelLimit": 30} + // Use cached thumbnail from ChartRenderService when available + if (chart.thumbnail) { + return ( + + {`${chart.chartType} + + ); } - embed('#' + id, assembledChart, { actions: false, renderer: "canvas" }).then(function (result) { - // Access the Vega view instance (https://vega.github.io/vega/docs/api/view/) as result.view - if (result.view.container()?.getElementsByTagName("canvas")) { - let comp = result.view.container()?.getElementsByTagName("canvas")[0]; - - // Doesn't seem like width & height are actual numbers here on Edge bug - // let width = parseInt(comp?.style.width as string); - // let height = parseInt(comp?.style.height as string); - if (comp) { - const { width, height } = comp.getBoundingClientRect(); - //console.log(`THUMB: width = ${width} height = ${height}`); - - if (width > WIDTH || height > HEIGHT) { - let ratio = width / height; - let fixedWidth = width; - if (ratio * HEIGHT < width) { - fixedWidth = ratio * HEIGHT; - } - if (fixedWidth > WIDTH) { - fixedWidth = WIDTH; - } - //console.log("THUMB: width or height are oversized"); - //console.log(`THUMB: new width = ${fixedWidth}px height = ${fixedWidth / ratio}px`) - comp?.setAttribute("style", - `max-width: ${WIDTH}px; max-height: ${HEIGHT}px; width: ${Math.round(fixedWidth)}px; height: ${Math.round(fixedWidth / ratio)}px; `); - } - } else { - console.log("THUMB: Could not get Canvas HTML5 element") - } - } - }).catch((reason) => { - // console.log(reason) - // console.error(reason) - }); - - return element; + // Fallback: skeleton while ChartRenderService is processing + return ( + + {generateChartSkeleton(chartTemplate?.icon, 48, 48, 0.3)} + + ); } export const EncodingShelfThread: FC = function ({ chartId }) { @@ -173,96 +108,105 @@ export const EncodingShelfThread: FC = function ({ cha const dispatch = useDispatch(); - const interleaveArrays: any = (a: any[], b: any[], spaceElement?: any): any[] => { - if (a.length === 0) return b; - // Filter out null/undefined and empty strings to avoid key warnings - const result = [a[0], ...interleaveArrays(b, a.slice(1), spaceElement)]; - return result.filter(x => x !== null && x !== undefined && x !== ''); - }; + const theme = useTheme(); + const TIMELINE_WIDTH = 16; + const dashedColor = 'rgba(0,0,0,0.15)'; + const dashedWidth = '1px'; + const dashedStyle = 'dashed'; let previousInstructions : any = "" - let buildTableCard = (tableId: string) => { + let buildTimelineTableRow = (tableId: string, isFirst: boolean, isLast: boolean) => { let table = tables.find(t => t.id == tableId) as DictTable; - return
- -
- } +
+
+ ); + }; + + let buildTimelineTriggerRow = (trigger: Trigger) => { + const triggerColor = alpha(theme.palette.custom.main, 0.4); + return ( + + + + + + + + + + + ); + }; + + let buildTimelineEllipsisRow = () => ( + + + + + + + + + ); let tableList = activeTableThread.map((tableId) => { let table = tables.find(t => t.id == tableId) as DictTable; if (!table) { return null; } - return buildTableCard(tableId); - }).filter(x => x !== null); + return tableId; + }).filter(x => x !== null) as string[]; let leafTable = tables.find(t => t.id == activeTableThread[activeTableThread.length - 1]) as DictTable; - let triggers = getTriggers(leafTable, tables) + let triggers = getTriggers(leafTable, tables) - let instructionCards = triggers.map((trigger, i) => { - let extractActiveFields = (t: Trigger) => { - let encodingMap = allCharts.find(c => c.id == t.chart?.id)?.encodingMap; - if (!encodingMap) { - return []; - } - return Array.from(Object.values(encodingMap)).map((enc: EncodingItem) => enc.fieldID).filter(x => x != undefined) - }; - let previousActiveFields = new Set(i == 0 ? [] : extractActiveFields(triggers[i - 1])) - let currentActiveFields = new Set(extractActiveFields(trigger)) - let fieldsIdentical = _.isEqual(previousActiveFields, currentActiveFields) - return - - - - + // Simplified timeline: source table → (…) → last trigger → current table + let timelineRows: React.ReactNode[] = []; + if (tableList.length > 0) { + // Source table + timelineRows.push(buildTimelineTableRow(tableList[0], true, false)); + // Ellipsis if intermediate steps were skipped + if (tableList.length > 2) { + timelineRows.push(buildTimelineEllipsisRow()); + } + // Most recent trigger (if any) + if (triggers.length > 0) { + timelineRows.push(buildTimelineTriggerRow(triggers[triggers.length - 1])); + } + // Current table (if different from source) + if (tableList.length > 1) { + timelineRows.push(buildTimelineTableRow(tableList[tableList.length - 1], false, true)); + } + } + + previousInstructions = ( + + {timelineRows} - }) - - let spaceElement = "" //; - - let truncated = tableList.length > 3; - - previousInstructions = truncated ? - - {tableList[0]} - - - ... 
- - - {tableList[tableList.length - 3]} - {instructionCards[instructionCards.length - 2]} - {tableList[tableList.length - 2]} - {instructionCards[instructionCards.length - 1]} - {tableList[tableList.length - 1]} - - : - - {interleaveArrays(tableList, instructionCards, spaceElement)} - ; + ); let postInstruction : any = ""; if (chartTrigger) { @@ -282,43 +226,39 @@ export const EncodingShelfThread: FC = function ({ cha }) postInstruction = - - - - {buildTableCard(resultTable.id)} - - - - - {endChartCards} - - + {buildTimelineTableRow(resultTable.id, false, endChartCards.length === 0)} + {endChartCards.length > 0 && ( + + + + + + {endChartCards} + + + )} + } + // Connector between previousInstructions and EncodingShelfCard + const timelineConnector = ( + + + + + + ); + const encodingShelf = ( - {[ - - {previousInstructions} - , - ]} - - - + {previousInstructions} + {timelineConnector} {postInstruction} diff --git a/src/views/ExplComponents.tsx b/src/views/ExplComponents.tsx index 22cb5784..e6a91b55 100644 --- a/src/views/ExplComponents.tsx +++ b/src/views/ExplComponents.tsx @@ -13,6 +13,7 @@ import { } from '@mui/material'; import { styled } from '@mui/material/styles'; import { alpha } from '@mui/material/styles'; +import { borderColor, shadow, transition, radius } from '../app/tokens'; import 'katex/dist/katex.min.css'; import { InlineMath, BlockMath } from 'react-katex'; @@ -129,12 +130,12 @@ const ConceptExplanationCard = styled(Card, { margin: '4px', borderRadius: '6px', - border: `1px solid ${alpha(theme.palette.divider, 0.2)}`, - boxShadow: '0 1px 2px rgba(0,0,0,0.05)', - transition: 'all 0.2s ease-in-out', + border: `1px solid ${borderColor.divider}`, + boxShadow: shadow.sm, + transition: transition.normal, backgroundColor: alpha(theme.palette.background.paper, 0.9), '&:hover': { - boxShadow: '0 2px 6px rgba(0,0,0,0.1)', + boxShadow: shadow.lg, borderColor: !secondary ? 
theme.palette.primary.light : theme.palette.secondary.light, transform: 'translateY(-1px)', }, @@ -224,8 +225,7 @@ export const ConceptExplCards: FC = ({ justifyContent: 'center', marginTop: 1, paddingTop: 1, - borderTop: '1px solid', - borderColor: 'divider', + borderTop: `1px solid ${borderColor.divider}`, }}> diff --git a/src/views/MessageSnackbar.tsx b/src/views/MessageSnackbar.tsx index ee17fb23..fe993aab 100644 --- a/src/views/MessageSnackbar.tsx +++ b/src/views/MessageSnackbar.tsx @@ -9,6 +9,7 @@ import CloseIcon from '@mui/icons-material/Close'; import { DataFormulatorState, dfActions } from '../app/dfSlice'; import { useDispatch, useSelector } from 'react-redux'; import { Alert, alpha, Box, Chip, Collapse, Divider, Paper, Tooltip, Typography } from '@mui/material'; +import { shadow, transition } from '../app/tokens'; import InfoIcon from '@mui/icons-material/Info'; import AssignmentIcon from '@mui/icons-material/Assignment'; import DeleteIcon from '@mui/icons-material/Delete'; @@ -136,8 +137,8 @@ export function MessageSnackbar() { }, border: '1px solid', - boxShadow: '0 0 10px rgba(0,0,0,0.1)', - transition: 'all 0.3s ease' + boxShadow: shadow.xl, + transition: transition.slow }}}> { @@ -114,7 +114,7 @@ export const ModelSelectionButton: React.FC<{}> = ({ }) => { useEffect(() => { const fetchModelOptions = async () => { try { - const response = await fetch(getUrls().CHECK_AVAILABLE_MODELS); + const response = await fetchWithIdentity(getUrls().CHECK_AVAILABLE_MODELS); const data = await response.json(); // Group models by provider @@ -163,7 +163,7 @@ export const ModelSelectionButton: React.FC<{}> = ({ }) => { model: model, }), }; - fetch(getUrls().TEST_MODEL, {...message }) + fetchWithIdentity(getUrls().TEST_MODEL, {...message }) .then((response) => response.json()) .then((data) => { let status = data["status"] || 'error'; diff --git a/src/views/RefreshDataDialog.tsx b/src/views/RefreshDataDialog.tsx index 3a51f636..063f7323 100644 --- a/src/views/RefreshDataDialog.tsx +++ b/src/views/RefreshDataDialog.tsx @@ -2,6 +2,7 @@ // Licensed under the MIT License. import React, { useState, useCallback, useRef } from 'react'; +import { borderColor, transition, radius } from '../app/tokens'; import { Dialog, DialogTitle, @@ -399,7 +400,7 @@ export const RefreshDataDialog: React.FC = ({ {isLoading && } - + @@ -429,7 +430,7 @@ export const RefreshDataDialog: React.FC = ({ p: 1, backgroundColor: alpha(theme.palette.text.secondary, 0.04), borderRadius: 1, - border: `1px solid ${alpha(theme.palette.divider, 0.5)}` + border: `1px solid ${borderColor.divider}` }}> Large content ({Math.round(pasteContent.length / 1000)}KB) • {showFullContent ? 'Full view' : 'Preview'} @@ -505,12 +506,12 @@ export const RefreshDataDialog: React.FC = ({ { true, config.defaultChartWidth, config.defaultChartHeight, - true + true, + chart.config ); // Create a temporary container for embedding @@ -718,11 +720,10 @@ export const ReportView: FC = () => { model: model, input_tables: inputTables, charts: validCharts, - style: style, - language: tables.some(t => t.virtual) ? 
"sql" : "python" + style: style }; - const response = await fetch(getUrls().GENERATE_REPORT_STREAM, { + const response = await fetchWithIdentity(getUrls().GENERATE_REPORT_STREAM, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(requestBody) @@ -850,13 +851,13 @@ export const ReportView: FC = () => { backgroundColor: 'rgba(255, 255, 255, 0.9)', backdropFilter: 'blur(12px)', border: '1px solid', - borderColor: 'rgba(0, 0, 0, 0.08)', - boxShadow: '0 2px 8px rgba(0, 0, 0, 0.06)', + borderColor: borderColor.view, + boxShadow: shadow.lg, '&:hover': { backgroundColor: 'rgba(255, 255, 255, 0.95)', - borderColor: 'rgba(0, 0, 0, 0.12)', - boxShadow: '0 4px 12px rgba(0, 0, 0, 0.1)', - transition: 'all 0.2s ease-in-out' + borderColor: borderColor.view, + boxShadow: shadow.xl, + transition: transition.normal }, '.MuiTypography-root': { fontSize: '1rem', @@ -1001,10 +1002,10 @@ export const ReportView: FC = () => { cursor: 'pointer', position: 'relative', overflow: 'hidden', backgroundColor: selectedChartIds.has(chart.id) ? alpha(theme.palette.primary.main, 0.08) : 'background.paper', border: selectedChartIds.has(chart.id) ? '2px solid' : '1px solid', - borderColor: selectedChartIds.has(chart.id) ? 'primary.main' : 'divider', + borderColor: selectedChartIds.has(chart.id) ? 'primary.main' : borderColor.divider, '&:hover': { backgroundColor: 'action.hover', boxShadow: 3, - transform: 'translateY(-2px)', transition: 'all 0.2s ease-in-out' + transform: 'translateY(-2px)', transition: transition.normal }, }} onClick={() => toggleChartSelection(chart.id)} @@ -1080,8 +1081,7 @@ export const ReportView: FC = () => { display: 'flex', overflowY: 'auto', flexDirection: 'column', - borderRight: 1, - borderColor: 'divider', + borderRight: `1px solid ${borderColor.view}`, height: 'fit-content', background: alpha(theme.palette.background.paper, 0.9), }}> diff --git a/src/views/SelectableDataGrid.tsx b/src/views/SelectableDataGrid.tsx index fcf04731..a344dbbd 100644 --- a/src/views/SelectableDataGrid.tsx +++ b/src/views/SelectableDataGrid.tsx @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
import * as React from 'react'; +import { shadow, transition } from '../app/tokens'; import { TableVirtuoso } from 'react-virtuoso'; import Table from '@mui/material/Table'; import TableBody from '@mui/material/TableBody'; @@ -28,7 +29,7 @@ import CasinoIcon from '@mui/icons-material/Casino'; import ArrowUpwardIcon from '@mui/icons-material/ArrowUpward'; import ArrowDownwardIcon from '@mui/icons-material/ArrowDownward'; import UnfoldMoreIcon from '@mui/icons-material/UnfoldMore'; -import { getUrls } from '../app/utils'; +import { getUrls, fetchWithIdentity } from '../app/utils'; import { useDrag } from 'react-dnd'; import { useSelector } from 'react-redux'; import { DataFormulatorState } from '../app/dfSlice'; @@ -178,7 +179,7 @@ const DraggableHeader: React.FC = ({ ...(field && { '&:hover': { backgroundColor: hoverBackgroundColor, - boxShadow: '0 2px 4px rgba(0,0,0,0.08)', + boxShadow: shadow.md, }, }), }} @@ -298,7 +299,7 @@ export const SelectableDataGrid: React.FC = ({ } // Use the SAMPLE_TABLE endpoint with appropriate ordering - fetch(getUrls().SAMPLE_TABLE, { + fetchWithIdentity(getUrls().SAMPLE_TABLE, { method: 'POST', headers: { 'Content-Type': 'application/json', diff --git a/src/views/TableSelectionView.tsx b/src/views/TableSelectionView.tsx index a7ceab50..64b6a240 100644 --- a/src/views/TableSelectionView.tsx +++ b/src/views/TableSelectionView.tsx @@ -7,6 +7,7 @@ import { useEffect, useState, useMemo } from 'react'; import Typography from '@mui/material/Typography'; import Box from '@mui/material/Box'; import { Button, Chip } from '@mui/material'; +import { borderColor } from '../app/tokens'; import StreamIcon from '@mui/icons-material/Stream'; import { createTableFromFromObjectArray } from '../data/utils'; import { MultiTablePreview } from './MultiTablePreview'; @@ -89,8 +90,7 @@ export const DatasetSelectionView: React.FC = functio width: 180, display: 'flex', flexDirection: 'column', - borderRight: 1, - borderColor: 'divider', + borderRight: `1px solid ${borderColor.view}`, overflow: 'hidden', height: '100%' }}> diff --git a/src/views/UnifiedDataUploadDialog.tsx b/src/views/UnifiedDataUploadDialog.tsx index ecb80930..a3928c8d 100644 --- a/src/views/UnifiedDataUploadDialog.tsx +++ b/src/views/UnifiedDataUploadDialog.tsx @@ -3,6 +3,7 @@ import * as React from 'react'; import { useState, useCallback, useEffect, useRef } from 'react'; +import { borderColor, transition, radius } from '../app/tokens'; import { Box, Button, @@ -35,11 +36,12 @@ import StreamIcon from '@mui/icons-material/Stream'; import { useDispatch, useSelector } from 'react-redux'; import { DataFormulatorState, dfActions, fetchFieldSemanticType } from '../app/dfSlice'; import { AppDispatch } from '../app/store'; +import { loadTable } from '../app/tableThunks'; import { DataSourceConfig, DictTable } from '../components/ComponentType'; import { createTableFromFromObjectArray, createTableFromText, loadTextDataWrapper, loadBinaryDataWrapper } from '../data/utils'; import { DataLoadingChat } from './DataLoadingChat'; import { DatasetSelectionView, DatasetMetadata } from './TableSelectionView'; -import { getUrls } from '../app/utils'; +import { getUrls, fetchWithIdentity } from '../app/utils'; import { DBManagerPane } from './DBTableManager'; import { MultiTablePreview } from './MultiTablePreview'; import { @@ -99,11 +101,10 @@ const DataSourceCard: React.FC = ({ sx={{ p: 1.5, cursor: disabled ? 
'not-allowed' : 'pointer', - border: '1px solid', - borderColor: 'divider', - borderRadius: 1, + border: `1px solid ${borderColor.divider}`, + borderRadius: radius.sm, opacity: disabled ? 0.5 : 1, - transition: 'all 0.15s ease', + transition: transition.fast, display: 'flex', alignItems: 'center', gap: 1.5, @@ -465,12 +466,16 @@ export const UnifiedDataUploadDialog: React.FC = ( const existingTables = useSelector((state: DataFormulatorState) => state.tables); const serverConfig = useSelector((state: DataFormulatorState) => state.serverConfig); const dataCleanBlocks = useSelector((state: DataFormulatorState) => state.dataCleanBlocks); + const frontendRowLimit = useSelector((state: DataFormulatorState) => state.config?.frontendRowLimit ?? 10000); const existingNames = new Set(existingTables.map(t => t.id)); const [activeTab, setActiveTab] = useState(initialTab === 'menu' ? 'menu' : initialTab); const fileInputRef = useRef(null); const urlInputRef = useRef(null); + // Store on server toggle (default: true, like normal browsing mode) + const [storeOnServer, setStoreOnServer] = useState(true); + // Paste tab state const [pasteContent, setPasteContent] = useState(""); const [isLargeContent, setIsLargeContent] = useState(false); @@ -515,7 +520,7 @@ export const UnifiedDataUploadDialog: React.FC = ( // Load sample datasets useEffect(() => { if (open && activeTab === 'explore') { - fetch(`${getUrls().EXAMPLE_DATASETS}`) + fetchWithIdentity(`${getUrls().EXAMPLE_DATASETS}`) .then((response) => response.json()) .then((result) => { let datasets: DatasetMetadata[] = result.map((info: any) => { @@ -572,7 +577,7 @@ export const UnifiedDataUploadDialog: React.FC = ( setDatasetPreviews(datasets); }); } else if (open && activeTab === 'url') { - fetch(`${window.location.origin}/api/demo-stream/info`) + fetchWithIdentity(`${window.location.origin}/api/demo-stream/info`) .then(res => res.json()) .then(data => { const demoExamples = data.demo_examples @@ -711,8 +716,11 @@ export const UnifiedDataUploadDialog: React.FC = ( if (table) { const sourceConfig: DataSourceConfig = { type: 'file', fileName: filePreviewFiles[0]?.name }; const tableWithSource = { ...table, source: sourceConfig }; - dispatch(dfActions.loadTable(tableWithSource)); - dispatch(fetchFieldSemanticType(tableWithSource)); + dispatch(loadTable({ + table: tableWithSource, + storeOnServer, + file: storeOnServer ? filePreviewFiles[filePreviewActiveIndex] || filePreviewFiles[0] : undefined, + })); handleClose(); } }; @@ -725,8 +733,11 @@ export const UnifiedDataUploadDialog: React.FC = ( const table = filePreviewTables[i]; const sourceConfig: DataSourceConfig = { type: 'file', fileName: filePreviewFiles[i]?.name || filePreviewFiles[0]?.name }; const tableWithSource = { ...table, source: sourceConfig }; - dispatch(dfActions.loadTable(tableWithSource)); - dispatch(fetchFieldSemanticType(tableWithSource)); + dispatch(loadTable({ + table: tableWithSource, + storeOnServer, + file: storeOnServer ? 
filePreviewFiles[i] || filePreviewFiles[0] : undefined, + })); } handleClose(); }; @@ -784,8 +795,7 @@ export const UnifiedDataUploadDialog: React.FC = ( if (table) { // Add source info for paste data const tableWithSource = { ...table, source: { type: 'paste' as const } }; - dispatch(dfActions.loadTable(tableWithSource)); - dispatch(fetchFieldSemanticType(tableWithSource)); + dispatch(loadTable({ table: tableWithSource, storeOnServer })); handleClose(); } }; @@ -883,8 +893,7 @@ export const UnifiedDataUploadDialog: React.FC = ( sourceConfig = { type: 'url', url: tableURL }; } const tableWithSource = { ...table, source: sourceConfig }; - dispatch(dfActions.loadTable(tableWithSource)); - dispatch(fetchFieldSemanticType(tableWithSource)); + dispatch(loadTable({ table: tableWithSource, storeOnServer })); handleClose(); } }; @@ -908,8 +917,7 @@ export const UnifiedDataUploadDialog: React.FC = ( sourceConfig = { type: 'url', url: tableURL }; } const tableWithSource = { ...table, source: sourceConfig }; - dispatch(dfActions.loadTable(tableWithSource)); - dispatch(fetchFieldSemanticType(tableWithSource)); + dispatch(loadTable({ table: tableWithSource, storeOnServer })); } handleClose(); }; @@ -993,8 +1001,29 @@ export const UnifiedDataUploadDialog: React.FC = ( )} + {activeTab !== 'menu' && activeTab !== 'database' && ( + + setStoreOnServer(e.target.checked)} + size="small" + /> + } + label={ + + {storeOnServer ? 'Store on server' : `Local only (≤${frontendRowLimit.toLocaleString()} rows)`} + + } + /> + + )} = ( = ( ) : ( - + )} {/* Extract Data Tab */} - + {/* Explore Sample Datasets Tab */} @@ -1509,8 +1538,7 @@ export const UnifiedDataUploadDialog: React.FC = ( // Regular example data dictTable.source = { type: 'example', url: table.url }; } - dispatch(dfActions.loadTable(dictTable)); - dispatch(fetchFieldSemanticType(dictTable)); + dispatch(loadTable({ table: dictTable, storeOnServer })); } }); } diff --git a/src/views/VisualizationView.tsx b/src/views/VisualizationView.tsx index 20641a1e..4b95efa5 100644 --- a/src/views/VisualizationView.tsx +++ b/src/views/VisualizationView.tsx @@ -35,14 +35,15 @@ import { import _ from 'lodash'; +import { borderColor } from '../app/tokens'; + import ButtonGroup from '@mui/material/ButtonGroup'; -import embed from 'vega-embed'; import '../scss/VisualizationView.scss'; import { useDispatch, useSelector } from 'react-redux'; -import { DataFormulatorState, dfActions, getSessionId } from '../app/dfSlice'; -import { assembleVegaChart, extractFieldsFromEncodingMap, getUrls, prepVisTable } from '../app/utils'; +import { DataFormulatorState, dfActions } from '../app/dfSlice'; +import { assembleVegaChart, extractFieldsFromEncodingMap, getUrls, prepVisTable, fetchWithIdentity } from '../app/utils'; import { Chart, EncodingItem, EncodingMap, FieldItem } from '../components/ComponentType'; import { DictTable } from "../components/ComponentType"; @@ -57,11 +58,10 @@ import ContentCopyIcon from '@mui/icons-material/ContentCopy'; import ZoomInIcon from '@mui/icons-material/ZoomIn'; import ZoomOutIcon from '@mui/icons-material/ZoomOut'; import TextSnippetIcon from '@mui/icons-material/TextSnippet'; -import FilterAltIcon from '@mui/icons-material/FilterAlt'; -import CheckIcon from '@mui/icons-material/Check'; -import CloudQueueIcon from '@mui/icons-material/CloudQueue'; import InfoIcon from '@mui/icons-material/Info'; import CasinoIcon from '@mui/icons-material/Casino'; +import SaveAltIcon from '@mui/icons-material/SaveAlt'; +import OpenInNewIcon from 
'@mui/icons-material/OpenInNew'; import { CHART_TEMPLATES, getChartTemplate } from '../components/ChartTemplates'; @@ -280,7 +280,7 @@ export let SampleSizeEditor: FC<{ } -// Simple component that only handles Vega chart rendering +// Simple component that only handles Vega chart rendering — now uses headless toSVG() const VegaChartRenderer: FC<{ chart: Chart; conceptShelfItems: FieldItem[]; @@ -292,16 +292,18 @@ const VegaChartRenderer: FC<{ chartUnavailable: boolean; }> = React.memo(({ chart, conceptShelfItems, visTableRows, tableMetadata, chartWidth, chartHeight, scaleFactor, chartUnavailable }) => { - const elementId = `focused-chart-element-${chart.id}`; - + const [svgContent, setSvgContent] = useState(null); + const [assembledSpec, setAssembledSpec] = useState(null); + useEffect(() => { if (chart.chartType === "Auto" || chart.chartType === "Table" || chartUnavailable) { + setSvgContent(null); + setAssembledSpec(null); return; } - - const assembledChart = assembleVegaChart( + const spec = assembleVegaChart( chart.chartType, chart.encodingMap, conceptShelfItems, @@ -311,21 +313,102 @@ const VegaChartRenderer: FC<{ true, chartWidth, chartHeight, - true + true, + chart.config, + scaleFactor, ); - // Use "canvas" renderer for Vega charts instead of "svg". - // Reason: Canvas provides better performance for large datasets and complex charts, - // and avoids some SVG rendering issues in certain browsers. Note that this may affect - // accessibility and text selection. If SVG features are needed, consider reverting. - embed('#' + elementId, { ...assembledChart }, { actions: true, renderer: "canvas" }) - .then(function (result) { - // any post-processing of the canvas can go here - }).catch((error) => { - //console.error('Chart rendering error:', error); - }); + if (!spec || spec === "Table") { + setSvgContent(null); + setAssembledSpec(null); + return; + } - }, [chart.id, chart.chartType, chart.encodingMap, conceptShelfItems, visTableRows, tableMetadata, chartWidth, chartHeight, scaleFactor, chartUnavailable]); + spec['background'] = 'white'; + setAssembledSpec(spec); + + // Headless render via Vega: compile VL → parse → View → toSVG() + let cancelled = false; + (async () => { + try { + const { compile: vlCompile } = await import('vega-lite'); + const vega = await import('vega'); + const vgSpec = vlCompile(spec as any).spec; + const runtime = vega.parse(vgSpec); + const view = new vega.View(runtime, { renderer: 'none' }); + await view.runAsync(); + const svg = await view.toSVG(); + view.finalize(); + if (!cancelled) { + setSvgContent(svg); + } + } catch (err) { + console.warn('VegaChartRenderer: SVG render failed', err); + if (!cancelled) { + setSvgContent(null); + } + } + })(); + + return () => { cancelled = true; }; + + }, [chart.id, chart.chartType, chart.encodingMap, chart.config, conceptShelfItems, visTableRows, tableMetadata, chartWidth, chartHeight, scaleFactor, chartUnavailable]); + + const handleSavePng = useCallback(async () => { + if (!assembledSpec) return; + try { + const { compile: vlCompile } = await import('vega-lite'); + const vega = await import('vega'); + const vgSpec = vlCompile(assembledSpec as any).spec; + const runtime = vega.parse(vgSpec); + const view = new vega.View(runtime, { renderer: 'none' }); + await view.runAsync(); + const pngUrl = await view.toImageURL('png', 2); + view.finalize(); + + // Trigger download + const link = document.createElement('a'); + link.download = `${chart.chartType}-${chart.id}.png`; + link.href = pngUrl; + document.body.appendChild(link); 
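+      // keep the anchor attached to the DOM while clicking: some browsers (e.g. Firefox)
+      // ignore programmatic click() downloads on detached elements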
+ link.click(); + document.body.removeChild(link); + } catch (err) { + console.error('Save PNG failed:', err); + } + }, [assembledSpec, chart.chartType, chart.id]); + + const handleOpenInVegaEditor = useCallback(() => { + if (!assembledSpec) return; + // Use postMessage to pass spec to Vega Editor (same approach as vega-embed) + const editorUrl = 'https://vega.github.io/editor/'; + const editor = window.open(editorUrl); + if (!editor) return; + + const wait = 10_000; + const step = 250; + const { origin } = new URL(editorUrl); + let count = Math.floor(wait / step); + + function listen(evt: MessageEvent) { + if (evt.source === editor) { + count = 0; + window.removeEventListener('message', listen, false); + } + } + window.addEventListener('message', listen, false); + + function send() { + if (count <= 0) return; + editor!.postMessage({ + spec: JSON.stringify(assembledSpec, null, 2), + mode: 'vega-lite', + }, origin); + setTimeout(send, step); + count -= 1; + } + setTimeout(send, step); + }, [assembledSpec]); if (chart.chartType === "Auto") { return @@ -346,13 +429,47 @@ const VegaChartRenderer: FC<{ } - return ; + return ( + + {svgContent ? ( + + ) : ( + + {generateChartSkeleton(chartTemplate?.icon, 48, 48, 0.3)} + + )} + {svgContent && ( + + + + + + + + + + + + + )} + + ); }); export const ChartEditorFC: FC<{}> = function ChartEditorFC({}) { const config = useSelector((state: DataFormulatorState) => state.config); + const serverConfig = useSelector((state: DataFormulatorState) => state.serverConfig); const componentRef = useRef(null); // Add ref for the container box that holds all exploration components @@ -378,20 +495,16 @@ export const ChartEditorFC: FC<{}> = function ChartEditorFC({}) { const [codeExplViewOpen, setCodeExplViewOpen] = useState(false); const [conceptExplanationsOpen, setConceptExplanationsOpen] = useState(false); - // Add new state for the explanation mode - const [explanationMode, setExplanationMode] = useState<'none' | 'code' | 'explanation' | 'concepts'>('none'); - const [chatDialogOpen, setChatDialogOpen] = useState(false); const [localScaleFactor, setLocalScaleFactor] = useState(1); // Reset local UI state when focused chart changes useEffect(() => { - setLocalScaleFactor(1); setCodeViewOpen(false); setCodeExplViewOpen(false); setConceptExplanationsOpen(false); - setExplanationMode('none'); setChatDialogOpen(false); + setLocalScaleFactor(1); }, [focusedChartId]); // Combined useEffect to scroll to exploration components when any of them open @@ -441,8 +554,8 @@ export const ChartEditorFC: FC<{}> = function ChartEditorFC({}) { let filteredRows = rows.map(row => Object.fromEntries(visFields.filter(f => table.names.includes(f.name)).map(f => [f.name, row[f.name]]))); let visTable = prepVisTable(filteredRows, conceptShelfItems, focusedChart.encodingMap); - if (visTable.length > 5000) { - let rowSample = _.sampleSize(visTable, 5000); + if (visTable.length > serverConfig.MAX_DISPLAY_ROWS) { + let rowSample = _.sampleSize(visTable, serverConfig.MAX_DISPLAY_ROWS); visTable = rowSample; } @@ -476,13 +589,16 @@ export const ChartEditorFC: FC<{}> = function ChartEditorFC({}) { if (sampleSize == undefined) { sampleSize = 1000; } - if (table.virtual) { + // If all rows are already in browser memory, sample locally (no server call needed). + // This covers non-virtual tables and virtual tables whose rows have been fully loaded. 
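+    // Otherwise (a virtual table with only a partial local cache), fall back to the
+    // server-side SAMPLE_TABLE request below.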
+ const allRowsInMemory = !table.virtual || table.rows.length >= (table.virtual.rowCount || 0); + if (!allRowsInMemory) { // Generate unique request ID to track this specific request const requestId = `${focusedChart.id}-${table.id}-${Date.now()}`; currentRequestRef.current = requestId; let { aggregateFields, groupByFields } = extractFieldsFromEncodingMap(focusedChart.encodingMap, conceptShelfItems); - fetch(getUrls().SAMPLE_TABLE, { + fetchWithIdentity(getUrls().SAMPLE_TABLE, { method: 'POST', headers: { 'Content-Type': 'application/json', @@ -520,15 +636,17 @@ export const ChartEditorFC: FC<{}> = function ChartEditorFC({}) { } }); } else { - // Randomly sample sampleSize rows from table.rows + // All rows available locally — sample in-memory let rowSample = _.sampleSize(table.rows, sampleSize); setVisTableRows(structuredClone(rowSample)); + setVisTableTotalRowCount(table.rows.length); setDataVersion(`${focusedChart.id}-${table.id}-${sortedVisDataFields.join("_")}`); } } useEffect(() => { - if (table.virtual && visFields.length > 0 && dataFieldsAllAvailable) { + const allRowsInMemory = !table.virtual || table.rows.length >= (table.virtual.rowCount || 0); + if (!allRowsInMemory && visFields.length > 0 && dataFieldsAllAvailable) { fetchDisplayRows(); } }, []) @@ -537,17 +655,9 @@ export const ChartEditorFC: FC<{}> = function ChartEditorFC({}) { const versionId = `${focusedChart.id}-${table.id}-${sortedVisDataFields.join("_")}`; if (visFields.length > 0 && dataFieldsAllAvailable) { - // table changed, we need to update the rows to display - if (table.virtual) { - // virtual table, we need to sample the table - fetchDisplayRows(); - } else { - // non-virtual table, update with processed data - const newProcessedData = createVisTableRowsLocal(table.rows); - setVisTableRows(newProcessedData); - setVisTableTotalRowCount(table.rows.length); - setDataVersion(versionId); - } + // table or fields changed — fetchDisplayRows handles both + // local (all rows in memory) and remote (virtual, large) cases + fetchDisplayRows(); } else { // If no fields, just use the table rows directly setVisTableRows(table.rows); @@ -633,7 +743,7 @@ export const ChartEditorFC: FC<{}> = function ChartEditorFC({}) { backgroundColor: 'rgba(0, 0, 0, 0.02)', borderRadius: 1, padding: '2px', - border: '1px solid rgba(0, 0, 0, 0.06)' + border: `1px solid ${borderColor.component}` }}> = function ChartEditorFC({}) { key="chat-dialog-btn" onClick={() => { setChatDialogOpen(!chatDialogOpen) }} sx={{ - backgroundColor: conceptExplanationsOpen ? 'rgba(25, 118, 210, 0.2)' : 'transparent', - color: conceptExplanationsOpen ? 'primary.main' : 'text.secondary', - fontWeight: conceptExplanationsOpen ? 600 : 500, + backgroundColor: chatDialogOpen ? 'rgba(25, 118, 210, 0.2)' : 'transparent', + color: chatDialogOpen ? 'primary.main' : 'text.secondary', + fontWeight: chatDialogOpen ? 600 : 500, '&:hover': { - backgroundColor: conceptExplanationsOpen ? 'rgba(25, 118, 210, 0.25)' : 'rgba(25, 118, 210, 0.08)', + backgroundColor: chatDialogOpen ? 'rgba(25, 118, 210, 0.25)' : 'rgba(25, 118, 210, 0.08)', }, }} > - chat + log
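For reference, the new `VegaChartRenderer` and `handleSavePng` both follow the same headless render path: compile the Vega-Lite spec to Vega, parse it into a runtime, evaluate it in a `View` created with `renderer: 'none'`, then export SVG markup or a PNG data URL. Below is a minimal sketch of that pipeline using the standard `vega` / `vega-lite` APIs; the helper name and the scale-factor default are illustrative and not part of this diff.

```ts
import { compile } from 'vega-lite';
import { parse, View } from 'vega';

// Hypothetical helper mirroring the headless path used in VegaChartRenderer:
// Vega-Lite spec -> Vega spec -> View (no DOM renderer) -> SVG string or PNG data URL.
export async function renderHeadless(
  vlSpec: any,
  format: 'svg' | 'png' = 'svg',
  scaleFactor = 2,
): Promise<string> {
  const vgSpec = compile(vlSpec).spec;                         // compile Vega-Lite to low-level Vega
  const view = new View(parse(vgSpec), { renderer: 'none' });  // no canvas/svg attached to the DOM
  try {
    await view.runAsync();                                     // evaluate the dataflow
    return format === 'svg'
      ? await view.toSVG()                                     // serialized <svg> markup
      : await view.toImageURL('png', scaleFactor);             // data: URL usable in a download link
  } finally {
    view.finalize();                                           // release timers/handlers held by the view
  }
}
```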