Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/snippets/integrations.mdx
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
{/* Auto-generated by scripts/mdx_snippets_gen.py. Do not edit manually. */}

export const PyRerankingMrrMultivector = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import MRRReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n meta: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n meta_vector: Vector(embedder.ndims()) = embedder.VectorField(source_column=\"meta\")\n\ndata = [\n {\"text\": \"hello world\", \"meta\": \"greeting message\"},\n {\"text\": \"goodbye world\", \"meta\": \"farewell message\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\n\n# Search across multiple vector columns and collect results with row IDs\nquery = \"hello\"\nrs1 = tbl.search(query, vector_column_name=\"vector\").limit(10).with_row_id(True).to_arrow()\nrs2 = tbl.search(query, vector_column_name=\"meta_vector\").limit(10).with_row_id(True).to_arrow()\n\n# Rerank the combined results using MRR\nreranker = MRRReranker()\ncombined_results = reranker.rerank_multivector([rs1, rs2])\n";

export const PyEmbeddingAwsUsage = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport pandas as pd\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\n\nmodel = get_registry().get(\"bedrock-text\").create()\n\nclass TextModel(LanceModel):\n text: str = model.SourceField()\n vector: Vector(model.ndims()) = model.VectorField()\n\ndf = pd.DataFrame({\"text\": [\"hello world\", \"goodbye world\"]})\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"bedrock-demo\"))\ntbl = db.create_table(\"test\", schema=TextModel, mode=\"overwrite\")\n\ntbl.add(df)\nrs = tbl.search(\"hello\").limit(1).to_pandas()\nprint(rs.head())\n";

export const PyEmbeddingCohereUsage = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nfrom lancedb.embeddings import EmbeddingFunctionRegistry\nfrom lancedb.pydantic import LanceModel, Vector\n\ncohere = (\n EmbeddingFunctionRegistry.get_instance()\n .get(\"cohere\")\n .create(name=\"embed-multilingual-v2.0\")\n)\n\nclass TextModel(LanceModel):\n text: str = cohere.SourceField()\n vector: Vector(cohere.ndims()) = cohere.VectorField()\n\ndata = [{\"text\": \"hello world\"}, {\"text\": \"goodbye world\"}]\n\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"cohere-demo\"))\ntbl = db.create_table(\"test\", schema=TextModel, mode=\"overwrite\")\ntbl.add(data)\n";
Expand Down Expand Up @@ -162,6 +160,8 @@ export const PyRerankingJinaUsage = "import os\n\nimport lancedb\nfrom lancedb.e

export const PyRerankingLinearCombinationUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import LinearCombinationReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata = [\n {\"text\": \"hello world\"},\n {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = LinearCombinationReranker()\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n";

export const PyRerankingMrrMultivector = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import MRRReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n meta: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n meta_vector: Vector(embedder.ndims()) = embedder.VectorField(source_column=\"meta\")\n\ndata = [\n {\"text\": \"hello world\", \"meta\": \"greeting message\"},\n {\"text\": \"goodbye world\", \"meta\": \"farewell message\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\n\n# Search across multiple vector columns and collect results with row IDs\nquery = \"hello\"\nrs1 = tbl.search(query, vector_column_name=\"vector\").limit(10).with_row_id(True).to_arrow()\nrs2 = tbl.search(query, vector_column_name=\"meta_vector\").limit(10).with_row_id(True).to_arrow()\n\n# Rerank the combined results using MRR\nreranker = MRRReranker()\ncombined_results = reranker.rerank_multivector([rs1, rs2])\n";

export const PyRerankingMrrUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import MRRReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata = [\n {\"text\": \"hello world\"},\n {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = MRRReranker(weight_vector=0.7, weight_fts=0.3)\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n\n# Run multivector search across multiple vector columns\nrs1 = tbl.search(\"hello\").limit(10).with_row_id(True).to_arrow()\nrs2 = tbl.search(\"greeting\").limit(10).with_row_id(True).to_arrow()\ncombined = MRRReranker().rerank_multivector([rs1, rs2])\n";

export const PyRerankingOpenaiUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import OpenaiReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata = [\n {\"text\": \"hello world\"},\n {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = OpenaiReranker()\n\n# Run vector search with a reranker\nresult = tbl.search(\"hello\").rerank(reranker=reranker).to_list()\n\n# Run FTS search with a reranker\nresult = tbl.search(\"hello\", query_type=\"fts\").rerank(reranker=reranker).to_list()\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n";
Expand Down
2 changes: 2 additions & 0 deletions docs/snippets/storage.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ export const PyStorageConnectTimeout = "db = lancedb.connect(\n \"s3://bucket

export const PyStorageGcsServiceAccount = "db = lancedb.connect(\n \"gs://my-bucket/my-database\",\n storage_options={\n \"service_account\": \"path/to/service-account.json\",\n },\n)\n";

export const PyStorageProviderRefresh = "class StsStorageOptionsProvider:\n # LanceDB calls this method whenever credentials need refresh.\n def fetch_storage_options(self) -> dict[str, str]:\n # Replace this with your credential manager or STS call.\n return {\n \"region\": \"us-east-1\",\n \"aws_access_key_id\": \"<temp-access-key-id>\",\n \"aws_secret_access_key\": \"<temp-secret-access-key>\",\n \"aws_session_token\": \"<temp-session-token>\",\n \"expires_at_millis\": \"1735707600000\",\n \"refresh_offset_millis\": \"120000\",\n }\n\nprovider = StsStorageOptionsProvider()\ndb = lancedb.connect(\"s3://bucket/path\")\ntable = db.create_table(\n \"table_with_temp_creds\",\n [{\"id\": 1}],\n storage_options_provider=provider,\n)\n";

export const PyStorageS3Ddb = "db = lancedb.connect(\n \"s3+ddb://bucket/path?ddbTableName=my-dynamodb-table\",\n)\n";

export const PyStorageS3Express = "db = lancedb.connect(\n \"s3://my-bucket--use1-az4--x-s3/path\",\n storage_options={\n \"region\": \"us-east-1\",\n \"s3_express\": \"true\",\n },\n)\n";
Expand Down
28 changes: 22 additions & 6 deletions docs/storage/configuration.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
PyStorageConnectGcs,
PyStorageConnectS3,
PyStorageConnectTimeout,
PyStorageProviderRefresh,
PyStorageGcsServiceAccount,
PyStorageS3Ddb,
PyStorageS3Express,
Expand Down Expand Up @@ -90,18 +91,34 @@ If you need the option to apply only to a specific table:
</CodeBlock>
</CodeGroup>

### Dynamic credentials with `StorageOptionsProvider` <Badge color="green">Python-only</Badge>
Use a storage options provider when credentials expire (for example, short-lived STS credentials). Pass any object that implements `fetch_storage_options()` as the `storage_options_provider` argument to table operations such as `create_table` and `open_table`. In SDK versions that expose `StorageOptionsProvider`, you can subclass it directly.

If `fetch_storage_options()` returns `expires_at_millis`, LanceDB refreshes credentials before that timestamp. You can optionally also return `refresh_offset_millis` (in milliseconds) from `fetch_storage_options()` to refresh earlier.

<CodeBlock language="Python" title="Refresh cloud credentials automatically" icon="python">
{PyStorageProviderRefresh}
</CodeBlock>

#### General object store options

| Key | Description |
| :-- | :-- |
| `allow_http` | Allow non-TLS connections. Default: `false`. |
| `allow_invalid_certificates` | Skip certificate validation. Default: `false`. |
| `connect_timeout` | Timeout for the connect phase. Default: `5s`. |
| `timeout` | Timeout for the full request. Default: `30s`. |
| `user_agent` | User agent string to send with requests. |
| `allow_http` | Allow non-TLS connections. |
| `allow_invalid_certificates` | Skip certificate validation for TLS connections. |
| `connect_timeout` | Timeout for the connect phase. |
| `timeout` | Timeout for the full request. |
| `user_agent` | User agent string sent with requests. |
| `proxy_url` | Proxy URL to route requests through. |
| `proxy_ca_certificate` | PEM-formatted CA certificate for proxy connections. |
| `proxy_excludes` | Comma-separated hosts that bypass the proxy (domains or CIDR). |
| `download_retry_count` | Number of retries when downloading objects. |
| `client_max_retries` | Maximum retries for object-store client requests. |
| `client_retry_timeout` | Total retry timeout (seconds) for object-store client requests. |

<Info title="Option support varies by backend">
These are commonly used options. Cloud-specific keys (for example `region`, `endpoint`, `service_account`, and Azure credential keys) are backend-dependent and can be provided in `storage_options` as needed.
</Info>

## AWS S3

Expand Down Expand Up @@ -204,4 +221,3 @@ Tigris exposes an S3-compatible API. Configure the endpoint and region:
</CodeGroup>

Environment variables `AWS_ENDPOINT=https://t3.storage.dev` and `AWS_DEFAULT_REGION=auto` achieve the same configuration.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"lancedb>=0.25.3",
"lancedb>=0.29.2",
"pandas>=2.3.3",
"polars>=1.35.2",
"pydantic>=2.12.4",
Expand Down
33 changes: 31 additions & 2 deletions tests/py/test_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@ def __init__(self, uri: str, options: dict):
self.options = options
self.created_tables: list[DummyTable] = []

def create_table(self, name: str, data, storage_options: dict | None = None):
# Test-double create_table: builds a DummyTable from `name` and
# `storage_options`, records it in `self.created_tables`, and returns it.
# `data` and `storage_options_provider` are accepted so calls that pass them
# succeed, but neither is used in this body.
def create_table(
self,
name: str,
data,
storage_options: dict | None = None,
storage_options_provider=None,
):
table = DummyTable(name, storage_options=storage_options)
# Record the table in created_tables before returning it.
self.created_tables.append(table)
return table
Expand Down Expand Up @@ -119,7 +125,30 @@ def test_storage_snippets(fake_connect):
)
# --8<-- [end:storage_tigris_connect]

assert len(fake_connect) == 10
# --8<-- [start:storage_provider_refresh]
class StsStorageOptionsProvider:
# LanceDB calls this method whenever credentials need refresh.
def fetch_storage_options(self) -> dict[str, str]:
# Replace this with your credential manager or STS call.
return {
"region": "us-east-1",
"aws_access_key_id": "<temp-access-key-id>",
"aws_secret_access_key": "<temp-secret-access-key>",
"aws_session_token": "<temp-session-token>",
"expires_at_millis": "1735707600000",
"refresh_offset_millis": "120000",
}

provider = StsStorageOptionsProvider()
db = lancedb.connect("s3://bucket/path")
table = db.create_table(
"table_with_temp_creds",
[{"id": 1}],
storage_options_provider=provider,
)
# --8<-- [end:storage_provider_refresh]

assert len(fake_connect) == 11
assert all(
conn.uri.startswith(("s3://", "gs://", "az://", "s3+ddb://"))
for conn in fake_connect
Expand Down
Loading
Loading