Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
python -m venv venv
source venv/bin/activate
pip install maturin duckdb requests pytest pytest-benchmark
maturin develop --locked --release
maturin develop --locked --release --features datagen
- name: Build memtest
run: |
source venv/bin/activate
Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions python/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 12 additions & 3 deletions python/python/ci_benchmarks/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,15 @@ def _format_bytes(num_bytes: int) -> str:
return f"{num_bytes:.1f} PB"


def _format_count(count: int) -> str:
"""Format a large count with commas."""
for unit in ["", "K"]:
if abs(count) < 1000.0:
return f"{count:.1f} {unit}"
count /= 1000.0
return f"{count:.1f} M"


class IOMemoryBenchmark:
"""Benchmark fixture that tracks IO and memory during execution."""

Expand Down Expand Up @@ -204,14 +213,14 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config):
f"{'Read IOPS':>10} {'Read Bytes':>12} "
f"{'Write IOPS':>10} {'Write Bytes':>12}"
)
terminalreporter.write_line("-" * (name_width + 72))
terminalreporter.write_line("-" * (name_width + 76))
else:
terminalreporter.write_line(
f"{'Test':<{name_width}} "
f"{'Read IOPS':>10} {'Read Bytes':>12} "
f"{'Write IOPS':>10} {'Write Bytes':>12}"
)
terminalreporter.write_line("-" * (name_width + 50))
terminalreporter.write_line("-" * (name_width + 52))

# Results sorted by read bytes (descending)
sorted_results = sorted(
Expand All @@ -224,7 +233,7 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config):
terminalreporter.write_line(
f"{result.name:<{name_width}} "
f"{_format_bytes(s.peak_bytes):>10} "
f"{s.total_allocations:>10,} "
f"{_format_count(s.total_allocations):>10} "
f"{s.read_iops:>10,} "
f"{_format_bytes(s.read_bytes):>12} "
f"{s.write_iops:>10,} "
Expand Down
70 changes: 70 additions & 0 deletions python/python/ci_benchmarks/benchmarks/test_indexing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors
from pathlib import Path

import lance
import pyarrow as pa
import pytest
from lance._datagen import rand_batches


@pytest.mark.parametrize(
    "data_type", [pa.int64(), pa.string()], ids=["int64", "string"]
)
@pytest.mark.parametrize("index_type", ["btree", "bitmap", "zonemap", "bloomfilter"])
@pytest.mark.io_memory_benchmark()
def test_io_mem_build_scalar_index(
    io_mem_benchmark, data_type: pa.DataType, index_type: str, tmp_path: Path
):
    """Measure IO and memory while building a scalar index on random data.

    Runs once per combination of column data type and scalar index type. A
    single-column dataset is generated and written to ``tmp_path``; only the
    index build itself is handed to the benchmark fixture.
    """
    # Only the bitmap case attaches datagen metadata to the column.
    # NOTE(review): presumably this caps the number of distinct generated
    # values at 1000 -- confirm against the lance-datagen documentation.
    field_metadata = (
        {b"lance-datagen:cardinality": b"1000"} if index_type == "bitmap" else None
    )
    column = pa.field("col", data_type, metadata=field_metadata)
    schema = pa.schema([column])

    # 100 batches of 1 MiB each, i.e. roughly 100MB of input data.
    batches = rand_batches(schema, num_batches=100, batch_size_bytes=1024 * 1024)
    dataset = lance.write_dataset(batches, tmp_path)

    def build_index(target):
        # replace=True is passed so an already existing index is replaced.
        target.create_scalar_index("col", index_type, replace=True)

    io_mem_benchmark(build_index, dataset, warmup=False)


@pytest.mark.parametrize("with_positions", [True, False])
@pytest.mark.io_memory_benchmark()
def test_io_mem_build_fts(io_mem_benchmark, with_positions: bool, tmp_path: Path):
    """Measure IO and memory while building an ``INVERTED`` (FTS) index.

    Runs once with and once without positions in the index. A single string
    column of generated text is written to ``tmp_path``; only the index build
    is handed to the benchmark fixture.

    Args:
        io_mem_benchmark: Fixture that runs the callable under measurement.
        with_positions: Forwarded as ``with_position`` to
            ``create_scalar_index`` so both variants are actually measured.
        tmp_path: Directory the dataset is written to.
    """
    schema = pa.schema(
        [
            pa.field(
                "text", pa.string(), metadata={"lance-datagen:content-type": "sentence"}
            )
        ]
    )
    # 100 batches of 1 MiB each, i.e. roughly 100MB of input data.
    data = rand_batches(schema, num_batches=100, batch_size_bytes=1024 * 1024)
    ds = lance.write_dataset(data, tmp_path)

    def build_index(ds):
        # Use the parametrized flag; hard-coding True here would make both
        # parametrized cases benchmark the same configuration.
        ds.create_scalar_index(
            "text", "INVERTED", with_position=with_positions, replace=True
        )

    io_mem_benchmark(build_index, ds, warmup=False)


@pytest.mark.io_memory_benchmark()
def test_io_mem_build_ivf_pq(io_mem_benchmark, tmp_path: Path):
    """Measure IO and memory while building an ``IVF_PQ`` vector index.

    A dataset with one fixed-size list column of 1024 float32 values is
    generated and written to ``tmp_path``; only the index build is handed to
    the benchmark fixture.
    """
    vector_field = pa.field("vector", pa.list_(pa.float32(), 1024))
    schema = pa.schema([vector_field])

    # 100 batches of 10 MiB each, i.e. roughly 1GB of input data.
    batches = rand_batches(
        schema, num_batches=100, batch_size_bytes=10 * 1024 * 1024
    )
    dataset = lance.write_dataset(batches, tmp_path)

    # Index settings are gathered in one place and splatted into the call.
    index_options = {
        "index_type": "IVF_PQ",
        "num_partitions": 32,
        "num_sub_vectors": 4,
        "replace": True,
    }

    def build_index(target):
        target.create_index("vector", **index_options)

    io_mem_benchmark(build_index, dataset, warmup=False)
3 changes: 2 additions & 1 deletion python/python/lance/_datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ def rand_batches(
raise NotImplementedError(
"This version of lance was not built with the datagen feature"
)
return datagen.rand_batches(schema, num_batches, batch_size_bytes)
batch_iter = datagen.rand_batches(schema, num_batches, batch_size_bytes)
return pa.RecordBatchReader.from_batches(schema, batch_iter)
1 change: 1 addition & 0 deletions rust/lance-datagen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ futures = { workspace = true }
half = { workspace = true }
hex = "0.4.3"
rand = { workspace = true }
rand_distr = { workspace = true }
rand_xoshiro = { workspace = true }
random_word = { version = "0.5", features = ["en"] }

Expand Down
Loading
Loading