diff --git a/CMakeLists.txt b/CMakeLists.txt index dfbbf19e3..600fa9fb1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ endif() project(python-blosc2) set(BLOSC2_MIN_VERSION 3.0.0) -set(BLOSC2_BUNDLED_VERSION v3.0.3) +set(BLOSC2_BUNDLED_VERSION v3.1.2) if(WIN32 AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang") message(FATAL_ERROR "Windows builds require clang-cl. Set CC/CXX to clang-cl or configure CMake with -T ClangCL.") diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 6640b124d..1b77d1579 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -4,6 +4,17 @@ XXX version-specific blurb XXX +### Take/gather APIs + +- Added `NDArray.take()` following Array API `take` shape semantics, including + `axis=None` flattening and N-dimensional integer indices. One-dimensional + gathers use the existing sparse C-level path internally. +- Extended top-level `blosc2.take()` to dispatch to `NDArray.take()`, + `CTable.take()`, and `Column.take()` while preserving the input container + type. +- Added `CTable.take()` and `Column.take()` for logical row/value gathers that + preserve order and duplicate indices, unlike mask-based views. + ## Changes from 4.3.1 to 4.3.3 note: 4.3.2 was an internal pre-release that was not published to PyPI. diff --git a/bench/ndarray/fancy-indexes.py b/bench/ndarray/fancy-indexes.py new file mode 100644 index 000000000..901185207 --- /dev/null +++ b/bench/ndarray/fancy-indexes.py @@ -0,0 +1,348 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### +""" +Benchmark fancy indexing with a boolean array vs. a list of flat indices +(coords) on an in-memory blosc2.NDArray. 
+ +All approaches select the same elements (determined by the same set of +random flat indices), so the comparison reflects the overhead of each path. + +Usage:: + + python bench/ndarray/fancy-indexes.py --ndim 3 --arr-size 100000000 + +Optional flags:: + + --ndim Number of dimensions (default: 3) + --arr-size Total number of elements (default: 100_000_000) + --max-idx Maximum number of indices (default: 100_000) + --output Save plot to PNG (optional, no display if set) + --profile-mem Measure peak memory instead of time + +Benchmarked paths +------------------ + +* ``bool mask`` — ``a[bool_mask]`` with automatic sparse/dense detection. +* ``coord list`` — ``blosc2.take(a, coord_list, axis=None)[:]`` + (sparse-element gather via ``b2nd_get_sparse_cbuffer``). +* ``mask→coords`` — ``np.flatnonzero(bool_mask)`` + sparse gather. +* ``lazy expr`` — ``a[a < threshold][:]``, the idiomatic lazy-expression + path (now auto-optimized internally via miniexpr + sparse take). +""" + +from __future__ import annotations + +import argparse +import sys +import threading +import time as _time +from time import perf_counter + +import matplotlib.pyplot as plt +import numpy as np +import psutil + +import blosc2 + +# --------------------------------------------------------------------------- +# plot style +# --------------------------------------------------------------------------- +plt.rcParams.update({ + "text.usetex": False, + "font.size": 14, + "figure.dpi": 150, + "savefig.dpi": 150, +}) +plt.style.use("seaborn-v0_8-paper") + +COLORS = { + "bool mask": "#1f77b4", + "coord list": "#ff7f0e", + "mask→coords": "#2ca02c", + "lazy expr": "#d62728", +} +MARKERS = { + "bool mask": "o", + "coord list": "s", + "mask→coords": "^", + "lazy expr": "D", +} + + +# --------------------------------------------------------------------------- +# helpers +# --------------------------------------------------------------------------- + +def _compute_shape(ndim: int, n_elements: int) -> tuple[int, ...]: 
+ """Roughly-cubic shape with the given number of dimensions.""" + d = int(round(n_elements ** (1.0 / ndim))) + shape = [d] * ndim + shape[0] = max(1, n_elements // int(np.prod(shape[1:]))) + return tuple(shape) + + +def _peak_memory(func, *args, **kwargs): + """Return RSS memory increase (MB) after *func(*args, **kwargs).""" + proc = psutil.Process() + before = proc.memory_info().rss + peak = [before] + stop = threading.Event() + + def sample(): + while not stop.is_set(): + rss = proc.memory_info().rss + if rss > peak[0]: + peak[0] = rss + _time.sleep(0.001) + + t = threading.Thread(target=sample, daemon=True) + t.start() + result = func(*args, **kwargs) + stop.set() + t.join(timeout=0.1) + + after = proc.memory_info().rss + _ = result # keep alive to count retained output + delta_peak = (peak[0] - before) / (1024 * 1024) + delta_after = (after - before) / (1024 * 1024) + return max(delta_peak, delta_after) + + +def _make_bool_mask(shape, flat_indices): + """Build a boolean array of *shape* with True at *flat_indices*.""" + mask = np.zeros(np.prod(shape), dtype=np.bool_) + mask[flat_indices] = True + return mask.reshape(shape) + + +# --------------------------------------------------------------------------- +# array creation +# --------------------------------------------------------------------------- + +def create_array(shape): + """Create an in-memory blosc2 linspace array.""" + n_elements = np.prod(shape) + print(f"Shape: {shape} | n_elements: {n_elements:_} " + f"| dtype: float64 | total: {n_elements * 8 / 1e9:.2f} GB") + t0 = perf_counter() + a = blosc2.linspace(0.0, 1.0, int(n_elements), shape=shape) + t = perf_counter() - t0 + print(f"blosc2.linspace created in {t:.2f}s " + f"cratio={a.schunk.cratio:.1f}x " + f"cbytes={a.schunk.cbytes / 1e6:.1f} MB") + print() + return a + + +# --------------------------------------------------------------------------- +# benchmark runner +# --------------------------------------------------------------------------- + 
def _best_time(func, n_runs):
    """Return the smallest wall-clock time (seconds) over *n_runs* calls of *func*."""
    best = float("inf")
    for _ in range(n_runs):
        t0 = perf_counter()
        func()
        best = min(best, perf_counter() - t0)
    return best


def _make_operations(a, bool_mask, coord_list, threshold):
    """Return the four benchmarked selections as zero-argument callables.

    The dict is keyed by the labels used in ``results`` (and by ``COLORS`` /
    ``MARKERS`` when plotting); its insertion order is the measurement order.
    Defining the paths once guarantees that timing mode and memory mode
    measure exactly the same operations.
    """

    def _mask_to_coords():
        # Converting the mask to flat coordinates is part of what this path measures.
        idx = np.flatnonzero(bool_mask)
        return blosc2.take(a, idx, axis=None)[:]

    return {
        "bool mask": lambda: a[bool_mask],
        "coord list": lambda: blosc2.take(a, coord_list, axis=None)[:],
        "mask→coords": _mask_to_coords,
        "lazy expr": lambda: a[a < threshold][:],
    }


def run_benchmark(a, ndim, max_idx=100_000, n_runs=3, profile_mem=False):
    """Compare bool-mask, coord-list, mask→coords, and lazy-expr indexing.

    Parameters
    ----------
    a
        Array to index. It must expose ``size`` and ``shape``, accept a boolean
        mask and a lazy comparison (``a < threshold``) as index, and be accepted
        by ``blosc2.take``.
    ndim
        Not used by this function; kept so existing callers keep working.
    max_idx
        Upper bound on the number of random flat indices per data point
        (clamped to ``a.size``).
    n_runs
        Timed repetitions per path; the minimum is reported. Ignored when
        *profile_mem* is true (one run per point).
    profile_mem
        If true, record the value returned by ``_peak_memory`` (MB) instead of
        the elapsed time (s).

    Returns
    -------
    tuple
        ``(counts, results)``: ``counts`` is an array with the number of unique
        indices actually used at each point, and ``results`` maps each path
        label to the list of measurements, aligned with ``counts``.

    Raises
    ------
    ValueError
        If *a* has no elements, or if *n_runs* is smaller than 1 in timing mode.
    """
    n_elements = a.size
    # Fail early with a clear message; otherwise rng.integers(0, 0) below raises
    # a ValueError that does not mention the array at all.
    if n_elements < 1:
        raise ValueError("cannot benchmark an empty array")
    # Taking the minimum of zero timings would also raise ValueError, but much
    # later and with an unhelpful message.
    if not profile_mem and n_runs < 1:
        raise ValueError("n_runs must be at least 1 in timing mode")
    max_idx = min(max_idx, n_elements)

    # Log-spaced index counts; integer truncation creates duplicates, hence unique().
    n_indices_list = np.unique(
        np.logspace(0, np.log10(max(1, max_idx)), num=12, dtype=np.int64)
    )
    print(f"Index counts: {n_indices_list.tolist()}")
    if profile_mem:
        print("(memory-profiling mode, 1 run per point)")
    print()

    rng = np.random.default_rng(42)
    results = {"bool mask": [], "coord list": [], "mask→coords": [], "lazy expr": []}
    actual_counts = []

    for n_idx in n_indices_list:
        # Random draws may repeat, so the real number of selected elements can
        # be smaller than the requested one.
        flat_idx = np.unique(rng.integers(0, n_elements, size=int(n_idx)))
        n_actual = len(flat_idx)

        bool_mask = _make_bool_mask(a.shape, flat_idx)
        coord_list = flat_idx.tolist()

        # Lazy-expr threshold: use selectivity to get ~n_actual matches.
        # NOTE(review): this assumes the values of *a* are uniform on [0, 1]
        # (e.g. built with linspace); confirm against the caller.
        threshold = n_actual / n_elements if n_actual > 0 else 0.0

        fields = []
        operations = _make_operations(a, bool_mask, coord_list, threshold)
        for label, func in operations.items():
            if profile_mem:
                value = _peak_memory(func)
                shown = f"{value:.1f} MB"
            else:
                value = _best_time(func, n_runs)
                shown = f"{value:.5f}s"
            results[label].append(value)
            fields.append(f"{label.replace(' ', '_')}={shown}")

        print(f" n_indices={n_actual:>7}: " + " ".join(fields))
        actual_counts.append(n_actual)

    return np.array(actual_counts), results


# ---------------------------------------------------------------------------
# plotting
# ---------------------------------------------------------------------------

def plot_results(n_indices, results, ndim, arr_size, output, profile_mem=False):
    """Plot one curve per benchmarked path against the number of selected elements.

    Parameters
    ----------
    n_indices
        X values: number of selected elements at each data point.
    results
        Mapping from path label to its measurements; every label must also be
        a key of ``COLORS`` and ``MARKERS``.
    ndim, arr_size
        Only used to build the plot title.
    output
        Path to save the figure to; if falsy the figure is shown interactively.
    profile_mem
        If true, label the y axis as peak memory (linear scale); otherwise as
        time (log scale).
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    for label, values in results.items():
        ax.plot(
            n_indices, values, color=COLORS[label], marker=MARKERS[label],
            label=label, linewidth=2, markersize=7,
        )

    ax.set_xscale("log")
    ax.set_xlabel("Number of selected elements")
    if not profile_mem:
        ax.set_yscale("log")
    ax.set_ylabel("Peak memory (MB)" if profile_mem else "Time (s)")
    title = (
        f"Bool mask vs coord list fancy indexing — "
        f"ndim={ndim}, arr-size={arr_size:_}"
    )
    if profile_mem:
        title += " (memory)"
    ax.set_title(title)
    ax.legend()
    ax.grid(True, which="both", alpha=0.3)
    fig.tight_layout()

    if output:
        fig.savefig(output)
        # Release the figure once it is on disk; pyplot otherwise keeps it alive.
        plt.close(fig)
        print(f"\nPlot saved to {output}")
    else:
        plt.show()


#
--------------------------------------------------------------------------- +# main +# --------------------------------------------------------------------------- + +def parse_args(): + p = argparse.ArgumentParser( + description="Benchmark bool-mask fancy indexing vs coord-list sparse read" + ) + p.add_argument( + "--ndim", type=int, default=3, + help="Number of dimensions (default: 3)", + ) + p.add_argument( + "--arr-size", type=int, default=100_000_000, + help="Total number of elements (default: 100_000_000)", + ) + p.add_argument( + "--max-idx", type=int, default=100_000, + help="Maximum number of indices to test (default: 100_000)", + ) + p.add_argument( + "--output", type=str, default=None, + help="Save plot to this path (PNG). If omitted, display interactively.", + ) + p.add_argument( + "--profile-mem", action="store_true", + help="Measure peak memory (MB) instead of timing.", + ) + return p.parse_args() + + +def main(): + args = parse_args() + + print(f"blosc2 version: {blosc2.__version__}") + print(f"numpy version: {np.__version__}") + print(f"C-Blosc2 version: {blosc2.blosclib_version}") + print() + + shape = _compute_shape(args.ndim, args.arr_size) + print(f"Using ndim={args.ndim}, arr-size={args.arr_size:_} -> shape={shape}") + + a = create_array(shape) + + n_indices, results = run_benchmark( + a, args.ndim, max_idx=args.max_idx, profile_mem=args.profile_mem + ) + + plot_results( + n_indices, results, args.ndim, args.arr_size, + args.output, profile_mem=args.profile_mem, + ) + + print("\nDone!") + + +if __name__ == "__main__": + main() diff --git a/bench/ndarray/lazy-index.py b/bench/ndarray/lazy-index.py new file mode 100644 index 000000000..67cdceb59 --- /dev/null +++ b/bench/ndarray/lazy-index.py @@ -0,0 +1,208 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### +""" +Profile and benchmark ``a[bool_array]`` on a blosc2 NDArray. + +Compares the lazy path ``a[a < threshold][:]`` against the concrete +boolean-array path ``a[bool_arr]`` and breaks down where the time goes. + +Usage:: + + python bench/ndarray/lazy-index.py + +Optional flags:: + + --ndim Number of dimensions (default: 2) + --arr-size Total number of elements (default: 100_000_000) + --threshold Filter condition value (default: 5) +""" + +from __future__ import annotations + +import argparse +from time import perf_counter + +import numpy as np + +import blosc2 + +# --------------------------------------------------------------------------- +# helpers +# --------------------------------------------------------------------------- + + +def _compute_shape(ndim: int, n_elements: int) -> tuple[int, ...]: + d = int(round(n_elements ** (1.0 / ndim))) + shape = [d] * ndim + shape[0] = max(1, n_elements // int(np.prod(shape[1:]))) + return tuple(shape) + + +# --------------------------------------------------------------------------- +# profiling +# --------------------------------------------------------------------------- + + +def profile_lazy_index(ndim, arr_size, threshold): + print(f"{'='*60}") + print(f"ndim={ndim}, arr-size={arr_size:_}, threshold={threshold}") + print(f"{'='*60}") + print() + + shape = _compute_shape(ndim, arr_size) + n_elements = np.prod(shape) + + # --- create array ---------------------------------------------------- + t0 = perf_counter() + a = blosc2.arange(0, n_elements, shape=shape) + t_create = perf_counter() - t0 + print(f"Array shape: {shape}") + print(f"Total elements: {n_elements:_}") + print(f"Uncompressed size: {a.nbytes/1e9:.2f} GB") + print(f"Chunks: {a.chunks}") + print(f"Number of chunks: {a.schunk.nchunks}") + print(f"Create 
time: {t_create:.3f}s") + print() + + # --- path 1: a[a < threshold][:] (lazy expression) ------------------ + t0 = perf_counter() + result = a[a < threshold][:] + t_lazy = perf_counter() - t0 + + # --- path 2: bool_array = (a < threshold).compute() ; a[bool_array] -- + t0 = perf_counter() + bool_arr = (a < threshold).compute() + t_bool_compute = perf_counter() - t0 + + t0 = perf_counter() + result2 = a[bool_arr] + t_concrete = perf_counter() - t0 + + t_total_bool = t_bool_compute + t_concrete + + print(f"{'--- Path comparison ---':^50}") + print(f"{'Path':<35} {'Time (ms)':<15}") + print(f"{'-'*50}") + print(f"{'a[a < threshold][:] (lazy)':<35} {t_lazy*1000:<15.1f}") + print(f"") + print(f"{' (a8.0f} µs {t_dec*nchunks*1000:>8.1f} ms") + print(f"{'decompress + numexpr eval':<40} {t_dec_ne*1e6:>8.0f} µs {t_dec_ne*nchunks*1000:>8.1f} ms") + print( + f"{'slice bool + decompress + gather':<40} {t_bool_gather*1e6:>8.0f} µs {t_bool_gather*nchunks*1000:>8.1f} ms" + ) + print( + f"{'decompress + eval + gather (lazy)':<40} {t_dec_ne_gather*1e6:>8.0f} µs {t_dec_ne_gather*nchunks*1000:>8.1f} ms" + ) + print() + + # --- hotspot analysis ------------------------------------------------ + print(f"{'--- Hotspot analysis ---':^50}") + print() + print(f"The lazy path (a[a<{threshold}][:]) fuses the comparison into the") + print(f"chunk evaluation, calling numexpr on the decompressed chunk data.") + print() + print(f"The concrete boolean path (a[bool_arr]) was previously ~8× slower") + print(f"because NDArray.__getitem__ called process_key() which invokes") + print(f"np.nonzero() on the boolean array, scanning all {n_elements:_} elements") + print(f"and allocating index arrays — work that was immediately discarded.") + print() + print(f"With the fix (bool array check moved before process_key), the") + print(f"boolean path now takes the same fast LazyExpr route as the lazy path.") + print() + + print(f"{'='*60}") + print("SUMMARY") + print(f"{'='*60}") + print() + print(f" Query 
(lazy): a[a < {threshold}][:]") + print(f" Query (concrete): a[bool_arr] with bool_arr = (a<{threshold}).compute()") + print(f" Matching elements: {result.size} / {n_elements:_} ({result.size/n_elements*100:.5f}%)") + print(f" Lazy path time: {t_lazy*1000:.1f} ms") + print(f" Concrete path time: {t_concrete*1000:.1f} ms") + print(f" Ratio (concrete/lazy): {t_concrete/t_lazy:.1f}x") + print() + + +def parse_args(): + p = argparse.ArgumentParser(description="Profile concrete boolean array indexing") + p.add_argument("--ndim", type=int, default=2, help="Number of dimensions (default: 2)") + p.add_argument( + "--arr-size", type=int, default=100_000_000, help="Total number of elements (default: 100_000_000)" + ) + p.add_argument("--threshold", type=float, default=5, help="Filter threshold value (default: 5)") + return p.parse_args() + + +def main(): + args = parse_args() + print(f"blosc2 version: {blosc2.__version__}") + print(f"numpy version: {np.__version__}") + print(f"C-Blosc2 version: {blosc2.blosclib_version}") + print() + profile_lazy_index(args.ndim, args.arr_size, args.threshold) + print("Done!") + + +if __name__ == "__main__": + main() diff --git a/bench/ndarray/take.py b/bench/ndarray/take.py new file mode 100644 index 000000000..df1db7398 --- /dev/null +++ b/bench/ndarray/take.py @@ -0,0 +1,408 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### +""" +Benchmark ``take()`` / fancy indexing across numpy, blosc2, zarr, and h5py. 
+ +Usage:: + + python bench/ndarray/take.py --ndim 2 --arr-size 100000000 --output take_2d.png + +The script creates an array of *arr-size* elements with *ndim* dimensions, +then measures the time to gather a log-spaced range of random indices +(1 – 100 K). numpy is kept in-memory; blosc2, zarr and h5py use on-disk +storage so the benchmark reflects I/O behaviour of compressed backends. +""" + +from __future__ import annotations + +import argparse +import shutil +import sys +import tempfile +import time +from pathlib import Path + +import h5py +import hdf5plugin +import matplotlib.pyplot as plt +import numpy as np +import psutil +import threading +import time as _time +import zarr +from zarr.codecs import BloscCodec, BytesCodec + +import blosc2 + +# --------------------------------------------------------------------------- +# plot style +# --------------------------------------------------------------------------- +plt.rcParams.update({ + "text.usetex": False, + "font.size": 14, + "figure.dpi": 150, + "savefig.dpi": 150, +}) +plt.style.use("seaborn-v0_8-paper") + +# --------------------------------------------------------------------------- +# helpers +# --------------------------------------------------------------------------- + +def _compute_shape(ndim: int, n_elements: int) -> tuple[int, ...]: + """Roughly-cubic shape with the given number of dimensions.""" + d = int(round(n_elements ** (1.0 / ndim))) + shape = [d] * ndim + # tweak the first dimension so total elements ≈ n_elements + shape[0] = max(1, n_elements // int(np.prod(shape[1:]))) + return tuple(shape) + +# --------------------------------------------------------------------------- +# array creation +# --------------------------------------------------------------------------- + +def _chunks(shape): + """Chunk shape used by all backends (~1/4 of each dimension).""" + return tuple(max(s // 4, 1) for s in shape) + + +def create_arrays(shape, dtype=np.float64, del_source=False): + """Create arrays for 
all four libraries in a shared temp directory.""" + n_elements = np.prod(shape) + data = np.arange(n_elements, dtype=dtype).reshape(shape) + + tmpdir = Path(tempfile.mkdtemp(prefix="take_bench_")) + chunks = _chunks(shape) + + # --- blosc2 --------------------------------------------------------- + t0 = time.time() + b2path = tmpdir / "data.b2nd" + a_b2 = blosc2.asarray(data, chunks=chunks, urlpath=str(b2path), + cparams={"codec": blosc2.Codec.ZSTD, "clevel": 5}) + print(f"Shape: {shape} | n_elements: {n_elements:_} " + f"| itemsize: {data.itemsize} | total: {data.nbytes / 1e9:.2f} GB") + print(f"Chunks: {chunks} | Blocks: {a_b2.blocks}") + print(f"Tmp dir: {tmpdir}") + print(f"blosc2 created in {time.time() - t0:.2f}s " + f"cratio={a_b2.schunk.cratio:.1f}x " + f"cbytes={a_b2.schunk.cbytes / 1e6:.1f} MB") + print() + + # --- numpy ---------------------------------------------------------- + a_np = data.copy() + + # --- zarr ----------------------------------------------------------- + t0 = time.time() + zpath = tmpdir / "data.zarr" + a_z = zarr.open_array(str(zpath), mode="w", shape=shape, dtype=dtype, chunks=chunks, + codecs=[BytesCodec(), + BloscCodec(cname="zstd", clevel=5, shuffle="shuffle")]) + a_z[:] = data + print(f"zarr created in {time.time() - t0:.2f}s") + + # --- h5py ---------------------------------------------------------- + t0 = time.time() + h5path = tmpdir / "data.h5" + h5f = h5py.File(str(h5path), "w") + a_h5 = h5f.create_dataset("data", data=data, chunks=chunks, + **hdf5plugin.Blosc2(cname="zstd", clevel=5, filters=1)) + print(f"h5py created in {time.time() - t0:.2f}s") + print() + + if del_source: + del data + + return a_b2, a_np, a_z, a_h5, tmpdir + + +# --------------------------------------------------------------------------- +# benchmark runner +# --------------------------------------------------------------------------- + +import psutil + + +def _peak_memory(func, *args, **kwargs): + """Return RSS memory increase (MB) after *func(*args, 
**kwargs). + + The output of *func* is held alive during measurement so its + allocations are reflected in the post-call RSS. + Returns the maximum of two measurements: + 1. Peak RSS observed by a background sampler (catches transient C malloc). + 2. Post-call RSS delta (catches retained output arrays). + """ + proc = psutil.Process() + before = proc.memory_info().rss + peak = [before] + stop = threading.Event() + + def sample(): + while not stop.is_set(): + rss = proc.memory_info().rss + if rss > peak[0]: + peak[0] = rss + _time.sleep(0.001) + + t = threading.Thread(target=sample, daemon=True) + t.start() + result = func(*args, **kwargs) + stop.set() + t.join(timeout=0.1) + + after = proc.memory_info().rss + _ = result # keep alive so retained output is counted + delta_peak = (peak[0] - before) / (1024 * 1024) + delta_after = (after - before) / (1024 * 1024) + return max(delta_peak, delta_after) + + +def _select_indices(rng, size, n_indices): + """Return a sorted, unique 1-D int64 array of ~*n_indices* random indices. 
+ + Indices are sorted and deduplicated so that h5py (which requires + strictly increasing order) can participate fairly.""" + idx = np.unique(rng.integers(0, size, size=n_indices, dtype=np.int64)) + return idx + + +def run_benchmark(a_b2, a_np, a_z, a_h5, ndim, n_runs=3, sparse=False, + profile_mem=False): + """Run the fancy-indexing benchmark for a range of index counts.""" + shape = a_np.shape + size = a_np.size if sparse else shape[0] # flat size for sparse, axis-0 for orthogonal + max_indices = min(100_000, size) + + n_indices_list = np.unique( + np.logspace(0, np.log10(max(1, max_indices)), num=12, dtype=np.int64) + ) + print(f"Index counts: {n_indices_list.tolist()}") + + if profile_mem: + print("(memory-profiling mode, 1 run per point)") + print() + + rng = np.random.default_rng(42) + + results = { + "numpy": [], + "blosc2": [], + "zarr": [], + "h5py": [], + } + actual_counts = [] + + for n_idx in n_indices_list: + idx = _select_indices(rng, size, int(n_idx)) + n_actual = len(idx) # may be less after dedup + + if profile_mem: + # --- memory profiling --------------------------------------- + if sparse: + # zarr/h5py lack sparse gather — measure full-read + np.take + def _b2(): + return blosc2.take(a_b2, idx, axis=None)[:] + def _np(): + return np.take(a_np, idx, axis=None) + def _zarr(): + return np.take(a_z[:], idx, axis=None) + def _h5(): + return np.take(a_h5[:], idx, axis=None) + else: + def _b2(): + return blosc2.take(a_b2, idx, axis=0)[:] + def _np(): + return np.take(a_np, idx, axis=0) + def _zarr(): + if ndim == 1: + return a_z.oindex[(idx,)] + sel = (idx,) + (slice(None),) * (ndim - 1) + return a_z.oindex[sel] + def _h5(): + sel = (idx.tolist(),) + (slice(None),) * (ndim - 1) + return a_h5[sel] + + results["numpy"].append(_peak_memory(_np)) + results["blosc2"].append(_peak_memory(_b2)) + results["zarr"].append(_peak_memory(_zarr)) + results["h5py"].append(_peak_memory(_h5)) + + print( + f" n_indices={n_actual:>7}: " + 
f"numpy={results['numpy'][-1]:.1f} MB " + f"blosc2={results['blosc2'][-1]:.1f} MB " + f"zarr={results['zarr'][-1]:.1f} MB " + f"h5py={results['h5py'][-1]:.1f} MB" + ) + actual_counts.append(n_actual) + continue + elif sparse: + # --- sparse path (axis=None, flat element gather) ------------- + # numpy + elapsed = [] + for _ in range(n_runs): + t0 = time.perf_counter() + _ = a_np.flat[idx] + elapsed.append(time.perf_counter() - t0) + results["numpy"].append(np.min(elapsed)) + + # blosc2 — uses b2nd_get_sparse_cbuffer + elapsed = [] + for _ in range(n_runs): + t0 = time.perf_counter() + _ = blosc2.take(a_b2, idx, axis=None)[:] + elapsed.append(time.perf_counter() - t0) + results["blosc2"].append(np.min(elapsed)) + + # zarr — no native sparse; full read + numpy.take + elapsed = [] + for _ in range(n_runs): + t0 = time.perf_counter() + _ = np.take(a_z[:], idx, axis=None) + elapsed.append(time.perf_counter() - t0) + results["zarr"].append(np.min(elapsed)) + + # h5py — no native sparse; full read + numpy.take + elapsed = [] + for _ in range(n_runs): + t0 = time.perf_counter() + _ = np.take(a_h5[:], idx, axis=None) + elapsed.append(time.perf_counter() - t0) + results["h5py"].append(np.min(elapsed)) + else: + # --- orthogonal path (axis=0, row/slab selection) ------------- + # numpy + elapsed = [] + for _ in range(n_runs): + t0 = time.perf_counter() + _ = a_np[idx] + elapsed.append(time.perf_counter() - t0) + results["numpy"].append(np.min(elapsed)) + + # blosc2 — __getitem__ → _try_sparse_fancy_index → _take_numpy + elapsed = [] + for _ in range(n_runs): + t0 = time.perf_counter() + _ = a_b2[idx] + elapsed.append(time.perf_counter() - t0) + results["blosc2"].append(np.min(elapsed)) + + # zarr + elapsed = [] + if ndim == 1: + for _ in range(n_runs): + t0 = time.perf_counter() + _ = a_z.oindex[(idx,)] + elapsed.append(time.perf_counter() - t0) + else: + sel = (idx,) + (slice(None),) * (ndim - 1) + for _ in range(n_runs): + t0 = time.perf_counter() + _ = a_z.oindex[sel] + 
elapsed.append(time.perf_counter() - t0) + results["zarr"].append(np.min(elapsed)) + + # h5py + elapsed = [] + sel = (idx.tolist(),) + (slice(None),) * (ndim - 1) + for _ in range(n_runs): + t0 = time.perf_counter() + _ = a_h5[sel] + elapsed.append(time.perf_counter() - t0) + results["h5py"].append(np.min(elapsed)) + + print( + f" n_indices={n_actual:>7}: " + f"numpy={results['numpy'][-1]:.4f}s " + f"blosc2={results['blosc2'][-1]:.4f}s " + f"zarr={results['zarr'][-1]:.4f}s " + f"h5py={results['h5py'][-1]:.4f}s" + ) + actual_counts.append(n_actual) + + return np.array(actual_counts), results + + +# --------------------------------------------------------------------------- +# plotting +# --------------------------------------------------------------------------- + +COLORS = {"numpy": "#1f77b4", "blosc2": "#ff7f0e", "zarr": "#2ca02c", "h5py": "#d62728"} +MARKERS = {"numpy": "o", "blosc2": "s", "zarr": "^", "h5py": "D"} + + +def plot_results(n_indices, results, ndim, arr_size, output, sparse=False, profile_mem=False): + fig, ax = plt.subplots(figsize=(10, 6)) + + for label, times in results.items(): + ax.plot( + n_indices, times, color=COLORS[label], marker=MARKERS[label], + label=label, linewidth=2, markersize=7, + ) + + ax.set_xscale("log") + if not profile_mem: + ax.set_yscale("log") + ax.set_xlabel("Number of indices") + ax.set_ylabel("Peak memory (MB)" if profile_mem else "Time (s)") + mode = "sparse" if sparse else "fancy-indexing" + suffix = " — memory" if profile_mem else "" + ax.set_title(f"{mode} benchmark{suffix} — ndim={ndim}, arr-size={arr_size:_}") + ax.legend() + ax.grid(True, which="both", alpha=0.3) + + fig.tight_layout() + + if output: + fig.savefig(output) + print(f"\nPlot saved to {output}") + else: + plt.show() + + +# --------------------------------------------------------------------------- +# main +# --------------------------------------------------------------------------- + +def parse_args(): + p = 
argparse.ArgumentParser(description="Benchmark take() across numpy/blosc2/zarr/h5py") + p.add_argument("--ndim", type=int, default=1, help="Number of dimensions (default: 1)") + p.add_argument( + "--arr-size", type=int, default=100_000_000, + help="Total number of elements (default: 100M)", + ) + p.add_argument("--output", type=str, default=None, + help="Path to save the plot (PNG). If omitted, the plot is shown.") + p.add_argument("--sparse", action="store_true", + help="Use axis=None (flat element gather via b2nd_get_sparse_cbuffer).") + p.add_argument("--profile-mem", action="store_true", + help="Measure peak memory (MB) per library (tracemalloc). Skips numpy.") + return p.parse_args() + + +def main(): + args = parse_args() + shape = _compute_shape(args.ndim, args.arr_size) + dtype = np.float64 + + a_b2, a_np, a_z, a_h5, tmpdir = create_arrays(shape, dtype, + del_source=args.profile_mem) + + try: + n_indices, results = run_benchmark(a_b2, a_np, a_z, a_h5, args.ndim, + sparse=args.sparse, + profile_mem=args.profile_mem) + plot_results(n_indices, results, args.ndim, args.arr_size, args.output, + sparse=args.sparse, profile_mem=args.profile_mem) + finally: + # Cleanup temp files + if tmpdir.exists(): + shutil.rmtree(tmpdir, ignore_errors=True) + + +if __name__ == "__main__": + main() diff --git a/bench/tree-store.py b/bench/tree-store.py new file mode 100644 index 000000000..f5bc8cac8 --- /dev/null +++ b/bench/tree-store.py @@ -0,0 +1,330 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +""" +Benchmark for TreeStore hierarchical creation, opening, and listing. + +Creates a hierarchy of N1 levels, each with N2 NDArray leaves and one +CTable (4 cols: bool, int, float, string) with N5 rows. 
Leaf ``N`` +receives an *N*-dimensional array (leaf0 is 0‑d, leaf1 is 1‑d, …) with +each side ``int(MAX_ELEMS ** (1/N))`` so that no array exceeds MAX_ELEMS +elements. Everything is written to ``tree-store.b2z`` and the script +measures: + +- Creation time (including compression) +- Opening time +- Listing time (walking all nodes and grabbing meta info) +""" + +import argparse +import dataclasses +import os +import time + +import numpy as np + +import blosc2 + +OUTPUT_FILE = "tree-store.b2z" + +# ── Row schema for the CTable ──────────────────────────────────────────── + + +@dataclasses.dataclass +class _Row: + a: bool = blosc2.field(blosc2.bool(), default=False) + b: int = blosc2.field(blosc2.int64(), default=0) + c: float = blosc2.field(blosc2.float64(), default=0.0) + d: str = "" + + +# ── Helpers ────────────────────────────────────────────────────────────── + + +def _clean(path: str) -> None: + """Remove *path* if it exists (file or directory).""" + if os.path.exists(path): + if os.path.isdir(path): + import shutil + + shutil.rmtree(path) + else: + os.remove(path) + + +def _fmt_bytes(nbytes: int) -> str: + """Human-friendly byte size.""" + for unit in ("B", "KB", "MB", "GB"): + if nbytes < 1024: + return f"{nbytes:.1f} {unit}" + nbytes /= 1024 + return f"{nbytes:.1f} TB" + + +# ── Benchmark steps ────────────────────────────────────────────────────── + + +def _leaf_shape(ndim: int, max_elems: int) -> tuple[int, ...]: + """Return a shape tuple for an *ndim*-dimensional array. + + For ndim == 0 the shape is ``()`` (scalar). Otherwise each side is + ``int(max_elems ** (1 / ndim))``, capped so the total never exceeds + *max_elems*. 
+ """ + if ndim == 0: + return () + side = int(max_elems ** (1.0 / ndim)) + return (side,) * ndim + + +def create_store( + nlevels: int, nleaves: int, max_elems: int, nrows: int, + no_vlmeta: bool = False, +) -> tuple[float, int]: + """Create the TreeStore; return (wall_clock, total_elements_written).""" + _clean(OUTPUT_FILE) + + # Pre-build one array per unique dimensionality (leaf ``i`` → *i*‑d). + leaf_arrays_np: dict[int, np.ndarray] = {} + for ndim in range(nleaves): + shape = _leaf_shape(ndim, max_elems) + nelem = int(np.prod(shape)) if shape else 1 + if ndim == 0: + # linspace does not support 0‑d outputs; use a 0‑d array + if not no_vlmeta: + # blosc2 scalar so we can set vlmeta before storing + leaf_arrays_np[ndim] = blosc2.asarray(np.array(0.5, dtype=np.float64)) + else: + leaf_arrays_np[ndim] = np.array(0.5, dtype=np.float64) + else: + leaf_arrays_np[ndim] = blosc2.linspace(0, 1, num=nelem, + shape=shape, dtype=np.float64) + + total_elements = sum( + leaf_arrays_np[ndim].size for ndim in range(nleaves) + ) * nlevels + + # Pre-populate a single CTable that we will copy for every level. + tmpl_table = blosc2.CTable(_Row, expected_size=nrows, validate=False) + rows = [ + (i % 2 == 0, i, float(i) * 1.5, f"str_{i:06d}") for i in range(nrows) + ] + tmpl_table.extend(rows, validate=False) + + print(f"\nCreating TreeStore with {nlevels} level(s), " + f"{nleaves} leave(s) each, {nrows} CTable row(s) per level...") + print(f" Max elements per leaf: {max_elems:,}") + for ndim in range(min(nleaves, 10)): + shape = _leaf_shape(ndim, max_elems) + nelem = int(np.prod(shape)) if shape else 1 + print(f" leaf{ndim}: shape={shape}, elements={nelem:,}, " + f"uncompressed={_fmt_bytes(nelem * 8)}") + if nleaves > 10: + print(f" ... 
({nleaves - 10} more)") + print(f" CTable rows: {nrows} | " + f"uncompressed table size: {_fmt_bytes(tmpl_table.nbytes)}") + + t0 = time.perf_counter() + tstore = blosc2.TreeStore(OUTPUT_FILE, mode="w") + + try: + if not no_vlmeta: + tstore.vlmeta["author"] = "benchmark" + tstore.vlmeta["purpose"] = "testing" + tstore.vlmeta["commit"] = "abc123" + for level in range(nlevels): + parent = f"/level{level}" + # Store NDArray leaves – each leaf gets the array for its dimension + for leaf in range(nleaves): + key = f"{parent}/leaf{leaf}" + arr = leaf_arrays_np[leaf] + if not no_vlmeta: + # Add diverse vlmeta types + arr.vlmeta["is_even"] = leaf % 2 == 0 # bool + arr.vlmeta["index"] = leaf # int + arr.vlmeta["value"] = float(leaf) * 0.5 # float + arr.vlmeta["complex"] = f"{leaf}+{leaf*2}j" # complex as string + arr.vlmeta["label"] = f"leaf_{leaf}" # string + arr.vlmeta["tags"] = [f"tag_{leaf}", f"tag_{leaf+1}"] # list + arr.vlmeta["coords"] = [leaf, leaf * 2] # list (vlmeta compatible) + arr.vlmeta["meta"] = {"key": f"val_{leaf}", "n": leaf} # dict + tstore[key] = arr + + # Store one CTable per level + table_key = f"{parent}/ctable" + tstore[table_key] = tmpl_table + if not no_vlmeta: + # Set vlmeta on the stored CTable while still in write mode + ct = tstore[table_key] + ct.vlmeta["description"] = f"Level {level} CTable" + ct.vlmeta["author"] = "blosc2" + ct.vlmeta["ncols"] = 4 + ct.vlmeta["has_index"] = True + ct.vlmeta["tags_list"] = ["benchmark", "testing", f"level_{level}"] + + if (level + 1) % max(1, nlevels // 10) == 0 or level == nlevels - 1: + print(f" Level {level + 1}/{nlevels} done " + f"({time.perf_counter() - t0:.2f}s so far)") + finally: + tstore.close() + + elapsed = time.perf_counter() - t0 + return elapsed, total_elements + + +def open_store() -> float: + """Open the store read-only and return wall-clock time.""" + print("\nOpening store (mode='r') ...") + t0 = time.perf_counter() + tstore = blosc2.open(OUTPUT_FILE, mode="r") + elapsed = 
def open_and_list() -> tuple[float, float]:
    """Open the store read-only, walk it, and time both phases separately.

    The open timer covers only the ``blosc2.open`` call.  The listing timer
    covers the walk from ``/`` that fetches every node and classifies it by
    attribute: a node with ``shape`` counts as an array, otherwise a node with
    ``nrows`` counts as a CTable.

    Returns:
        ``(open_time, list_time)`` in seconds.
    """
    print("\nOpening + listing store ...")
    t0 = time.perf_counter()
    tstore = blosc2.open(OUTPUT_FILE, mode="r")
    t_open = time.perf_counter() - t0

    n_arrays = 0
    n_tables = 0
    t1 = time.perf_counter()
    # Close the store even if the walk fails part-way, matching list_store().
    try:
        for path, _children, nodes in tstore.walk("/"):
            for node_name in nodes:
                full_path = f"{path}/{node_name}".replace("//", "/")
                node = tstore[full_path]
                if hasattr(node, "shape"):
                    n_arrays += 1
                elif hasattr(node, "nrows"):
                    n_tables += 1
        t_list = time.perf_counter() - t1
    finally:
        tstore.close()

    print(f"  Open: {t_open:.3f}s | Listing: {t_list:.3f}s "
          f"({n_arrays} array(s), {n_tables} CTable(s))")
    return t_open, t_list
%(default)s)", + ) + parser.add_argument( + "--nleaves", type=int, default=10, + help="Number of NDArray leaves per level (default: %(default)s)", + ) + parser.add_argument( + "--max-elems", type=int, default=1_000_000, + help="Max elements per leaf; leafN gets N-d shape with " + "side = int(max_elems^(1/N)) (default: %(default)s)", + ) + parser.add_argument( + "--nrows", type=int, default=1000, + help="Number of rows in the per-level CTable (default: %(default)s)", + ) + parser.add_argument( + "--no-create", action="store_true", + help="Skip creation; only open/list an existing file", + ) + parser.add_argument( + "--no-vlmeta", action="store_true", + help="Skip adding vlmeta attributes to leaves and groups", + ) + args = parser.parse_args() + + total_elements = 0 + if not args.no_create: + t_create, total_elements = create_store( + args.nlevels, args.nleaves, args.max_elems, args.nrows, + no_vlmeta=args.no_vlmeta, + ) + else: + if not os.path.exists(OUTPUT_FILE): + parser.error( + f"--no-create was passed but {OUTPUT_FILE} does not exist." + ) + t_create = None + + t_open, t_list = open_and_list() + + # Summary + total_objects = args.nlevels * (args.nleaves + 1) # leaves + one CTable + # If we didn't create, estimate total elements from the store itself + if total_elements == 0: + total_elements = args.nlevels * sum( + int(np.prod(_leaf_shape(d, args.max_elems))) + if _leaf_shape(d, args.max_elems) else 1 + for d in range(args.nleaves) + ) + total_data_bytes = ( + total_elements * 8 + + args.nlevels * args.nrows * (1 + 8 + 8 + 16) # rough for table + ) + file_size = os.path.getsize(OUTPUT_FILE) + + print("\n" + "=" * 60) + print("BENCHMARK SUMMARY") + print("=" * 60) + print(f" Levels: {args.nlevels}") + print(f" Leaves per level: {args.nleaves}") + print(f" Max elems per leaf: {args.max_elems:,}") + print(f" CTable rows/level: {args.nrows}") + print(f" Total objects: {total_objects}") + print(f" Est. 
uncompressed: {_fmt_bytes(total_data_bytes)}") + print(f" File size on disk: {_fmt_bytes(file_size)}") + print(f" Compression ratio: {total_data_bytes / file_size:0.2f}x") + if t_create is not None: + print(f"\n Creation time: {t_create:0.3f}s") + print(f" Write throughput: " + f"{total_data_bytes / t_create / 1e9:0.2f} GB/s") + print(f"\n Open time: {t_open:0.3f}s") + print(f" List (walk) time: {t_list:0.3f}s") + print(f"\n Output file: {OUTPUT_FILE}") + + +if __name__ == "__main__": + main() diff --git a/doc/reference/ctable.rst b/doc/reference/ctable.rst index 0d6553f4e..84b0c270e 100644 --- a/doc/reference/ctable.rst +++ b/doc/reference/ctable.rst @@ -234,7 +234,8 @@ CTable indexing is type-driven:: t["amount"] # column access t[3] # one row as a namedtuple-like object t[3:8] # row view - t[[1, 4, 7]] # gathered-row view + t[[1, 4, 7]] # gathered-row view (mask-based) + t.take([1, 4, 1]) # materialized row gather preserving order/duplicates t[mask] # filtered row view t[t.amount > 100] # LazyExpr filtered row view, like where() t[["region", "amount"]] # projected column view @@ -257,6 +258,7 @@ When a NumPy structured array is needed, materialize explicitly:: CTable.where CTable.view + CTable.take CTable.select CTable.head CTable.tail @@ -267,6 +269,7 @@ When a NumPy structured array is needed, materialize explicitly:: .. automethod:: CTable.where .. automethod:: CTable.view +.. automethod:: CTable.take .. automethod:: CTable.select .. automethod:: CTable.head .. automethod:: CTable.tail @@ -524,10 +527,12 @@ Data access .. autosummary:: Column.view + Column.take Column.iter_chunks Column.assign .. autoproperty:: Column.view +.. automethod:: Column.take .. automethod:: Column.iter_chunks .. 
automethod:: Column.assign diff --git a/doc/reference/ndarray.rst b/doc/reference/ndarray.rst index 8ea6a4642..de44ad4d9 100644 --- a/doc/reference/ndarray.rst +++ b/doc/reference/ndarray.rst @@ -22,6 +22,7 @@ In addition, all the functions from the :ref:`LazyArray` section can be used wit __len__ __getitem__ __setitem__ + take Utility Methods --------------- @@ -30,6 +31,7 @@ In addition, all the functions from the :ref:`LazyArray` section can be used wit .. automethod:: __len__ .. automethod:: __getitem__ .. automethod:: __setitem__ + .. automethod:: take Constructors ------------ diff --git a/examples/ctable/querying.py b/examples/ctable/querying.py index c6bf2e77b..94676012c 100644 --- a/examples/ctable/querying.py +++ b/examples/ctable/querying.py @@ -57,6 +57,15 @@ class Sale: print(f"North region + amount > 100: {len(north_big)} rows") print(north_big) +# -- materialized gather via take() ----------------------------------------- +# Unlike mask-based views, take() preserves order and duplicate positions. +priority = t.take([7, 1, 7]) +print("Priority sales (order and duplicates preserved):") +print(priority[["id", "region", "amount"]]) + +# Column.take() applies the same logical-row gather to a single column. +print("Priority amounts:", t.amount.take([7, 1, 7])[:].tolist()) + # -- column projection via [] (no data copy) -------------------------------- slim = t[["id", "amount"]] print("id + amount only:") diff --git a/examples/ndarray/fancy-indexing.py b/examples/ndarray/fancy-indexing.py new file mode 100644 index 000000000..1ac2c812a --- /dev/null +++ b/examples/ndarray/fancy-indexing.py @@ -0,0 +1,90 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Example showing fancy indexing (__getitem__) with integer arrays on +# 1-D and 3-D blosc2 NDArrays. +# +# Fancy indexing with integer arrays uses the same efficient +# b2nd_get_sparse_cbuffer() path as NDArray.take(), decompressing +# only the specific blocks holding the requested elements. +# +# This covers expressions like: +# a[[0, 3, 5]] — 1-D index on any dimensionality +# a[[[0, 3], [5, 2]]] — multi-dimensional index on any dimensionality +# +# Boolean masks and tuple fancy indexing (e.g. a[[0, 2], [1, 3]]) +# still use the existing fancy-indexing machinery. + +import numpy as np + +import blosc2 + +# ============================================================ +# 1-D array +# ============================================================ +print("=== 1-D array ===") +a = blosc2.arange(20, dtype=np.int32) + +# 1-D integer index +print(f"a = {a[:]}") +print(f"a[[0, 5, 12, 19]] = {a[[0, 5, 12, 19]]}") +print() + +# Multi-dimensional integer index (2-D) +print(f"a[[[1, 3], [5, 7]]] =\n{a[[[1, 3], [5, 7]]]}") +print() + +# ============================================================ +# 3-D array +# ============================================================ +print("=== 3-D array ===") +shape = (4, 5, 6) # 120 elements total +a = blosc2.asarray(np.arange(120, dtype=np.float64).reshape(shape), chunks=(2, 3, 4), blocks=(2, 2, 2)) + +# 1-D index selects along axis 0 +print(f"shape = {shape}") +print("a[[0, 2, 3]] — selects rows 0, 2, 3 along axis 0") +print(f"result shape: {a[[0, 2, 3]].shape}") +print(f"result:\n{a[[0, 2, 3]]}") +print() + +# 2-D index — result shape = index shape + remaining dims +print("2-D index:") +print("a[[[0, 2], [3, 1]]]") +print(f"result shape: {a[[[0, 2], [3, 1]]].shape}") +print(f"result:\n{a[[[0, 2], [3, 1]]]}") +print() + +# Negative and duplicate indices +print("Negative and duplicate indices:") +print(f"a[[-1, 0, -1, 
2]] shape: {a[[-1, 0, -1, 2]].shape}") +print(f"result:\n{a[[-1, 0, -1, 2]]}") +print() + +# Empty index +print("Empty index:") +print(f"a[[]] shape: {a[[]].shape}, value: {a[[]]}") +print() + +# ============================================================ +# Boolean masks +# ============================================================ +print("=== Boolean mask ===") +mask = np.array([True, False, True, False]) +print(f"mask = {mask.tolist()}") +print(f"a[mask] shape: {a[mask].shape}") +print(f"result:\n{a[mask]}") +print() + +# ============================================================ +# In summary +# ============================================================ +print("=== Summary ===") +print("a[[0, 3, 5]] — integer array on any dims → b2nd sparse gather") +print("a[[[0, 3], [5, 2]]] — multi-dim integer array → b2nd sparse gather") +print("a[[True, False, ...]] — boolean mask → existing fancy path") +print("a[[0, 2], [1, 3]] — tuple fancy indexing → existing fancy path") diff --git a/examples/ndarray/take.py b/examples/ndarray/take.py new file mode 100644 index 000000000..23e8777bf --- /dev/null +++ b/examples/ndarray/take.py @@ -0,0 +1,99 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Example showing `blosc2.take()` on 1-D and 3-D arrays. +# +# `take()` follows the Array API shape rules: +# - axis=None : the array is flattened conceptually and the result +# has the same shape as *indices*. +# - axis=<int> : the indexed axis is replaced by *indices.shape*. +# +# Behind the scenes `take()` uses `b2nd_get_sparse_cbuffer()`, which +# decompresses *only* the specific blocks holding the requested elements. +# This is much more efficient than decompressing entire chunks, especially +# for large multi-dimensional arrays. 
+ +import numpy as np + +import blosc2 + +# ============================================================ +# 1-D array +# ============================================================ +print("=== 1-D array ===") +a = blosc2.arange(20, dtype=np.int32) + +# Take specific elements by flat index (axis=None is the default) +result = blosc2.take(a, [0, 5, 12, 19]) +print(f"a = {a[:]}") +print(f"blosc2.take(a, [0, 5, 12, 19]) = {result[:]}") +print() + +# Multi-dimensional index array: result shape = indices shape +result = blosc2.take(a, [[1, 3], [5, 7]]) +print(f"blosc2.take(a, [[1, 3], [5, 7]]) =\n{result[:]}") +print() + +# ============================================================ +# 3-D array, axis=None (flattened) +# ============================================================ +print("=== 3-D array, axis=None (flattened) ===") +shape = (4, 5, 6) # 120 elements total +a = blosc2.asarray(np.arange(120, dtype=np.float64).reshape(shape), chunks=(2, 3, 4), blocks=(2, 2, 2)) + +# Flat indices into the 120-element buffer +result = blosc2.take(a, [0, 50, 119]) +print(f"shape = {shape}") +print(f"blosc2.take(a, [0, 50, 119]) = {result[:]}") +print() + +# ============================================================ +# 3-D array, axis=0 (gather along first dimension) +# ============================================================ +print("=== 3-D array, axis=0 ===") +result = blosc2.take(a, [0, 2, 3], axis=0) +print(f"shape = {shape}, axis=0, indices = [0, 2, 3]") +print(f"result shape: {result.shape}") +print(f"result:\n{result[:]}") +print() + +# ============================================================ +# 3-D array, axis=1 (gather along second dimension) +# ============================================================ +print("=== 3-D array, axis=1 ===") +result = blosc2.take(a, [0, 3, 4], axis=1) +print(f"shape = {shape}, axis=1, indices = [0, 3, 4]") +print(f"result shape: {result.shape}") +print(f"result:\n{result[:]}") +print() + +# 
============================================================ +# 3-D array, axis=2 (gather along third dimension) +# ============================================================ +print("=== 3-D array, axis=2 ===") +result = blosc2.take(a, [0, 3, 5], axis=2) +print(f"shape = {shape}, axis=2, indices = [0, 3, 5]") +print(f"result shape: {result.shape}") +print(f"result:\n{result[:]}") +print() + +# ============================================================ +# Multi-dimensional indices and negative/duplicate indices +# ============================================================ +print("=== Multi-dimensional indices (axis=1) ===") +result = blosc2.take(a, [[0, 3], [2, 4]], axis=1) +print(f"shape = {shape}, axis=1") +print("indices (2-D) = [[0, 3], [2, 4]]") +print(f"result shape: {result.shape}") +print(f"result:\n{result[:]}") +print() + +print("=== Negative and duplicate indices (axis=1) ===") +result = blosc2.take(a, [-1, 0, -1, 2, 2], axis=1) +print("indices = [-1, 0, -1, 2, 2]") +print(f"result shape: {result.shape}") +print(f"result:\n{result[:]}") diff --git a/plans/b2view.md b/plans/b2view.md new file mode 100644 index 000000000..b77bfe65e --- /dev/null +++ b/plans/b2view.md @@ -0,0 +1,481 @@ +# b2view: TreeStore TUI Viewer Plan + +## Goal + +Create a read-only terminal user interface named `b2view` for browsing Blosc2 `TreeStore` hierarchies stored as `.b2d` directories or `.b2z` files. The viewer should allow users to navigate groups, arrays, and ctable/table-like objects, inspect metadata, and preview data without eagerly loading large datasets into memory. + +## Primary Use Cases + +- Open a `.b2d` or `.b2z` TreeStore from the command line. +- Browse the hierarchical structure interactively. +- Distinguish groups, arrays, and ctable/table objects visually. +- Inspect object metadata such as shape, dtype, chunks, compression, filters, and user attributes. +- Preview small slices of arrays. +- Preview rows and columns of ctables. 
+- Navigate large objects safely using paging/slicing controls. + +## Proposed Command + +```bash +b2view path/to/store.b2d +b2view path/to/store.b2z +``` + +Optional future flags: + +```bash +b2view store.b2d --path /experiments/run_001 +b2view store.b2d --readonly +b2view store.b2d --preview-rows 50 +b2view store.b2d --theme dark +``` + +## Recommended Technology + +Use **Textual** as the TUI framework, with **Rich** for rendering metadata, tables, and formatted values. + +Reasons: + +- Built-in tree widgets are suitable for TreeStore hierarchy browsing. +- Supports split-pane layouts, tabs, scrollable panels, modals, keybindings, and mouse interaction. +- Rich integration is excellent for tables, pretty-printed dicts, JSON-like metadata, and styled output. +- Easier to maintain than raw curses or urwid. +- Async/background task support is useful for lazy metadata/data loading. + +Alternatives considered: + +- `curses`: too low-level for this UI. +- `urwid`: mature, but more cumbersome for modern layouts. +- `prompt_toolkit`: excellent for prompts/REPLs, less ideal for a full-screen browser. + +## High-Level UI Layout + +Initial layout: + +```text +┌──────────────────── TreeStore ────────────────────┬────────────────────────────┐ +│ / │ Object info │ +│ ├── experiments │ path: /experiments/run_001 │ +│ │ ├── run_001 │ type: NDArray │ +│ │ │ ├── signal │ shape: (10000, 128) │ +│ │ │ └── events │ dtype: float32 │ +│ │ └── run_002 │ chunks: ... │ +│ └── metadata │ compression: zstd │ +├─────────────────────────────────────────────────────┴────────────────────────────┤ +│ Data preview │ +│ │ +│ array/table contents here │ +└───────────────────────────────────────────────────────────────────────────────────┘ +``` + +Core panels: + +1. **Hierarchy tree** + - Shows groups and children. + - Uses different icons/styles for groups, arrays, ctables, and unknown objects. + - Loads children lazily when nodes are expanded. + +2. 
**Metadata/details panel** + - Updates when a node is selected. + - Shows core metadata and storage/compression information. + - Shows user metadata/attributes if present. + +3. **Data preview panel** + - Shows a small preview of the selected object. + - For arrays, shows a bounded slice. + - For ctables, shows the first page of rows and selected columns. + - Should never materialize a large full object by default. + +Potential future panels: + +- Search/find panel. +- Slice/query input panel. +- Statistics panel. +- Histogram/summary visualization panel. +- Export dialog. + +## Read-Only First + +The first version should be strictly read-only. + +Avoid: + +- Editing metadata. +- Deleting nodes. +- Renaming nodes. +- Writing modified arrays/tables. + +This keeps the first implementation safe and avoids accidental mutation of user stores. + +## Lazy Loading Requirements + +Lazy loading is central to the design. + +Startup should: + +1. Validate/open the store. +2. Populate only the root node and immediate children if cheap. +3. Avoid recursively scanning the entire tree. +4. Avoid loading array/table data. + +On tree expansion: + +- Load only the selected node's children. +- Cache child listings where appropriate. +- Provide a refresh command later if the underlying store changes. + +On node selection: + +- Load lightweight metadata. +- Render metadata immediately. +- Load data preview separately, ideally in a background task. + +On data preview: + +- Use small bounded reads. +- Provide paging or slicing controls. +- Catch and display errors without crashing the TUI. 
+ +## Suggested Package Structure + +If included inside `python-blosc2`: + +```text +src/blosc2/b2view/ + __init__.py + cli.py # console entry point + app.py # Textual App subclass and layout + model.py # TreeStore adapter / browser abstraction + widgets.py # custom widgets/panels + render.py # Rich renderables for metadata and previews + keys.py # keybinding constants/help text, optional +``` + +Potential tests: + +```text +tests/test_b2view_model.py +tests/test_b2view_render.py +``` + +Console script: + +```toml +[project.scripts] +b2view = "blosc2.b2view.cli:main" +``` + +If Textual is considered too heavy for the base install, make it an optional dependency: + +```toml +[project.optional-dependencies] +tui = ["textual", "rich"] +``` + +Then document installation as: + +```bash +pip install "blosc2[tui]" +``` + +## Backend Abstraction + +The UI should not directly depend on many TreeStore internals. Add a small model layer that exposes a stable browsing API. + +Example sketch: + +```python +@dataclass +class NodeInfo: + path: str + name: str + kind: str # group, ndarray, ctable, unknown + has_children: bool | None = None + + +@dataclass +class ObjectInfo: + path: str + kind: str + metadata: dict + user_attrs: dict | None = None + + +class StoreBrowser: + def __init__(self, urlpath: str): ... + + def list_children(self, path: str) -> list[NodeInfo]: ... + + def get_info(self, path: str) -> ObjectInfo: ... + + def preview( + self, path: str, *, start: int = 0, stop: int = 20, columns=None, slices=None + ): ... +``` + +Benefits: + +- Keeps Textual code clean. +- Makes unit testing easier. +- Allows later support for other stores/backends. +- Centralizes object kind detection and safe preview logic. + +## Object Kind Detection + +The browser layer should classify nodes as: + +- `group`: hierarchy-only container. +- `ndarray`: Blosc2 array object. +- `ctable`: ctable/table-like object. +- `scalar` or `metadata`: optional future classification. 
+- `unknown`: fallback for unsupported objects. + +Detection should be robust and avoid expensive reads. Prefer metadata/type information available from TreeStore before opening or materializing objects. + +## Metadata Display + +The metadata panel should group information into sections. + +Suggested sections: + +### General + +- Path +- Name +- Object kind +- Shape +- Number of dimensions +- Dtype +- Number of rows, for tables +- Number of columns, for tables +- Logical size / nbytes when available + +### Storage + +- Store type: `.b2d` or `.b2z` +- Chunks/blockshape +- Chunk count if available cheaply +- Contiguity / urlpath details +- Compression codec +- Compression level +- Filters +- Split mode / special parameters if relevant + +### Table Schema + +For ctables: + +- Column names +- Column dtypes +- Column shapes if nested or multidimensional columns are supported +- Nullable/missing-value information if applicable + +### User Metadata + +- Attributes +- Application metadata +- Any serialized user metadata stored with the object + +Use Rich renderables: + +- `rich.table.Table` for key/value metadata. +- `rich.tree.Tree` or nested tables for structured metadata. +- `rich.pretty.Pretty` for dict-like values. +- JSON syntax highlighting for JSON-compatible metadata. + +## Data Preview Behavior + +### NDArray Preview + +Default behavior should depend on dimensionality: + +- 0-D: show scalar value. +- 1-D: show `arr[:N]`. +- 2-D: show `arr[:R, :C]`. +- N-D: show a 2-D plane using default slices, e.g. first index for leading dimensions and bounded rows/columns for the last two dimensions. + +Example defaults: + +```python +max_rows = 20 +max_cols = 10 +``` + +For high-dimensional arrays, display the active slice spec: + +```text +slice: 0, 0, :20, :10 +``` + +Future controls: + +- Edit slice expression. +- Increment/decrement selected axis. +- Page through rows/columns. +- Toggle NumPy-like repr vs table view. 
+ +### CTable Preview + +Default behavior: + +- Show first N rows. +- Show all columns if the count is small. +- Truncate or horizontally scroll if many columns. +- Preserve column names and dtypes. + +Controls: + +- Page down/up by rows. +- Jump to start/end. +- Select visible columns. +- Show one row in detail view. + +Future query support: + +- Simple column projection. +- Row filtering expressions. +- Sorting if supported cheaply. +- Export current view. + +## Keybindings + +Initial keybindings: + +```text +q quit +? show help +enter expand/collapse tree node or open selected item +space expand/collapse tree node +up/down move selection +left/right collapse/expand or move focus +Tab switch focus between tree, metadata, preview +r refresh selected node metadata/preview +PgUp/PgDn page preview rows +Home/End jump within preview +/ search paths, future +s edit slice/query, future +e export selected preview, future +``` + +Keybindings should be shown in a help modal. + +## Error Handling + +The TUI should handle errors gracefully: + +- Invalid path. +- Unsupported store format. +- Corrupt or partially missing nodes. +- Permission errors. +- Preview read failures. +- Unsupported object kinds. + +Errors should appear in a status bar or modal panel, not as raw tracebacks unless debug mode is enabled. + +Optional debug flag: + +```bash +b2view store.b2d --debug +``` + +## Testing Strategy + +Focus tests on non-UI logic first. + +### Unit tests + +- `StoreBrowser` opens `.b2d` and `.b2z` stores. +- Root children are listed correctly. +- Nested children are listed correctly. +- Object kind classification works for groups, arrays, and ctables. +- Metadata extraction returns expected keys. +- Array preview uses bounded slices. +- CTable preview uses bounded row ranges. +- Missing/invalid paths raise controlled exceptions. + +### Rendering tests + +- Metadata dicts render without crashing. +- Array previews render for 0-D, 1-D, 2-D, and N-D arrays. 
+- Table previews render with many columns and many rows. + +### TUI smoke tests + +If Textual testing utilities are available: + +- App starts with a temporary TreeStore. +- Root node appears. +- Expanding a node loads children. +- Selecting an array updates metadata and preview panels. + +## Implementation Milestones + +### Milestone 1: Backend browser prototype + +- Add `StoreBrowser` model. +- Implement opening `.b2d` and `.b2z` stores. +- Implement child listing. +- Implement object kind detection. +- Implement metadata extraction. +- Add unit tests. + +### Milestone 2: Rendering helpers + +- Add Rich renderers for metadata. +- Add array preview renderer. +- Add ctable preview renderer. +- Add tests for renderers. + +### Milestone 3: Minimal Textual app + +- Add CLI entry point. +- Build layout with tree, metadata panel, and preview panel. +- Populate root node. +- Update metadata and preview on selection. +- Add basic keybindings. + +### Milestone 4: Lazy expansion and paging + +- Load tree children on expansion. +- Add preview paging for arrays/tables. +- Add status bar and loading/error indicators. + +### Milestone 5: Polish + +- Add help modal. +- Add path search. +- Add configurable preview row/column limits. +- Improve style/theme. +- Document usage. + +## Documentation + +Add user documentation covering: + +- Installation, including optional TUI dependency if applicable. +- Basic usage. +- Keybindings. +- Safety/read-only behavior. +- Preview limitations. +- Examples with `.b2d` and `.b2z` stores. + +Possible locations: + +```text +doc/b2view.rst +examples/b2view_create_sample_store.py +``` + +## Open Questions + +- Should `textual` be a required dependency or optional extra? +- What is the exact public API for TreeStore child listing and object metadata? +- How should ctable objects be detected robustly? +- Should the first version live inside `blosc2` or as a separate package? +- Should `.b2z` random access limitations affect preview behavior? 
+- What object metadata should be considered stable/public versus implementation detail? +- Is write support ever desired, or should this remain permanently read-only? + +## Recommendation + +Start with a read-only, lazy-loading Textual app and a well-tested `StoreBrowser` abstraction. Keep the first version focused on safe hierarchy browsing, metadata inspection, and small bounded previews. Add richer querying, slicing controls, export, and statistics only after the core browser is reliable. diff --git a/pyproject.toml b/pyproject.toml index 8871e6fd5..f55150c16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,9 +51,11 @@ documentation = "https://www.blosc.org/python-blosc2/python-blosc2.html" [project.optional-dependencies] parquet = ["pyarrow"] +tui = ["textual", "rich"] [project.scripts] parquet-to-blosc2 = "blosc2.cli.parquet_to_blosc2:main" +b2view = "blosc2.b2view.cli:main" [dependency-groups] dev = [ diff --git a/src/blosc2/b2view/__init__.py b/src/blosc2/b2view/__init__.py new file mode 100644 index 000000000..d9215506f --- /dev/null +++ b/src/blosc2/b2view/__init__.py @@ -0,0 +1,5 @@ +"""Terminal viewer for Blosc2 TreeStore bundles.""" + +from blosc2.b2view.model import DataSliceLayout, NodeInfo, ObjectInfo, StoreBrowser + +__all__ = ["DataSliceLayout", "NodeInfo", "ObjectInfo", "StoreBrowser"] diff --git a/src/blosc2/b2view/app.py b/src/blosc2/b2view/app.py new file mode 100644 index 000000000..defe7fa08 --- /dev/null +++ b/src/blosc2/b2view/app.py @@ -0,0 +1,1055 @@ +"""Textual application for b2view.""" + +from __future__ import annotations + +from typing import Any, ClassVar + +from textual.app import App, ComposeResult +from textual.binding import Binding +from textual.containers import Horizontal, Vertical, VerticalScroll +from textual.screen import ModalScreen +from textual.widgets import DataTable, Footer, Header, Input, Static, Tree + +from blosc2.b2view.model import DataSliceLayout, StoreBrowser +from blosc2.b2view.render import 
class BufferedDataTable(DataTable):
    """DataTable that offers every cursor/page key to the owning app first.

    Each override looks up an optional hook on ``self.app`` with ``getattr``
    (so a missing hook is simply treated as "did nothing") and only falls
    back to the stock ``DataTable`` action when the hook declines.
    """

    def action_cursor_down(self) -> None:
        owner = self.app
        if getattr(owner, "_dim_mode", False):
            # In dim mode the arrow adjusts the active dimension instead.
            adjust = getattr(owner, "_dim_adjust", lambda _: None)
            adjust(-1)
            return
        on_last_row = self.cursor_row >= self.row_count - 1
        if on_last_row and getattr(owner, "page_table", lambda _: False)(1):
            return
        super().action_cursor_down()

    def action_cursor_up(self) -> None:
        owner = self.app
        if getattr(owner, "_dim_mode", False):
            adjust = getattr(owner, "_dim_adjust", lambda _: None)
            adjust(1)
            return
        on_first_row = self.cursor_row <= 0
        if on_first_row and getattr(owner, "page_table", lambda _: False)(-1):
            return
        super().action_cursor_up()

    def action_cursor_right(self) -> None:
        owner = self.app
        if getattr(owner, "_dim_mode", False):
            # In dim mode left/right pick which dimension is active.
            move = getattr(owner, "_dim_cursor", lambda _: None)
            move(1)
            return
        on_last_column = self.cursor_column >= len(self.columns) - 1
        if on_last_column and getattr(owner, "page_grid_columns", lambda _: False)(1):
            return
        super().action_cursor_right()

    def action_cursor_left(self) -> None:
        owner = self.app
        if getattr(owner, "_dim_mode", False):
            move = getattr(owner, "_dim_cursor", lambda _: None)
            move(-1)
            return
        on_first_column = self.cursor_column <= 0
        if on_first_column and getattr(owner, "page_grid_columns", lambda _: False)(-1):
            return
        super().action_cursor_left()

    def action_page_down(self) -> None:
        paged = getattr(self.app, "page_table", lambda _: False)(1)
        if not paged:
            super().action_page_down()

    def action_page_up(self) -> None:
        paged = getattr(self.app, "page_table", lambda _: False)(-1)
        if not paged:
            super().action_page_up()

    def action_page_right(self) -> None:
        paged = getattr(self.app, "page_grid_columns", lambda _: False)(1)
        if not paged:
            super().action_page_right()

    def action_page_left(self) -> None:
        paged = getattr(self.app, "page_grid_columns", lambda _: False)(-1)
        if not paged:
            super().action_page_left()

    def action_select_cursor(self) -> None:
        owner = self.app
        if not getattr(owner, "_dim_mode", False):
            super().action_select_cursor()
            return
        # Enter in dim mode toggles the active dimension between fixed/navigable.
        toggle = getattr(owner, "action_dim_toggle_nav", lambda: None)
        toggle()

    def action_scroll_home(self) -> None:
        handled = getattr(self.app, "_grid_col_home", lambda: False)()
        if not handled:
            super().action_scroll_home()

    def action_scroll_end(self) -> None:
        handled = getattr(self.app, "_grid_col_end", lambda: False)()
        if not handled:
            super().action_scroll_end()
def __init__(
    self,
    urlpath: str,
    *,
    start_path: str = "/",
    start_panel: str = "tree",
    preview_rows: int = 20,
    preview_cols: int = 10,
):
    """Store the launch options and reset all browsing state.

    Nothing is opened here; the store itself is opened in ``on_mount``.

    Args:
        urlpath: Location handed to ``StoreBrowser`` when the app mounts.
        start_path: Path inside the store to select on startup; ``"/"``
            (or an empty value) just shows the root.
        start_panel: Name of the panel to focus on startup (``"tree"``,
            ``"meta"``, ``"vlmeta"`` or ``"data"``).
        preview_rows: Row limit passed to ``StoreBrowser.preview`` for
            non-grid objects, and fallback page height before the data
            table has a size.
        preview_cols: Column limit passed to ``StoreBrowser.preview`` for
            non-grid objects, and fallback page width before the data
            table has a size.
    """
    super().__init__()
    self.urlpath = urlpath
    self.start_path = start_path
    self.start_panel = start_panel
    self.preview_rows = preview_rows
    self.preview_cols = preview_cols
    # Created in on_mount, closed in on_unmount.
    self.browser: StoreBrowser | None = None
    # Tree paths whose children have already been added to the tree widget.
    self.loaded_paths: set[str] = set()
    # Path of the node most recently selected in the tree.
    self.selected_path = "/"
    # Rows currently shown in the data table (a slice of table_buffer).
    self.table_page: dict | None = None
    # Larger preview result kept around so nearby pages need no new read;
    # set to None whenever it must be reloaded.
    self.table_buffer: dict | None = None
    # First column of the visible column window for grid previews.
    self.grid_col_start = 0
    # Fixed/navigable dimension layout of the array being previewed, if any.
    self._data_layout: DataSliceLayout | None = None
    # Index of the dimension that dim-mode keys act on.
    self._active_dim = 0
    # True while arrow keys edit dimensions instead of moving the cursor.
    self._dim_mode = False
    # True while the data table is being rebuilt; paging is refused meanwhile.
    self.loading_table_page = False
def _navigate_to_path(self, path: str) -> None:
    """Expand the tree down to *path* and select the node found there.

    The path is split on ``/`` and followed one component at a time,
    loading children lazily at each level. If a component has no matching
    child the root panels are shown instead and the tree keeps focus.

    Args:
        path: Slash-separated path inside the store; empty components
            (leading, trailing or doubled slashes) are ignored.
    """
    tree = self.query_one("#tree", Tree)
    node = tree.root
    for part in (piece for piece in path.split("/") if piece):
        self.load_children(node)
        found = next(
            (
                child
                for child in node.children
                # Labels are built as "<icon> <name>" by load_children and the
                # icons contain no space, so everything after the first space
                # is the child name. Comparing the whole name (rather than a
                # suffix) keeps "data" from also matching a sibling "old data".
                if child.label and child.label.plain.partition(" ")[2] == part
            ),
            None,
        )
        if found is None:
            # Path not found: fall back to the root.
            self.call_after_refresh(self.update_panels, "/")
            tree.focus()
            return
        if found.allow_expand:
            self.load_children(found)
            found.expand()
        node = found

    # Selecting the node fires NodeSelected, which in turn updates the panels;
    # defer it until the freshly added nodes have been rendered.
    def _do_select() -> None:
        tree.select_node(node)
        tree.scroll_to_node(node)
        tree.focus()

    self.call_after_refresh(_do_select)
def on_tree_node_selected(self, event: Tree.NodeSelected) -> None:
    """Show the selected node in the panels and load its children if it has any."""
    node = event.node
    # Nodes without data (the bare root) stand for the store root.
    target = node.data if node.data else "/"
    self.selected_path = target
    self.update_panels(target)
    if not node.allow_expand:
        return
    self.load_children(node)
= False + data_scroll.display = True + self.query_one("#col-scrollbar", Static).display = False + data_header.update("" if header is None else header) + preview.update(body) + self._update_vlmeta(vlmeta_pane, vlmeta_widget, path) + self._reset_panel_scroll() + except Exception as exc: + metadata.update(f"Error reading {path}: {exc}") + data_header.display = False + data_table_row.display = False + data_scroll.display = True + self.query_one("#col-scrollbar", Static).display = False + data_header.update("") + preview.update("") + self._update_vlmeta(vlmeta_pane, vlmeta_widget, None) + self._reset_panel_scroll() + + @staticmethod + def _format_vlmeta_value(value: Any) -> str: + """Format a vlmeta value for display.""" + if isinstance(value, bool): + return str(value) + if isinstance(value, (int, float)): + return str(value) + if isinstance(value, (list, tuple)): + return ", ".join(str(v) for v in value) + if isinstance(value, dict): + return ", ".join(f"{k}: {v}" for k, v in value.items()) + return str(value) + + def _update_vlmeta(self, pane, widget: Static, path: str | None) -> None: + """Populate the vlmeta pane with variable-length metadata.""" + pane.display = True + if path is None or self.browser is None: + widget.update("") + return + try: + info = self.browser.get_info(path) + if info.user_attrs is None: + widget.update("") + elif not info.user_attrs: + widget.update("") + else: + from rich.table import Table + + table = Table(show_header=False, box=None, expand=True) + table.add_column("key", style="bold cyan", no_wrap=True) + table.add_column("value") + for k, v in info.user_attrs.items(): + table.add_row(str(k), self._format_vlmeta_value(v)) + widget.update(table) + except Exception: + widget.update("") + + @staticmethod + def _is_table_preview(data) -> bool: + return isinstance(data, dict) and "data" in data and "columns" in data + + @staticmethod + def _uses_grid_preview(info) -> bool: + # 1D, 2D, 3D+ NDArray/C2Array all use grid preview + return 
def _col_page_size(self) -> int:
    """Estimate how many columns fit across the data table.

    Falls back to ``self.preview_cols`` while the table has not been given
    a usable width yet; otherwise always returns at least 1.
    """
    # Rough layout budget: ~9 characters of float text plus 2 of padding per
    # column, and about 6 characters taken by the row labels.
    column_width = 11
    row_label_width = 6
    available = self.query_one("#data-table", DataTable).size.width
    if available <= 1:
        return self.preview_cols
    return max(1, max(1, available - row_label_width) // column_width)
len(layout.shape) >= 1: + # Use the layout-based preview for all array types (1D+) + # Scalar view (0 navigable dims) always starts at 0 + if not layout.navigable_dims: + start = 0 + self._sync_layout_scroll(start, layout) + data = self.browser.preview( + path, + max_rows=buffer_size, + max_cols=self._col_page_size(), + layout=layout, + ) + else: + # CTable or non-array objects — use legacy preview + data = self.browser.preview( + path, + start=buffer_start, + stop=buffer_start + buffer_size, + max_rows=buffer_size, + max_cols=self._col_page_size(), + col_start=self.grid_col_start, + ) + self.table_buffer = data + data = self._slice_table_buffer(start, page_size) + self.table_page = data + return data + + def _sync_layout_scroll(self, start: int, layout: DataSliceLayout) -> None: + """Update the layout's row/col scroll positions to match the page start.""" + if layout is None: + return + navigable = layout.navigable_dims + if len(navigable) >= 1: + row_dim = navigable[0] + total = layout.shape[row_dim] + layout.row_start = max(0, min(start, total)) + layout.row_stop = min(layout.row_start + self._table_page_size() * 10, total) + if len(navigable) >= 2: + col_dim = navigable[1] + total = layout.shape[col_dim] + layout.col_start = max(0, min(self.grid_col_start, total)) + layout.col_stop = min(layout.col_start + self._col_page_size(), total) + + def _slice_table_buffer(self, start: int, page_size: int) -> dict: + if self.table_buffer is None: + raise RuntimeError("No table buffer loaded") + buffer = self.table_buffer + offset = start - buffer["start"] + available = max(0, buffer["stop"] - start) + count = min(page_size, available) + stop = start + count + return { + "start": start, + "stop": stop, + "nrows": buffer["nrows"], + "columns": buffer["columns"], + "hidden_columns": buffer["hidden_columns"], + "data": {name: values[offset : offset + count] for name, values in buffer["data"].items()}, + **{ + key: buffer[key] + for key in ( + "source_kind", + "shape", + 
def page_table(self, direction: int) -> bool:
    """Show the next (``direction > 0``) or previous page of rows.

    Returns:
        True when a new page was loaded and displayed; False when a page
        load is already in progress, no page is shown, or the current page
        already touches the requested end of the data.
    """
    current = self.table_page
    if self.loading_table_page or current is None:
        return False
    rows_per_page = self._table_page_size()
    forward = direction > 0
    if forward:
        exhausted = current["stop"] >= current["nrows"]
        wanted = current["stop"]
    else:
        exhausted = current["start"] <= 0
        wanted = max(0, current["start"] - rows_per_page)
    if exhausted:
        return False

    data = self._load_table_page(self.selected_path, wanted)
    # Land on the row adjacent to where the cursor left the old page.
    landing_row = 0 if forward else data["stop"] - data["start"] - 1
    self._update_data_table(data, cursor_row=landing_row)
    self._update_data_header(data)
    return True
page["ncols"] + col_start = page["col_start"] + if direction > 0: + if page["col_stop"] >= ncols: + return False + self.grid_col_start = min(ncols - 1, col_start + page_cols) + cursor_col = 0 + else: + if col_start <= 0: + return False + self.grid_col_start = max(0, col_start - page_cols) + cursor_col = page_cols - 1 + self.table_buffer = None + data = self._load_table_page(self.selected_path, page["start"]) + cursor_row = self.query_one("#data-table", DataTable).cursor_row + self._update_data_table(data, cursor_row=cursor_row, cursor_col=cursor_col) + self._update_data_header(data) + return True + + def _grid_col_home(self) -> bool: + if self.table_page is None or self.table_page.get("source_kind") not in ( + "ndarray2d", + "ndarray_slice", + ): + return False + self.grid_col_start = 0 + self.table_buffer = None + data = self._load_table_page(self.selected_path, self.table_page["start"]) + cursor_row = self.query_one("#data-table", DataTable).cursor_row + self._update_data_table(data, cursor_row=cursor_row, cursor_col=0) + self._update_data_header(data) + return True + + def _grid_col_end(self) -> bool: + if self.table_page is None or self.table_page.get("source_kind") not in ( + "ndarray2d", + "ndarray_slice", + ): + return False + page = self.table_page + page_cols = max(1, len(page["columns"])) + self.grid_col_start = max(0, page["ncols"] - page_cols) + self.table_buffer = None + data = self._load_table_page(self.selected_path, page["start"]) + cursor_row = self.query_one("#data-table", DataTable).cursor_row + self._update_data_table(data, cursor_row=cursor_row, cursor_col=page_cols - 1) + self._update_data_header(data) + return True + + def _update_data_header(self, data: dict) -> None: + layout = self._data_layout + header_parts: list[str] = [] + + if layout is not None and len(layout.shape) >= 1: + ndim = len(layout.shape) + for i in range(ndim): + is_active = i == self._active_dim + + if i in layout.fixed_values: + idx = layout.fixed_values[i] + part = 
f"d{i} [{idx}]" + elif i in layout.navigable_dims: + pos = layout.navigable_dims.index(i) + if pos == 0: + s, e = data["start"], data["stop"] + else: + s, e = data.get("col_start", 0), data.get("col_stop", 0) + part = f"d{i}[{s}:{e}]" + else: + part = f"d{i} ?" + + if is_active and self._dim_mode: + part = f"[bold]{part}[/bold]" + header_parts.append(part) + + if self._dim_mode: + header_parts.append("[reverse] DIM MODE [/reverse]") + header_parts.append("←→dim ↑↓val fix/nav exit") + else: + header_parts.append(f"rows {data['start']}:{data['stop']} of {data['nrows']}") + if "col_start" in data: + header_parts.append(f"cols {data['col_start']}:{data['col_stop']} of {data['ncols']}") + + line = ", ".join(header_parts) + if self._dim_mode and layout is not None: + line = f"[reverse]{line}[/reverse]" + self.query_one("#data-header", Static).update(line) + + def _make_global_scrollbar(self, *, start: int, stop: int, total: int, size: int, track: str) -> str: + size = max(1, size) + total = max(1, total) + start = min(max(0, start), total) + stop = min(max(start, stop), total) + visible = max(1, stop - start) + thumb_size = max(1, round(size * min(1.0, visible / total))) + if total <= visible: + thumb_start = 0 + thumb_size = size + else: + thumb_start = round((size - thumb_size) * (start / (total - visible))) + thumb_stop = min(size, thumb_start + thumb_size) + return "".join("█" if thumb_start <= i < thumb_stop else track for i in range(size)) + + def _update_global_row_scrollbar(self, data: dict) -> None: + scrollbar = self.query_one("#row-scrollbar", Static) + height = max(1, self.query_one("#data-table", DataTable).size.height) + bar = self._make_global_scrollbar( + start=int(data["start"]), + stop=int(data["stop"]), + total=int(data["nrows"]), + size=height, + track="│", + ) + scrollbar.update("\n".join(bar)) + + def _update_global_col_scrollbar(self, data: dict) -> None: + scrollbar = self.query_one("#col-scrollbar", Static) + if data.get("source_kind") not in 
("ndarray2d", "ndarray_slice"): + scrollbar.display = False + scrollbar.update("") + return + scrollbar.display = True + width = max(1, self.query_one("#data-table", DataTable).size.width) + scrollbar.update( + self._make_global_scrollbar( + start=int(data["col_start"]), + stop=int(data["col_stop"]), + total=int(data["ncols"]), + size=width, + track="─", + ) + ) + + def _reset_panel_scroll(self) -> None: + for selector in ("#meta-scroll", "#data-scroll"): + self.query_one(selector, VerticalScroll).scroll_home(animate=False) + data_table_row = self.query_one("#data-table-row", Horizontal) + if data_table_row.display: + self.query_one("#data-table", DataTable).scroll_home(animate=False) + if self.table_page is not None: + self._update_global_row_scrollbar(self.table_page) + self._update_global_col_scrollbar(self.table_page) + + def _focusable_panels(self): + data_table_row = self.query_one("#data-table-row", Horizontal) + data_panel = ( + self.query_one("#data-table", DataTable) + if data_table_row.display + else self.query_one("#data-scroll", VerticalScroll) + ) + return [ + self.query_one("#tree", Tree), + self.query_one("#meta-scroll", VerticalScroll), + self.query_one("#vlmeta-scroll", VerticalScroll), + data_panel, + ] + + def _focus_panel(self, step: int) -> None: + panels = self._focusable_panels() + focused = self.focused + try: + index = panels.index(focused) + except ValueError: + index = 0 if step > 0 else len(panels) - 1 + panels[(index + step) % len(panels)].focus() + + def action_focus_next_panel(self) -> None: + self._focus_panel(1) + + def action_focus_previous_panel(self) -> None: + self._focus_panel(-1) + + def _in_data_grid(self) -> bool: + """Return True if focus is inside the data pane and a grid is active.""" + if self.table_page is None: + return False + if not self.query_one("#data-table-row", Horizontal).display: + return False + focused = self.focused + if focused is None: + return False + pane = self.query_one("#data-pane", Vertical) + 
def action_restore_or_refresh(self) -> None:
    """Restore a maximized pane if there is one, otherwise refresh the current node."""
    screen = self.screen
    if screen.maximized is None:
        self.action_refresh()
        return
    screen.maximized = None
    # The data table changes size on restore; reload once layout has settled.
    self.call_after_refresh(self._reload_table_for_current_viewport)
self._update_data_table(data, cursor_row=row - data["start"]) + self._update_data_header(data) + self.query_one("#data-table", DataTable).focus() + + def action_refresh(self) -> None: + tree = self.query_one("#tree", Tree) + node = tree.cursor_node or tree.root + self.loaded_paths.discard(node.data or "/") + node.remove_children() + self.load_children(node) + self.update_panels(node.data or "/") + + def _adjust_fixed_value(self, direction: int) -> None: + """Adjust the fixed value of the active dimension (if it is fixed). + + In DIM mode the value wraps around at boundaries (0 ↔ max). + """ + layout = self._data_layout + if layout is None or self.table_page is None: + return + dim = self._active_dim + if dim not in layout.fixed_values: + return + total = layout.shape[dim] + if total <= 0: + return + current = layout.fixed_values[dim] + if self._dim_mode and total > 1: + # Cycle: up at max → 0, down at 0 → max-1 + new_val = (current + direction) % total + else: + # Clamp at boundaries (normal mode) + if direction > 0: + if current >= total - 1: + return + new_val = current + 1 + else: + if current <= 0: + return + new_val = current - 1 + new_fixed = dict(layout.fixed_values) + new_fixed[dim] = new_val + self._data_layout = layout.copy_with(fixed_values=new_fixed) + self.table_buffer = None + data = self._load_table_page(self.selected_path, self.table_page["start"]) + cursor_row = self.query_one("#data-table", DataTable).cursor_row + self._update_data_table(data, cursor_row=cursor_row) + self._update_data_header(data) + + def _rebuild_layout(self, navigable: list[int]) -> DataSliceLayout: + """Return a copy of the current layout with the given *navigable* dims. + + All non-navigable dimensions are fixed at their previous value (or 0). 
+ """ + layout = self._data_layout + if layout is None: + raise RuntimeError("No layout available") + new_fixed: dict[int, int] = {} + for d in range(len(layout.shape)): + if d in navigable: + continue + if d in layout.fixed_values: + new_fixed[d] = layout.fixed_values[d] + else: + new_fixed[d] = 0 + return layout.copy_with(fixed_values=new_fixed, navigable_dims=navigable) + + def _dim_toggle(self) -> None: + """: key — toggle active dim between fixed and navigable.""" + layout = self._data_layout + if layout is None or self.table_page is None: + return + dim = self._active_dim + if dim not in range(len(layout.shape)): + return + + if dim in layout.navigable_dims: + # Navigable → fixed (at index 0) + new_nav = [d for d in layout.navigable_dims if d != dim] + self._data_layout = self._rebuild_layout(new_nav) + elif dim in layout.fixed_values: + # Fixed → navigable (if room) + if len(layout.navigable_dims) >= 2: + self.notify("At most 2 navigable dimensions are allowed") + return + new_nav = sorted(layout.navigable_dims + [dim]) + self._data_layout = self._rebuild_layout(new_nav) + else: + return + + # Refresh the display (DataTable for 1-2 nav dims, same path for 0) + self.table_buffer = None + data = self._load_table_page(self.selected_path, self.table_page["start"]) + cursor_row = self.query_one("#data-table", DataTable).cursor_row + self._update_data_table(data, cursor_row=cursor_row) + self._update_data_header(data) + + def _dim_cursor(self, direction: int) -> None: + """In dim mode, move the active dimension up (+1) or down (-1).""" + layout = self._data_layout + if layout is None or len(layout.shape) < 1: + return + ndim = len(layout.shape) + self._active_dim = (self._active_dim + direction) % ndim + if self.table_page is not None: + self._update_data_header(self.table_page) + + def _dim_adjust(self, direction: int) -> None: + """In DIM mode, adjust the active dim: fixed value or navigable viewport.""" + layout = self._data_layout + if layout is None or 
self.table_page is None: + return + dim = self._active_dim + if dim in layout.fixed_values: + self._adjust_fixed_value(direction) + elif dim in layout.navigable_dims: + self._scroll_navigable_viewport(direction) + + def _scroll_navigable_viewport(self, direction: int) -> None: + """Shift the viewport of a navigable dimension by one step (wraps).""" + layout = self._data_layout + if layout is None or self.table_page is None: + return + dim = self._active_dim + if dim not in layout.navigable_dims: + return + + pos = layout.navigable_dims.index(dim) + page = self.table_page + total = layout.shape[dim] + + if pos == 0: + # Row navigable dim — shift start by one row (wraps) + new_start = (page["start"] + direction) % total + self.table_buffer = None + data = self._load_table_page(self.selected_path, new_start) + else: + # Column navigable dim — shift col_start by one column (wraps) + new_col = (page["col_start"] + direction) % total + self.grid_col_start = new_col + self.table_buffer = None + data = self._load_table_page(self.selected_path, page["start"]) + + self._update_data_table(data) + self._update_data_header(data) + + def action_dim_cycle(self) -> None: + """d key — toggle DIM mode on/off.""" + if not self._in_data_grid(): + return + layout = self._data_layout + if layout is None or len(layout.shape) < 1: + self.notify("No dimensions to navigate") + return + + self._dim_mode = not self._dim_mode + if self.table_page is not None: + self._update_data_header(self.table_page) + + def action_dim_toggle_nav(self) -> None: + """Enter — toggle active dim between fixed and navigable (in dim mode).""" + if not self._in_data_grid() or not self._dim_mode: + return + self._dim_toggle() + + def action_dim_exit(self) -> None: + """Escape: exit dim mode.""" + if not self._dim_mode: + return + self._dim_mode = False + if self.table_page is not None: + self._update_data_header(self.table_page) + + def action_grid_row_top(self) -> None: + """Jump to the first row of the table.""" + 
if not self._in_data_grid(): + return + self._go_to_row(0) + + def action_grid_row_bottom(self) -> None: + """Jump to the last row of the table.""" + if not self._in_data_grid(): + return + self._go_to_row(self.table_page["nrows"] - 1) diff --git a/src/blosc2/b2view/cli.py b/src/blosc2/b2view/cli.py new file mode 100644 index 000000000..d45f4599e --- /dev/null +++ b/src/blosc2/b2view/cli.py @@ -0,0 +1,50 @@ +"""Command line entry point for b2view.""" + +from __future__ import annotations + +import argparse +import sys + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Browse a Blosc2 TreeStore bundle in the terminal.") + parser.add_argument("urlpath", help="Path to a .b2d directory or .b2z file") + parser.add_argument("path", nargs="?", default="/", help="Optional starting path inside the bundle") + parser.add_argument("--preview-rows", type=int, default=20, help="Maximum preview rows") + parser.add_argument("--preview-cols", type=int, default=10, help="Maximum preview columns") + parser.add_argument( + "--panel", + choices=["tree", "meta", "vlmeta", "data"], + default="tree", + help="Panel to focus on startup", + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + args = build_parser().parse_args(argv) + try: + from blosc2.b2view.app import B2ViewApp + except ImportError as exc: + print( + "b2view requires the optional TUI dependencies. 
Install them with:\n" + "\n" + ' pip install "blosc2[tui]"\n', + file=sys.stderr, + ) + print(f"Original import error: {exc}", file=sys.stderr) + return 2 + + app = B2ViewApp( + args.urlpath, + start_path=args.path, + start_panel=args.panel, + preview_rows=args.preview_rows, + preview_cols=args.preview_cols, + ) + app.run() + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/blosc2/b2view/model.py b/src/blosc2/b2view/model.py new file mode 100644 index 000000000..3732aeb41 --- /dev/null +++ b/src/blosc2/b2view/model.py @@ -0,0 +1,676 @@ +"""Read-only browsing helpers for b2view.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import PurePosixPath +from typing import Any + +import numpy as np + +import blosc2 + + +@dataclass(frozen=True) +class NodeInfo: + """Lightweight description of one TreeStore child.""" + + path: str + name: str + kind: str + has_children: bool + + +@dataclass(frozen=True) +class ObjectInfo: + """Metadata for a TreeStore object or group.""" + + path: str + kind: str + metadata: dict[str, Any] + user_attrs: dict[str, Any] | None = None + + +@dataclass +class DataSliceLayout: + """Describes the fixed/navigable state for slicing an N-D array into a 2-D table view. + + At most 2 dimensions can be navigable (shown as table rows/columns). + All other dimensions must be fixed at a specific index value. + """ + + shape: tuple[int, ...] 
+ fixed_values: dict[int, int] # dim_index → fixed index value + navigable_dims: list[int] # sorted list of up to 2 navigable dim indices + + # Current scroll positions for navigable dims + # (index 0 → rows, index 1 → cols if present) + row_start: int = 0 + row_stop: int = 0 + col_start: int = 0 + col_stop: int = 0 + + @classmethod + def from_shape(cls, shape: tuple[int, ...]) -> DataSliceLayout: + """Create a default layout: leading dims fixed at 0, last up-to-2 dims navigable.""" + ndim = len(shape) + if ndim <= 2: + navigable = list(range(ndim)) + fixed: dict[int, int] = {} + else: + navigable = list(range(ndim - 2, ndim)) + fixed = dict.fromkeys(range(ndim - 2), 0) + return cls( + shape=shape, + fixed_values=fixed, + navigable_dims=navigable, + ) + + def make_slices(self, max_rows: int = 20, max_cols: int = 10) -> tuple[int | slice, ...]: + """Build the tuple of index expressions for slicing into the array. + + Uses *max_rows* and *max_cols* to size the navigable dimensions when + ``row_stop <= row_start`` (i.e. no explicit stop was set). 
+ """ + slices: list[int | slice] = [] + for i in range(len(self.shape)): + if i in self.fixed_values: + slices.append(self.fixed_values[i]) + elif self.navigable_dims and i == self.navigable_dims[0]: + start = max(0, min(self.row_start, self.shape[i])) + if self.row_stop > self.row_start: + stop = min(self.row_stop, self.shape[i]) + else: + stop = min(start + max_rows, self.shape[i]) + slices.append(slice(start, stop)) + elif len(self.navigable_dims) > 1 and i == self.navigable_dims[1]: + start = max(0, min(self.col_start, self.shape[i])) + if self.col_stop > self.col_start: + stop = min(self.col_stop, self.shape[i]) + else: + stop = min(start + max_cols, self.shape[i]) + slices.append(slice(start, stop)) + else: + slices.append(slice(0, self.shape[i])) + return tuple(slices) + + def copy_with( + self, + *, + fixed_values: dict[int, int] | None = None, + navigable_dims: list[int] | None = None, + row_start: int | None = None, + row_stop: int | None = None, + col_start: int | None = None, + col_stop: int | None = None, + ) -> DataSliceLayout: + """Return a new layout with specified fields overridden.""" + return DataSliceLayout( + shape=self.shape, + fixed_values=self.fixed_values if fixed_values is None else fixed_values, + navigable_dims=list(self.navigable_dims) if navigable_dims is None else navigable_dims, + row_start=self.row_start if row_start is None else row_start, + row_stop=self.row_stop if row_stop is None else row_stop, + col_start=self.col_start if col_start is None else col_start, + col_stop=self.col_stop if col_stop is None else col_stop, + ) + + def total_for_dim(self, dim: int) -> int: + """Return the total size of *dim*.""" + if 0 <= dim < len(self.shape): + return self.shape[dim] + return 0 + + +class StoreBrowser: + """Small, read-only adapter used by the b2view UI. + + The adapter intentionally exposes a narrow API so the TUI does not depend + on TreeStore internals. 
It accepts either a TreeStore hierarchy or a + single top-level Blosc2 object (for example a standalone CTable). It + performs bounded previews only; callers must explicitly request pages or + slices. + """ + + def __init__(self, urlpath: str): + self.urlpath = urlpath + self.store = blosc2.open(urlpath, mode="r") + self.is_tree = isinstance(self.store, blosc2.TreeStore) + + def close(self) -> None: + close = getattr(self.store, "close", None) + if close is not None: + close() + + def __enter__(self) -> StoreBrowser: + return self + + def __exit__(self, exc_type, exc, tb) -> None: + self.close() + + @staticmethod + def normalize_path(path: str) -> str: + """Return an absolute TreeStore path.""" + if not path: + return "/" + if not path.startswith("/"): + path = "/" + path + normalized = str(PurePosixPath(path)) + return "/" if normalized == "." else normalized + + def list_children(self, path: str = "/") -> list[NodeInfo]: + """Return direct children for *path*.""" + path = self.normalize_path(path) + if not self.is_tree: + self._check_root_path(path) + return [] + + children = [] + for child_path in self.store.get_children(path): + descendants = self.store.get_descendants(child_path) + has_children = bool(descendants) + kind = "group" if has_children else self.kind(child_path) + children.append( + NodeInfo( + path=child_path, + name=child_path.rsplit("/", 1)[-1] or "/", + kind=kind, + has_children=has_children, + ) + ) + return children + + def kind(self, path: str) -> str: + """Classify a browser path.""" + path = self.normalize_path(path) + if not self.is_tree: + self._check_root_path(path) + return object_kind(self.store) + if path == "/" or self.store.get_descendants(path): + return "group" + obj = self.store[path] + return object_kind(obj) + + def get_info(self, path: str) -> ObjectInfo: + """Return metadata for *path*.""" + path = self.normalize_path(path) + kind = self.kind(path) + if kind == "group": + metadata: dict[str, Any] = { + "type": "TreeStore 
group", + "children": len(self.store.get_children(path)), + "descendants": len(self.store.get_descendants(path)), + } + user_attrs = self._vlmeta_dict(self.store.vlmeta) + return ObjectInfo(path=path, kind=kind, metadata=metadata, user_attrs=user_attrs) + + obj = self._get_object(path) + metadata = object_metadata(obj) + metadata.setdefault("type", type(obj).__name__) + user_attrs = self._vlmeta_dict(getattr(obj, "vlmeta", None)) + if user_attrs is None and self.is_tree: + user_attrs = self._vlmeta_dict(self.store.vlmeta) + return ObjectInfo(path=path, kind=kind, metadata=metadata, user_attrs=user_attrs) + + def preview( + self, + path: str, + *, + start: int = 0, + stop: int | None = None, + columns: list[str] | None = None, + slices: tuple[Any, ...] | None = None, + max_rows: int = 20, + max_cols: int = 10, + col_start: int = 0, + slice_indices: list[int] | None = None, + layout: DataSliceLayout | None = None, + ) -> Any: + """Return a bounded data preview for *path*. + + For N-D arrays (N >= 3) a *layout* may be provided instead of the + legacy *slice_indices*, *start*/*stop*, *col_start* parameters. 
+ """ + path = self.normalize_path(path) + obj = self._get_object(path) + kind = object_kind(obj) + if kind in {"ndarray", "c2array"}: + shape = tuple(getattr(obj, "shape", ()) or ()) + if slices is None: + if layout is not None: + return preview_array_from_layout( + obj, layout=layout, max_rows=max_rows, max_cols=max_cols + ) + if len(shape) >= 3: + return preview_array_nd_slice( + obj, + slice_indices=slice_indices, + start=start, + stop=stop, + col_start=col_start, + max_cols=max_cols, + ) + if len(shape) == 2: + stop = min(start + max_rows, shape[0]) if stop is None else stop + return preview_array_2d( + obj, start=start, stop=stop, col_start=col_start, max_cols=max_cols + ) + if len(shape) == 1: + stop = min(start + max_rows, shape[0]) if stop is None else stop + return preview_array_1d(obj, start=start, stop=stop) + return preview_array(obj, slices=slices, max_rows=max_rows, max_cols=max_cols) + if kind == "ctable": + stop = min(start + max_rows, len(obj)) if stop is None else stop + return preview_ctable(obj, start=start, stop=stop, columns=columns, max_cols=max_cols) + if kind == "schunk": + return {"message": "SChunk byte preview is not implemented yet."} + return {"message": f"Preview is not supported for {kind!r} objects."} + + def _get_object(self, path: str) -> Any: + """Return the object represented by *path*.""" + path = self.normalize_path(path) + if self.is_tree: + return self.store[path] + self._check_root_path(path) + return self.store + + @staticmethod + def _check_root_path(path: str) -> None: + if path != "/": + raise KeyError(f"Standalone objects only expose the root path '/', got {path!r}") + + _INTERNAL_VLMETA_KEYS = frozenset( + { + "kind", + "version", + "schema", + "n_rows", + "value_epoch", + "computed_columns", + "materialized_columns", + } + ) + + @staticmethod + def _vlmeta_dict(vlmeta) -> dict[str, Any] | None: + if vlmeta is None: + return None + try: + data = vlmeta[:] + except Exception: + try: + data = {name: vlmeta[name] for 
name in vlmeta} + except Exception: + return None + if data is None: + return None + # Filter out internal blosc2 metadata keys (schema, version, etc.) + return {k: v for k, v in data.items() if k not in StoreBrowser._INTERNAL_VLMETA_KEYS} + + +def object_kind(obj: Any) -> str: + """Return a stable b2view kind string for *obj*.""" + if isinstance(obj, blosc2.TreeStore): + return "group" + if isinstance(obj, blosc2.NDArray): + return "ndarray" + if isinstance(obj, blosc2.CTable): + return "ctable" + if hasattr(blosc2, "C2Array") and isinstance(obj, blosc2.C2Array): + return "c2array" + if isinstance(obj, blosc2.SChunk): + return "schunk" + return "unknown" + + +def object_metadata(obj: Any) -> dict[str, Any]: + """Extract lightweight metadata from a supported object.""" + kind = object_kind(obj) + if kind in {"ndarray", "c2array"}: + return { + "shape": getattr(obj, "shape", None), + "ndim": len(getattr(obj, "shape", ()) or ()), + "dtype": str(getattr(obj, "dtype", None)), + "chunks": getattr(obj, "chunks", None), + "blocks": getattr(obj, "blocks", None), + "nbytes": getattr(obj, "nbytes", None), + "cbytes": getattr(obj, "cbytes", None), + } + if kind == "ctable": + try: + return dict(obj.info_items) + except Exception: + return { + "rows": getattr(obj, "nrows", len(obj)), + "columns": getattr(obj, "ncols", len(getattr(obj, "col_names", []))), + "schema": { + name: str(getattr(obj[name], "dtype", None)) for name in getattr(obj, "col_names", []) + }, + } + if kind == "schunk": + return { + "chunks": getattr(obj, "nchunks", None), + "nbytes": getattr(obj, "nbytes", None), + "cbytes": getattr(obj, "cbytes", None), + } + return {"repr": repr(obj)} + + +def preview_array_from_layout( + obj: Any, + *, + layout: DataSliceLayout, + max_rows: int = 20, + max_cols: int = 10, +) -> dict[str, Any]: + """Return a bounded preview for an N-D array using a *layout*. + + The layout describes which dimensions are fixed (slider) vs navigable + (table rows/columns). 
At most 2 navigable dimensions are allowed. + """ + shape = tuple(getattr(obj, "shape", ()) or ()) + if len(shape) != len(layout.shape): + raise ValueError(f"Layout shape {layout.shape} does not match object shape {shape}") + ndim = len(shape) + navigable = layout.navigable_dims + + # Determine row and col navigable dims + row_dim = navigable[0] if len(navigable) >= 1 else None + col_dim = navigable[1] if len(navigable) >= 2 else None + + # Page sizes + nrows = shape[row_dim] if row_dim is not None else 1 + ncols = shape[col_dim] if col_dim is not None else 1 + + # Clamp fixed values + fixed_values = {} + for d, val in layout.fixed_values.items(): + total = shape[d] + fixed_values[d] = max(0, min(val, total - 1)) if total > 0 else 0 + + # Ensure every non-navigable dim is fixed at 0 (safety catch) + for i in range(ndim): + if i not in fixed_values and (row_dim is None or i != row_dim) and (col_dim is None or i != col_dim): + fixed_values[i] = 0 + + # Build slicing tuple + idx: list[int | slice] = [] + for i in range(ndim): + if i in fixed_values: + idx.append(fixed_values[i]) + elif row_dim is not None and i == row_dim: + start = max(0, min(layout.row_start, nrows)) + stop = min(max(start, start + max_rows), nrows) + idx.append(slice(start, stop)) + elif col_dim is not None and i == col_dim: + col_start = max(0, min(layout.col_start, ncols)) + col_stop = min(col_start + max_cols, ncols) + idx.append(slice(col_start, col_stop)) + else: + # Shouldn't happen: non-navigable dims are caught above + idx.append(slice(0, shape[i])) + + values = np.asarray(obj[tuple(idx)]) + + # Build column labels — match data keys below + if col_dim is not None: + col_start = max(0, min(layout.col_start, ncols)) + col_stop = min(col_start + max_cols, ncols) + columns = [str(i) for i in range(col_start, col_stop)] + elif row_dim is not None: + columns = ["value"] + else: + columns = ["value"] + + # Extract 2-D data from result + data: dict[str, Any] = {} + if row_dim is not None and 
col_dim is not None: + # 2-D navigable → 2-D table + col_start = max(0, min(layout.col_start, ncols)) + col_stop = min(col_start + max_cols, ncols) + for i, c in enumerate(range(col_start, col_stop)): + data[str(c)] = values[:, i] + elif row_dim is not None: + # Only rows navigable → 1-D view + data["value"] = values + else: + # 0 navigable → scalar + data["value"] = np.asarray([values.item()]) if np.ndim(values) == 0 else np.asarray([values]) + + row_start_val = max(0, min(layout.row_start, nrows)) if row_dim is not None else 0 + row_stop_val = min(row_start_val + max_rows, nrows) if row_dim is not None else 1 + col_start_val = max(0, min(layout.col_start, ncols)) if col_dim is not None else 0 + col_stop_val = min(col_start_val + max_cols, ncols) if col_dim is not None else 1 + + result: dict[str, Any] = { + "start": row_start_val, + "stop": row_stop_val, + "nrows": nrows, + "columns": columns, + "hidden_columns": max(0, ncols - (col_stop_val - col_start_val)), + "data": data, + "source_kind": "ndarray_slice", + "shape": shape, + "col_start": col_start_val, + "col_stop": col_stop_val, + "ncols": ncols, + "layout": layout, + "slice_indices": [fixed_values.get(i, 0) for i in range(min(ndim - 2, ndim))], + "n_slices_per_dim": [shape[i] for i in range(ndim) if i in fixed_values], + } + # Keep legacy fields for backward compat + result["slice_indices"] = [fixed_values.get(i, 0) for i in range(ndim) if i in fixed_values] + result["n_slices_per_dim"] = [shape[i] for i in range(ndim) if i in fixed_values] + return result + + +def preview_array_nd_slice( + obj: Any, + *, + slice_indices: list[int] | None = None, + start: int = 0, + stop: int = 20, + col_start: int = 0, + max_cols: int = 10, +) -> dict[str, Any]: + """Return a bounded 2-D slice preview for N-D arrays (N >= 3).""" + shape = tuple(getattr(obj, "shape", ()) or ()) + ndim = len(shape) + if ndim < 3: + raise ValueError(f"Expected an N-D array with N >= 3, got shape {shape!r}") + n_leading = ndim - 2 + 
n_slices_per_dim = list(shape[:n_leading]) + if slice_indices is None or len(slice_indices) != n_leading: + slice_indices = [0] * n_leading + # Clamp + slice_indices = [ + min(max(0, idx), n_slices_per_dim[i] - 1) if n_slices_per_dim[i] > 0 else 0 + for i, idx in enumerate(slice_indices) + ] + nrows, ncols = shape[-2], shape[-1] + if stop is None: + stop = min(start + 20, nrows) + start = max(0, min(start, nrows)) + stop = min(max(start, stop), nrows) + col_start = max(0, min(col_start, ncols)) + col_stop = min(col_start + max_cols, ncols) + columns = [str(i) for i in range(col_start, col_stop)] + idx = tuple(slice_indices) + (slice(start, stop), slice(col_start, col_stop)) + values = np.asarray(obj[idx]) + data = {str(col): values[:, i] for i, col in enumerate(range(col_start, col_stop))} + return { + "start": start, + "stop": stop, + "nrows": nrows, + "columns": columns, + "hidden_columns": max(0, ncols - (col_stop - col_start)), + "data": data, + "source_kind": "ndarray_slice", + "shape": shape, + "col_start": col_start, + "col_stop": col_stop, + "ncols": ncols, + "slice_indices": slice_indices, + "n_slices_per_dim": n_slices_per_dim, + } + + +def preview_array_2d( + obj: Any, *, start: int = 0, stop: int = 20, col_start: int = 0, max_cols: int = 10 +) -> dict[str, Any]: + """Return a bounded row/column preview for a 2-D array.""" + shape = tuple(getattr(obj, "shape", ()) or ()) + if len(shape) != 2: + raise ValueError(f"Expected a 2-D array, got shape {shape!r}") + nrows, ncols = shape + start = max(0, min(start, nrows)) + stop = min(max(start, stop), nrows) + col_start = max(0, min(col_start, ncols)) + col_stop = min(col_start + max_cols, ncols) + columns = [str(i) for i in range(col_start, col_stop)] + values = np.asarray(obj[(slice(start, stop), slice(col_start, col_stop))]) + data = {str(col): values[:, i] for i, col in enumerate(range(col_start, col_stop))} + return { + "start": start, + "stop": stop, + "nrows": nrows, + "columns": columns, + 
"hidden_columns": max(0, ncols - (col_stop - col_start)), + "data": data, + "source_kind": "ndarray2d", + "shape": shape, + "col_start": col_start, + "col_stop": col_stop, + "ncols": ncols, + } + + +def preview_array_1d(obj: Any, *, start: int = 0, stop: int = 20, **kwargs) -> dict[str, Any]: + """Return a bounded row preview for a 1-D array.""" + shape = tuple(getattr(obj, "shape", ()) or ()) + if len(shape) != 1: + raise ValueError(f"Expected a 1-D array, got shape {shape!r}") + nrows = shape[0] + start = max(0, min(start, nrows)) + stop = min(max(start, stop), nrows) + data = { + "value": np.asarray(obj[start:stop]), + } + return { + "start": start, + "stop": stop, + "nrows": nrows, + "columns": ["value"], + "hidden_columns": 0, + "data": data, + "source_kind": "ndarray1d", + "shape": shape, + } + + +def preview_array( + obj: Any, *, slices: tuple[Any, ...] | None = None, max_rows: int = 20, max_cols: int = 10 +): + """Return a small NumPy preview from an NDArray/C2Array-like object.""" + shape = tuple(getattr(obj, "shape", ()) or ()) + if slices is None: + if len(shape) == 0: + slices = () + elif len(shape) == 1: + slices = (slice(0, min(shape[0], max_rows)),) + elif len(shape) == 2: + slices = (slice(0, min(shape[0], max_rows)), slice(0, min(shape[1], max_cols))) + else: + leading = tuple(0 for _ in shape[:-2]) + slices = leading + ( + slice(0, min(shape[-2], max_rows)), + slice(0, min(shape[-1], max_cols)), + ) + return np.asarray(obj[slices]) + + +def preview_ctable( + obj: Any, + *, + start: int = 0, + stop: int = 20, + columns: list[str] | None = None, + max_cols: int = 10, + include_expensive: bool = False, +) -> dict[str, Any]: + """Return a bounded column-oriented preview from a CTable. + + Complex nested/list/object columns may require one variable-length block + read per row. By default, keep table navigation responsive by showing a + placeholder for those columns instead of decoding them eagerly. 
+ """ + all_columns = list(getattr(obj, "col_names", [])) + visible_columns = all_columns if columns is None else [name for name in columns if name in all_columns] + hidden_columns = max(0, len(visible_columns) - max_cols) + visible_columns = visible_columns[:max_cols] + start = max(0, start) + stop = min(max(start, stop), len(obj)) + data = {} + skipped_columns = {} + nrows = stop - start + for name in visible_columns: + if not include_expensive and is_expensive_ctable_column(obj, name): + label = ctable_column_label(obj, name) + placeholder = f"<{label}; skipped>" + data[name] = np.full(nrows, placeholder, dtype=object) + skipped_columns[name] = label + else: + data[name] = safe_asarray(obj[name][start:stop]) + return { + "start": start, + "stop": stop, + "nrows": len(obj), + "columns": visible_columns, + "hidden_columns": hidden_columns, + "skipped_columns": skipped_columns, + "data": data, + } + + +def is_expensive_ctable_column(obj: Any, name: str) -> bool: + """Return whether previewing a CTable column is likely row-by-row expensive.""" + try: + schema = obj.schema_dict() + except Exception: + return False + for column in schema.get("columns", []): + if column.get("name") != name: + continue + return column.get("kind") in {"list", "struct", "object", "ndarray"} + return False + + +def ctable_column_label(obj: Any, name: str) -> str: + """Return a compact schema label for *name*.""" + try: + schema = dict(obj.info_items).get("schema", {}) + label = schema.get(name) + if label is not None: + return str(label) + except Exception: + pass + try: + for column in obj.schema_dict().get("columns", []): + if column.get("name") == name: + return str(column.get("kind", "complex")) + except Exception: + pass + return "complex" + + +def safe_asarray(values: Any) -> np.ndarray: + """Convert preview values to an array, preserving ragged/nested values. + + NumPy 2 raises for ragged nested sequences unless ``dtype=object`` is + requested explicitly. 
CTable columns can legitimately contain list/struct + values, so previews must keep those as object cells instead of failing. + """ + try: + return np.asarray(values) + except ValueError: + return np.asarray(values, dtype=object) diff --git a/src/blosc2/b2view/render.py b/src/blosc2/b2view/render.py new file mode 100644 index 000000000..207b9c110 --- /dev/null +++ b/src/blosc2/b2view/render.py @@ -0,0 +1,140 @@ +"""Rich render helpers for b2view.""" + +from __future__ import annotations + +from pprint import pformat +from textwrap import wrap +from typing import Any + +import numpy as np + + +def make_metadata_renderable(info): + """Return a Rich renderable for ObjectInfo metadata.""" + from rich.table import Table + + table = Table(show_header=False, box=None, expand=True) + table.add_column("key", style="bold cyan", no_wrap=True) + table.add_column("value") + table.add_row("path", info.path) + table.add_row("kind", info.kind) + for key, value in info.metadata.items(): + table.add_row(str(key), _format_metadata_value(value)) + return table + + +def make_preview_renderable(preview: Any): + """Return a single Rich renderable for a preview object.""" + _, body = make_preview_renderables(preview) + return body + + +def make_preview_renderables(preview: Any): + """Return ``(header, body)`` Rich renderables for a preview object. + + CTable previews get a separate header renderable so the UI can keep column + titles fixed while only the row body scrolls. Other preview kinds return + ``None`` for the header. 
+ """ + from rich.pretty import Pretty + from rich.table import Table + from rich.text import Text + + if isinstance(preview, np.ndarray): + return None, Text(np.array2string(preview, threshold=200, edgeitems=5), no_wrap=False) + + if isinstance(preview, dict) and "data" in preview and "columns" in preview: + widths = _preview_column_widths(preview) + header = _make_ctable_header(preview, widths) + body = Table(expand=True, show_header=False, show_lines=False) + for name in preview["columns"]: + body.add_column(name, width=widths[name], overflow="fold") + nrows = preview["stop"] - preview["start"] + for i in range(nrows): + body.add_row(*[_format_cell(preview["data"][name][i]) for name in preview["columns"]]) + if preview.get("hidden_columns", 0): + body.caption = f"{preview['hidden_columns']} columns hidden" + return header, body + + if isinstance(preview, dict) and "message" in preview: + return None, Text(str(preview["message"])) + + return None, Pretty(preview) + + +def _make_ctable_header(preview: dict[str, Any], widths: dict[str, int]): + from rich.align import Align + from rich.console import Group + from rich.text import Text + + title = Align.center(Text(f"rows {preview['start']}:{preview['stop']} of {preview['nrows']}")) + wrapped_columns = [] + for name in preview["columns"]: + width = widths[name] + parts = wrap(name, width=width, break_long_words=True, break_on_hyphens=False) or [""] + wrapped_columns.append(parts) + height = max(len(parts) for parts in wrapped_columns) if wrapped_columns else 0 + lines = [] + for row in range(height): + cells = [] + for name, parts in zip(preview["columns"], wrapped_columns, strict=True): + width = widths[name] + text = parts[row] if row < len(parts) else "" + cells.append(f" {text:<{width}} ") + lines.append("│".join(cells)) + return Group(title, Text("\n".join(lines))) + + +def _preview_column_widths(preview: dict[str, Any], *, max_width: int = 40) -> dict[str, int]: + widths = {} + nrows = preview["stop"] - 
preview["start"] + for name in preview["columns"]: + values = preview["data"][name] + width = len(name) + for i in range(nrows): + width = max(width, min(max_width, len(_format_cell(values[i])))) + widths[name] = min(max_width, max(4, width)) + return widths + + +def _format_metadata_value(value: Any) -> str: + if isinstance(value, dict): + return "\n".join(f"{key}: {val}" for key, val in value.items()) or "{}" + if isinstance(value, (list, tuple)): + return repr(value) + return str(value) + + +def format_cell(value: Any) -> str: + if isinstance(value, np.generic): + value = value.item() + if isinstance(value, np.ndarray): + text = np.array2string(value, threshold=20, formatter={"float_kind": lambda x: _fmt_float(x)}) + elif isinstance(value, (list, tuple, dict)): + text = pformat(value, compact=True, width=80) + elif isinstance(value, float): + text = _fmt_float(value) + else: + text = str(value) + text = " ".join(text.splitlines()) + return text if len(text) <= 200 else text[:197] + "..." 
+ + +def _fmt_float(x: float) -> str: + """Show floats with a fixed width of 9 characters and up to 6 decimal digits, right-aligned.""" + if abs(x) >= 1e9 or (abs(x) < 1e-6 and abs(x) > 0): + return f"{x: .6e}" + if abs(x) == 0: + return " 0.0" + abs_x = abs(x) + # Choose format to keep total width ~9 chars including leading space for sign + if abs_x < 10: + return f"{x:9.6f}"[:9] + if abs_x < 1000: + return f"{x:9.3f}"[:9] + if abs_x < 1e6: + return f"{x:9.0f}"[:9] + return f"{x:9.0f}"[:9] + + +_format_cell = format_cell diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index bddb5a0a6..7ef078cdd 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -618,6 +618,8 @@ cdef extern from "b2nd.h": const int64_t *stop) int b2nd_from_cbuffer(b2nd_context_t *ctx, b2nd_array_t **array, void *buffer, int64_t buffersize) int b2nd_to_cbuffer(b2nd_array_t *array, void *buffer, int64_t buffersize) + int b2nd_get_sparse_cbuffer(b2nd_array_t *array, int64_t ncoords, const int64_t *coords, + void *buffer, int64_t buffersize) int b2nd_from_cframe(uint8_t *cframe, int64_t cframe_len, c_bool copy, b2nd_array_t ** array); int b2nd_to_cframe(const b2nd_array_t *array, uint8_t ** cframe, int64_t *cframe_len, c_bool *needs_free); @@ -774,8 +776,22 @@ ctypedef struct udf_udata: int64_t chunks_in_array[B2ND_MAX_DIM] int64_t blocks_in_chunk[B2ND_MAX_DIM] +ctypedef enum: + ME_CACHE_EMPTY + ME_CACHE_LOADING + ME_CACHE_READY + ME_CACHE_ERROR + +ctypedef struct me_input_cache_s: + uint8_t* data + int64_t nchunk + int state + PyThread_type_lock state_lock + PyThread_type_lock ready_lock + ctypedef struct me_udata: b2nd_array_t** inputs + me_input_cache_s* input_chunk_caches int ninputs me_eval_params* eval_params b2nd_array_t* array @@ -811,14 +827,9 @@ cdef _check_comp_length(comp_name, comp_len): blosc2_init() -cdef PyThread_type_lock chunk_cache_lock = PyThread_allocate_lock() -if chunk_cache_lock == NULL: - raise MemoryError("Could not allocate chunk 
cache lock") @atexit.register def destroy(): - if chunk_cache_lock != NULL: - PyThread_free_lock(chunk_cache_lock) blosc2_destroy() @@ -2300,6 +2311,9 @@ cdef class SChunk: cdef udf_udata* udf_data cdef user_filters_udata* udata cdef mm_udata* mm_data + cdef me_udata* me_data + cdef me_input_cache_s* input_cache + cdef int i if func_name is not None and func_name in blosc2.prefilter_funcs: del blosc2.prefilter_funcs[func_name] @@ -2309,12 +2323,22 @@ cdef class SChunk: if self.schunk.storage.cparams.preparams != NULL: me_data = self.schunk.storage.cparams.preparams.user_data if me_data != NULL: - if me_data.inputs != NULL: + if me_data.input_chunk_caches != NULL: for i in range(me_data.ninputs): - if me_data.inputs[i].chunk_cache.data != NULL: - free(me_data.inputs[i].chunk_cache.data) - me_data.inputs[i].chunk_cache.data = NULL - me_data.inputs[i].chunk_cache.nchunk = -1 + input_cache = &me_data.input_chunk_caches[i] + if input_cache.data != NULL: + free(input_cache.data) + input_cache.data = NULL + input_cache.nchunk = -1 + input_cache.state = ME_CACHE_EMPTY + if input_cache.state_lock != NULL: + PyThread_free_lock(input_cache.state_lock) + input_cache.state_lock = NULL + if input_cache.ready_lock != NULL: + PyThread_free_lock(input_cache.ready_lock) + input_cache.ready_lock = NULL + free(me_data.input_chunk_caches) + if me_data.inputs != NULL: free(me_data.inputs) if me_data.miniexpr_handle != NULL: # XXX do we really need the conditional? me_free(me_data.miniexpr_handle) @@ -2362,15 +2386,11 @@ cdef class SChunk: if self.schunk.storage.dparams.postfilter != NULL: self.remove_postfilter(func_name=None, _new_ctx=False) - # Release the GIL while freeing the C-Blosc2 super-chunk. - # blosc2_schunk_free -> blosc2_free_ctx -> release_threadpool - # joins worker pthreads; holding the GIL here can cause hangs - # when thousands of SChunks are finalized at once (e.g. during - # gc.collect() in Python 3.14+ where gen-2 threshold is 0). 
+ # Free the C-Blosc2 super-chunk with the GIL held so threadpool + # teardown cannot race with active miniexpr workers. schunk_ptr = self.schunk self.schunk = NULL - with nogil: - blosc2_schunk_free(schunk_ptr) + blosc2_schunk_free(schunk_ptr) # postfilter @@ -2433,6 +2453,8 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock, cdef uint8_t* src cdef uint8_t* chunk cdef c_bool needs_free + cdef uint8_t* loaded_chunk + cdef me_input_cache_s* input_cache cdef int32_t chunk_nbytes, chunk_cbytes, block_nbytes cdef int start, blocknitems, expected_blocknitems cdef int64_t valid_nitems @@ -2467,30 +2489,58 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock, if ndarr.sc.storage.urlpath == NULL: src = ndarr.sc.data[nchunk] else: - # We need to get the chunk from disk/network - if ndarr.chunk_cache.nchunk != nchunk: - PyThread_acquire_lock(chunk_cache_lock, 1) - # We need to check again, as another thread may have updated the cache already - if ndarr.chunk_cache.nchunk != nchunk: - if ndarr.chunk_cache.data != NULL: - free(ndarr.chunk_cache.data) - ndarr.chunk_cache.data = NULL - rc = blosc2_schunk_get_chunk(ndarr.sc, nchunk, &chunk, &needs_free) - if rc < 0: - PyThread_release_lock(chunk_cache_lock) - raise ValueError("miniexpr: error getting chunk") - if not needs_free: - src = malloc(rc) - if src == NULL: - PyThread_release_lock(chunk_cache_lock) - raise MemoryError("miniexpr: cannot allocate chunk copy") - memcpy(src, chunk, rc) - else: - src = chunk - ndarr.chunk_cache.data = src - ndarr.chunk_cache.nchunk = nchunk - PyThread_release_lock(chunk_cache_lock) - src = ndarr.chunk_cache.data + input_cache = &udata.input_chunk_caches[i] + if input_cache.state_lock == NULL or input_cache.ready_lock == NULL: + raise MemoryError("miniexpr: cache locks not assigned") + while True: + PyThread_acquire_lock(input_cache.state_lock, 1) + if input_cache.state == ME_CACHE_READY and input_cache.nchunk == nchunk and input_cache.data != NULL: + 
src = input_cache.data + PyThread_release_lock(input_cache.state_lock) + break + if input_cache.state == ME_CACHE_ERROR and input_cache.nchunk == nchunk: + PyThread_release_lock(input_cache.state_lock) + raise ValueError("miniexpr: error getting chunk") + if input_cache.state == ME_CACHE_LOADING: + PyThread_release_lock(input_cache.state_lock) + PyThread_acquire_lock(input_cache.ready_lock, 1) + PyThread_release_lock(input_cache.ready_lock) + continue + PyThread_acquire_lock(input_cache.ready_lock, 1) + input_cache.state = ME_CACHE_LOADING + input_cache.nchunk = nchunk + PyThread_release_lock(input_cache.state_lock) + + rc = blosc2_schunk_get_chunk(ndarr.sc, nchunk, &chunk, &needs_free) + if rc < 0: + PyThread_acquire_lock(input_cache.state_lock, 1) + input_cache.state = ME_CACHE_ERROR + PyThread_release_lock(input_cache.state_lock) + PyThread_release_lock(input_cache.ready_lock) + raise ValueError("miniexpr: error getting chunk") + + if not needs_free: + loaded_chunk = malloc(rc) + if loaded_chunk == NULL: + PyThread_acquire_lock(input_cache.state_lock, 1) + input_cache.state = ME_CACHE_ERROR + PyThread_release_lock(input_cache.state_lock) + PyThread_release_lock(input_cache.ready_lock) + raise MemoryError("miniexpr: cannot allocate chunk copy") + memcpy(loaded_chunk, chunk, rc) + else: + loaded_chunk = chunk + + PyThread_acquire_lock(input_cache.state_lock, 1) + if input_cache.data != NULL: + free(input_cache.data) + input_cache.data = loaded_chunk + input_cache.nchunk = nchunk + input_cache.state = ME_CACHE_READY + src = input_cache.data + PyThread_release_lock(input_cache.state_lock) + PyThread_release_lock(input_cache.ready_lock) + break rc = blosc2_cbuffer_sizes(src, &chunk_nbytes, &chunk_cbytes, &block_nbytes) if rc < 0: raise ValueError("miniexpr: error getting cbuffer sizes") @@ -3086,7 +3136,7 @@ def open(urlpath, mode, offset, **kwargs): if is_ndarray: res = blosc2.NDArray(_schunk=PyCapsule_New(array.sc, "blosc2_schunk*", NULL), - 
_array=PyCapsule_New(array, "b2nd_array_t*", NULL)) + _array=PyCapsule_New(array, "b2nd_array_t*", NULL), mode=mode) if cparams is not None: res.schunk.cparams = cparams if isinstance(cparams, blosc2.CParams) else blosc2.CParams(**cparams) else: @@ -3560,6 +3610,23 @@ cdef class NDArray: return arr + def get_sparse_numpy(self, arr, coords): + cdef np.ndarray[np.int64_t, ndim=1, mode="c"] coords_ = np.ascontiguousarray(coords, dtype=np.int64) + cdef Py_buffer view + cdef int64_t ncoords = coords_.shape[0] + cdef int rc + + PyObject_GetBuffer(arr, &view, PyBUF_SIMPLE) + if view.len < ncoords * self.array.sc.typesize: + PyBuffer_Release(&view) + raise ValueError("destination buffer is smaller than the requested sparse selection") + + rc = b2nd_get_sparse_cbuffer(self.array, ncoords, coords_.data, + view.buf, view.len) + PyBuffer_Release(&view) + _check_rc(rc, "Error while getting the sparse selection") + return arr + def get_oindex_numpy(self, arr, key): """ Orthogonal indexing. Key is a tuple of lists of integer indices. 
@@ -3751,17 +3818,78 @@ cdef class NDArray: return udata cdef me_udata *_fill_me_udata(self, inputs, fp_accuracy, aux_reduc, jit=None): - cdef me_udata *udata = malloc(sizeof(me_udata)) + cdef me_udata *udata = calloc(1, sizeof(me_udata)) + cdef me_eval_params* eval_params + cdef b2nd_array_t** inputs_ + cdef me_input_cache_s* input_chunk_caches + cdef void* aux_reduc_ptr = NULL + cdef int i + if aux_reduc is not None: + if not isinstance(aux_reduc, np.ndarray): + raise TypeError("aux_reduc must be a NumPy array") + aux_reduc_ptr = np.PyArray_DATA( aux_reduc) operands = list(inputs.values()) ninputs = len(operands) - cdef b2nd_array_t** inputs_ = malloc(ninputs * sizeof(b2nd_array_t*)) + if udata == NULL: + raise MemoryError("Cannot allocate miniexpr user data") + inputs_ = NULL + if ninputs > 0: + inputs_ = malloc(ninputs * sizeof(b2nd_array_t*)) + if inputs_ == NULL: + free(udata) + raise MemoryError("Cannot allocate miniexpr input table") for i, operand in enumerate(operands): inputs_[i] = operand.c_array - inputs_[i].chunk_cache.nchunk = -1 - inputs_[i].chunk_cache.data = NULL udata.inputs = inputs_ udata.ninputs = ninputs - cdef me_eval_params* eval_params = malloc(sizeof(me_eval_params)) + input_chunk_caches = NULL + if ninputs > 0: + input_chunk_caches = calloc(ninputs, sizeof(me_input_cache_s)) + if input_chunk_caches == NULL: + free(inputs_) + free(udata) + raise MemoryError("Cannot allocate miniexpr chunk caches") + for i in range(ninputs): + input_chunk_caches[i].nchunk = -1 + input_chunk_caches[i].state = ME_CACHE_EMPTY + input_chunk_caches[i].state_lock = PyThread_allocate_lock() + if input_chunk_caches[i].state_lock == NULL: + while i > 0: + i -= 1 + if input_chunk_caches[i].state_lock != NULL: + PyThread_free_lock(input_chunk_caches[i].state_lock) + if input_chunk_caches[i].ready_lock != NULL: + PyThread_free_lock(input_chunk_caches[i].ready_lock) + free(input_chunk_caches) + free(inputs_) + free(udata) + raise MemoryError("Cannot allocate miniexpr 
chunk cache state lock") + input_chunk_caches[i].ready_lock = PyThread_allocate_lock() + if input_chunk_caches[i].ready_lock == NULL: + PyThread_free_lock(input_chunk_caches[i].state_lock) + input_chunk_caches[i].state_lock = NULL + while i > 0: + i -= 1 + if input_chunk_caches[i].state_lock != NULL: + PyThread_free_lock(input_chunk_caches[i].state_lock) + if input_chunk_caches[i].ready_lock != NULL: + PyThread_free_lock(input_chunk_caches[i].ready_lock) + free(input_chunk_caches) + free(inputs_) + free(udata) + raise MemoryError("Cannot allocate miniexpr chunk cache ready lock") + udata.input_chunk_caches = input_chunk_caches + eval_params = malloc(sizeof(me_eval_params)) + if eval_params == NULL: + for i in range(ninputs): + if input_chunk_caches[i].state_lock != NULL: + PyThread_free_lock(input_chunk_caches[i].state_lock) + if input_chunk_caches[i].ready_lock != NULL: + PyThread_free_lock(input_chunk_caches[i].ready_lock) + free(input_chunk_caches) + free(inputs_) + free(udata) + raise MemoryError("Cannot allocate miniexpr eval params") eval_params.disable_simd = False eval_params.simd_ulp_mode = ME_SIMD_ULP_3_5 if fp_accuracy == blosc2.FPAccuracy.MEDIUM else ME_SIMD_ULP_1 if jit is None: @@ -3772,11 +3900,6 @@ cdef class NDArray: eval_params.jit_mode = ME_JIT_OFF udata.eval_params = eval_params udata.array = self.array - cdef void* aux_reduc_ptr = NULL - if aux_reduc is not None: - if not isinstance(aux_reduc, np.ndarray): - raise TypeError("aux_reduc must be a NumPy array") - aux_reduc_ptr = np.PyArray_DATA( aux_reduc) udata.aux_reduc_ptr = aux_reduc_ptr # Save these in udf_udata to avoid computing them for each block for i in range(self.array.ndim): diff --git a/src/blosc2/c2array.py b/src/blosc2/c2array.py index c662740d7..4f2a3bda1 100644 --- a/src/blosc2/c2array.py +++ b/src/blosc2/c2array.py @@ -246,6 +246,18 @@ def __init__(self, path: str, /, urlbase: str | None = None, auth_token: str | N cparams.pop("filters, meta", None) self._cparams = 
blosc2.CParams(**cparams) + def __enter__(self) -> C2Array: + """Enter a context manager and return this remote array.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + """Exit a context manager. + + ``C2Array`` does not currently hold explicit closeable resources, so this + is a logical no-op kept for API consistency with :func:`blosc2.open`. + """ + return False + def _to_b2object_payload(self) -> dict: payload = encode_b2object_payload(self) if payload is None: diff --git a/src/blosc2/cli/parquet_to_blosc2.py b/src/blosc2/cli/parquet_to_blosc2.py index 8f5c25070..3f6520bfa 100644 --- a/src/blosc2/cli/parquet_to_blosc2.py +++ b/src/blosc2/cli/parquet_to_blosc2.py @@ -48,6 +48,7 @@ DEFAULT_BATCH_SIZE = 2048 MAX_ELEMENT_WRITE_BATCH = 5_000_000 # cap on flattened elements yielded per write +UNNAMED_ROOT_CAPACITY_SAFETY = 1.15 # first-batch estimates are often a little low def require_pyarrow(): @@ -1060,7 +1061,9 @@ def import_unnamed_root_separate_cols( avg_per_outer_row = n_elems_sampled / n_outer_sampled estimated_batch_rows = max(1, round(args.parquet_batch_size * avg_per_outer_row)) estimate = round(total_parquet_rows * avg_per_outer_row) - if args.max_rows is not None: + if args.max_rows is None: + estimate = round(estimate * UNNAMED_ROOT_CAPACITY_SAFETY) + else: estimate = min(estimate, args.max_rows) capacity_hint = max(1, estimate) except Exception: diff --git a/src/blosc2/core.py b/src/blosc2/core.py index a00e17251..e0c4c4a35 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -1435,15 +1435,15 @@ def get_chunksize(blocksize, l3_minimum=4 * 2**20, l3_maximum=2**26, reduc_facto # in L3 cache (reduc_factor will account for this). chunksize //= reduc_factor - # Chunksize should be at least the size of L2 + # Chunksize should be at least the size of L2 / reduc_factor so that + # multi-operand expressions can keep all operands in cache. 
l2_cache_size = cpu_info.get("l2_cache_size", "Not found") if isinstance(l2_cache_size, int) and l2_cache_size > chunksize: - # Apple Silicon has a large L2 cache, and memory bandwidth is high, - # so we can use a larger chunksize based on L2 cache size. - # chunksize = l2_cache_size * 4 - # But experiments show that using such a large chunksize - # can make indexes too large. Going back to using just L2. - chunksize = l2_cache_size + if platform.system() == "Darwin": + # On macOS, using the full L2 as a floor has shown better overall behavior + chunksize = l2_cache_size + else: + chunksize = max(l2_cache_size // reduc_factor, chunksize) # Ensure a minimum size if chunksize < l3_minimum: diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py index 2e28fe75c..8f0883075 100644 --- a/src/blosc2/ctable.py +++ b/src/blosc2/ctable.py @@ -10,7 +10,6 @@ from __future__ import annotations -import ast import contextlib import contextvars import copy @@ -31,6 +30,7 @@ import blosc2 from blosc2 import compute_chunks_blocks +from blosc2.ctable_indexing import _CTableIndexingMixin from blosc2.ctable_storage import ( FileTableStorage, InMemoryTableStorage, @@ -180,7 +180,9 @@ def sentinel_for_arrow_type(self, pa, pa_type): "display_precision": 6, "fancy": False, } -_SMALL_SORT_MATERIALIZE_LIMIT = 4096 +_SMALL_NROWS_LIMIT = 10_000_000 +_SMALL_SORT_MATERIALIZE_LIMIT = _SMALL_NROWS_LIMIT +_MAX_GROWTH_ROWS = 1_048_576 def get_null_policy() -> NullPolicy: @@ -275,52 +277,6 @@ def null_policy(policy: NullPolicy): } -class _FakeVlMeta: - """Minimal vlmeta stand-in that accepts writes without touching a real SChunk.""" - - def __init__(self): - self._data: dict = {} - - def __getitem__(self, key): - return self._data[key] - - def __setitem__(self, key, value): - self._data[key] = value - - def get(self, key, default=None): - return self._data.get(key, default) - - -class _FakeSchunk: - """Minimal SChunk stand-in whose vlmeta stores in memory.""" - - def __init__(self): - self.vlmeta = 
_FakeVlMeta() - - -class _CTableBuildProxy: - """Minimal shim that lets the ``indexing`` module build sidecars for a - CTable column without touching the column's own ``schunk.vlmeta``. - - Attributes mirror those required by the internal build functions: - ``urlpath``, ``schunk``, ``shape``, ``ndim``, ``dtype``, ``chunks``, - ``blocks``, and item access via ``__getitem__``. - """ - - def __init__(self, col_array: blosc2.NDArray, anchor_urlpath: str | None) -> None: - self._col_array = col_array - self.urlpath = anchor_urlpath # controls sidecar placement - self.schunk = _FakeSchunk() - self.shape = col_array.shape - self.ndim = col_array.ndim - self.dtype = col_array.dtype - self.chunks = col_array.chunks - self.blocks = col_array.blocks - - def __getitem__(self, key): - return self._col_array[key] - - class _CTableInfoReporter(InfoReporter): """Info reporter that also preserves the historic ``t.info()`` call style.""" @@ -1024,6 +980,18 @@ def view(self) -> ColumnViewIndexer: """ return ColumnViewIndexer(self) + def take(self, indices, /) -> Column: + """Return a column containing values at the requested logical positions. + + Indices are relative to the live values visible through this column + (including any column view mask). The result preserves the order of + ``indices`` and any duplicates. 
+ """ + if self.is_computed: + raise ValueError("Column.take is not supported for computed columns yet.") + table_view = self._table.view(self._valid_rows).select([self._col_name]) + return table_view.take(indices)[self._col_name] + def __setitem__(self, key: int | slice | list | np.ndarray, value): # noqa: C901 """Set one or more live column values; accepts the same index forms as :meth:`__getitem__`.""" if self._table._read_only: @@ -1585,7 +1553,7 @@ def _dictionary_isin(self, values) -> np.ndarray: target_codes.add(dc.value_to_code(v)) if not target_codes: return np.zeros(len(live_pos), dtype=bool) - live_codes = np.asarray(dc.codes[live_pos], dtype=np.int32) + live_codes = dc.codes[live_pos] mask = np.zeros(len(live_codes), dtype=bool) for code in target_codes: mask |= live_codes == np.int32(code) @@ -2703,7 +2671,7 @@ def __delitem__(self, name: str) -> None: dict.__delitem__(self, name) -class CTable(Generic[RowT]): +class CTable(_CTableIndexingMixin, Generic[RowT]): """Columnar compressed table with typed columns and row-oriented access.""" #: Ordered list of stored column names. 
Computed columns are **not** @@ -2748,12 +2716,19 @@ def _iter_live_positions_chunks(self): def _live_positions_from_valid_rows_chunks(self) -> np.ndarray: """Return live physical row positions by scanning the validity NDArray chunk-wise.""" + cached = getattr(self, "_cached_live_positions", None) + if cached is not None: + return cached positions = list(self._iter_live_positions_chunks()) if not positions: - return np.empty(0, dtype=np.intp) - if len(positions) == 1: - return positions[0] - return np.concatenate(positions).astype(np.intp, copy=False) + result = np.empty(0, dtype=np.intp) + elif len(positions) == 1: + result = positions[0] + else: + result = np.concatenate(positions).astype(np.intp, copy=False) + if self.base is not None: + self._cached_live_positions = result + return result def __init__( self, @@ -2784,6 +2759,9 @@ def __init__( self._computed_cols: dict[str, dict] = {} # virtual/computed columns self._materialized_cols: dict[str, dict] = {} # stored columns auto-filled from expressions self._expr_index_arrays: dict[str, blosc2.NDArray] = {} + self._cached_index_catalog: dict | None = None + self._cached_index_catalog_revision: int | None = None + self._cached_live_positions: np.ndarray | None = None self._col_widths: dict[str, int] = {} self.col_names: list[str] = [] self.auto_compact = compact @@ -2823,6 +2801,10 @@ def __init__( cc = self._schema.columns_by_name[name] self._col_widths[name] = max(len(name), cc.display_width) self._n_rows = None + # Restore cached row count from saved metadata so that + # where() can skip the _valid_rows intersection for all-valid tables. 
+ if "n_rows" in schema_dict: + self._n_rows_cached = schema_dict["n_rows"] self._last_pos = None # resolve lazily on first write # ---- Restore computed/materialized column metadata (if any) ---- self._computed_cols = {} @@ -2856,12 +2838,26 @@ def __init__( if new_data is not None: self._load_initial_data(new_data) + # Persist the row count so subsequent opens can skip the + # _valid_rows intersection in where(). + self._save_n_rows_to_meta() def close(self) -> None: """Close any persistent backing store held by this table.""" storage = getattr(self, "_storage", None) + # Persist row count for root tables so subsequent opens can skip + # the _valid_rows intersection in where() for all-valid tables. + if not self._read_only and self.base is None: + self._save_n_rows_to_meta() + # Persist user vlmeta if a dedicated SChunk was created + if storage is not None: + uv = getattr(storage, "_vlmeta", None) + if uv is not None and hasattr(storage, "save_vlmeta"): + storage.save_vlmeta(uv) try: self._flush_varlen_columns() + if not self._read_only and self.base is None: + self.trim_capacity() except Exception: with contextlib.suppress(Exception): if storage is not None and hasattr(storage, "close"): @@ -3303,18 +3299,47 @@ def _resolve_last_pos(self) -> int: self._last_pos = last_true_pos + 1 return self._last_pos + def trim_capacity(self) -> None: + """Shrink fixed-width physical storage to the last live row position. + + This removes unused append capacity while preserving holes left by deletes + before the last live row. List and variable-length scalar columns already + grow to their logical length and are left untouched. 
+ """ + if self._read_only: + raise ValueError("Table is read-only (opened with mode='r').") + if self.base is not None: + raise ValueError("Cannot trim capacity of a view.") + + target = self._resolve_last_pos() + if target <= 0 or target >= len(self._valid_rows): + return + + for name, col_arr in self._cols.items(): + cc = self._schema.columns_by_name[name] + if self._is_list_column(cc) or self._is_varlen_scalar_column(cc): + continue + if self._is_dictionary_column(cc): + col_arr.resize((target,)) + continue + col_arr.resize(self._column_physical_shape(cc, target)) + self._valid_rows.resize((target,)) + self._last_pos = target + def _grow(self) -> None: - """Double the scalar-column capacity and the valid_rows mask.""" + """Grow scalar-column capacity and the valid_rows mask by one table chunk.""" c = len(self._valid_rows) + growth_rows = min(c, _MAX_GROWTH_ROWS) + new_capacity = c + growth_rows for name, col_arr in self._cols.items(): cc = self._schema.columns_by_name[name] if self._is_list_column(cc) or self._is_varlen_scalar_column(cc): continue if self._is_dictionary_column(cc): - col_arr.resize((c * 2,)) + col_arr.resize((new_capacity,)) continue - col_arr.resize(self._column_physical_shape(cc, c * 2)) - self._valid_rows.resize((c * 2,)) + col_arr.resize(self._column_physical_shape(cc, new_capacity)) + self._valid_rows.resize((new_capacity,)) # ------------------------------------------------------------------ # Display @@ -4101,7 +4126,7 @@ def _save_to_storage(self, storage: TableStorage) -> None: disk_dc.flush() # Copy live codes if n_live > 0: - raw_codes = np.asarray(src_dc.codes[live_pos], dtype=np.int32) + raw_codes = src_dc.codes[live_pos] disk_dc.codes[:n_live] = raw_codes continue shape = self._column_physical_shape(col, capacity) @@ -4199,6 +4224,10 @@ def _open_from_storage(cls, storage: TableStorage) -> CTable: obj._col_widths[name] = max(len(name), cc.display_width) obj._n_rows = None + # Restore cached row count from saved metadata so that + 
# where() can skip the _valid_rows intersection for all-valid tables. + if "n_rows" in schema_dict: + obj._n_rows_cached = schema_dict["n_rows"] obj._last_pos = None obj._computed_cols = {} obj._materialized_cols = {} @@ -4369,6 +4398,8 @@ def _make_view(cls, parent: CTable, new_valid_rows: blosc2.NDArray) -> CTable: obj._computed_cols = parent._computed_cols # shared — LazyExpr refs remain valid obj._materialized_cols = parent._materialized_cols obj._expr_index_arrays = parent._expr_index_arrays + obj._cached_index_catalog = None + obj._cached_live_positions = None obj._col_widths = parent._col_widths obj.col_names = parent.col_names obj.auto_compact = parent.auto_compact @@ -4380,6 +4411,23 @@ def _make_view(cls, parent: CTable, new_valid_rows: blosc2.NDArray) -> CTable: obj._last_pos = None return obj + def _view_from_positions(self, positions: np.ndarray) -> CTable: + """Return a row-filter view from physical row positions.""" + positions = np.asarray(positions, dtype=np.intp) + total = len(self._valid_rows) + if len(positions): + positions = positions[(positions >= 0) & (positions < total)] + if len(positions) and self._known_n_rows() != total: + keep = np.asarray(self._valid_rows[positions], dtype=bool) + positions = positions[keep] + mask = np.zeros(total, dtype=np.bool_) + if len(positions): + mask[positions] = True + result = CTable._make_view(self, blosc2.asarray(mask)) + result._cached_live_positions = positions + result._n_rows = len(positions) + return result + def view(self, new_valid_rows): """Return a row-filter view backed by a boolean mask array without copying data.""" if isinstance(new_valid_rows, np.ndarray) and new_valid_rows.dtype == np.bool_: @@ -4401,6 +4449,62 @@ def view(self, new_valid_rows): return CTable._make_view(self, new_valid_rows) + @staticmethod + def _normalize_row_take_indices(indices, size: int) -> np.ndarray: + if isinstance(indices, blosc2.NDArray): + indices = indices[()] + indices = np.asarray(indices) + if indices.ndim 
== 0: + indices = indices.reshape(1) + if indices.ndim != 1: + raise ValueError("CTable.take indices must be a 1-D integer array") + if indices.size == 0: + return np.ascontiguousarray(indices, dtype=np.int64) + if not np.issubdtype(indices.dtype, np.integer): + raise TypeError("CTable.take indices must be integers") + normalized = np.ascontiguousarray(indices, dtype=np.int64) + negative = normalized < 0 + if np.any(negative): + normalized = normalized.copy() + normalized[negative] += size + if np.any((normalized < 0) | (normalized >= size)): + raise IndexError("CTable.take index out of bounds") + return normalized + + def take(self, indices, /) -> CTable: + """Return a compact table containing rows at the requested positions. + + Indices are interpreted as logical row positions among live rows. The + returned table preserves the order of ``indices`` and any duplicates, + unlike mask-based views. + """ + logical_pos = self._normalize_row_take_indices(indices, self.nrows) + physical_pos = self._live_positions_from_valid_rows_chunks()[logical_pos] + n = len(physical_pos) + + result = self._empty_copy(capacity=n) + for col in self._schema.columns: + col_name = col.name + arr = self._cols[col_name] + if self._is_list_column(col): + result._cols[col_name].extend((arr[int(pos)] for pos in physical_pos), validate=False) + result._cols[col_name].flush() + elif self._is_varlen_scalar_column(col): + result._cols[col_name].extend(arr[int(pos)] for pos in physical_pos) + result._cols[col_name].flush() + elif self._is_dictionary_column(col): + for v in arr.dictionary: + result._cols[col_name].encode(v) + result._cols[col_name].codes[:n] = arr.codes._take_numpy(physical_pos, axis=0) + else: + result._cols[col_name][:n] = arr._take_numpy(physical_pos, axis=0) + + result._valid_rows[:n] = True + result._valid_rows[n:] = False + result._n_rows = n + result._last_pos = n - 1 if n > 0 else None + return result + def head(self, N: int = 5) -> CTable: """Return a view of the first *N* 
live rows (default 5).""" if N <= 0: @@ -4528,6 +4632,8 @@ def select(self, cols: list[str]) -> CTable: name: dict(self._materialized_cols[name]) for name in cols if name in self._materialized_cols } obj._expr_index_arrays = self._expr_index_arrays + obj._cached_index_catalog = None + obj._cached_live_positions = getattr(self, "_cached_live_positions", None) # Computed columns — share the same definitions (LazyExpr refs remain valid) obj._computed_cols = { @@ -4928,7 +5034,7 @@ def iter_arrow_batches( # noqa: C901 pa.DictionaryArray.from_arrays(pa_indices, pa_dict, ordered=spec.ordered) ) else: - raw_codes = np.asarray(dc.codes[batch_real_pos], dtype=np.int32) + raw_codes = dc.codes[batch_real_pos] null_mask = raw_codes == np.int32(spec.null_code) safe_codes = raw_codes.copy() safe_codes[null_mask] = 0 @@ -5495,6 +5601,12 @@ def normalize_struct(value, field_normalizers=field_normalizers): return None + @classmethod + def _trim_arrow_import_capacity(cls, obj, n_rows: int) -> None: + """Shrink append-only Arrow-import columns from capacity to actual row count.""" + obj._last_pos = n_rows + obj.trim_capacity() + @classmethod def _write_arrow_batches(cls, obj, batches, columns, new_cols, new_valid) -> None: pos = 0 @@ -5516,6 +5628,7 @@ def _write_arrow_batches(cls, obj, batches, columns, new_cols, new_valid) -> Non or cls._is_dictionary_column(col) ): new_cols[col.name].flush() + cls._trim_arrow_import_capacity(obj, pos) obj._n_rows = pos obj._last_pos = pos @@ -6893,12 +7006,13 @@ def drop_column(self, name: str) -> None: + ". Drop those columns first." 
) - catalog = self._storage.load_index_catalog() + catalog = self._get_index_catalog() if name in catalog: descriptor = catalog.pop(name) self._validate_index_descriptor(name, descriptor) self._drop_index_descriptor(name, descriptor) self._storage.save_index_catalog(catalog) + self._invalidate_index_catalog_cache() if isinstance(self._storage, FileTableStorage): self._storage.delete_column(name) @@ -6969,7 +7083,7 @@ def rename_column(self, old: str, new: str) -> None: + ". Drop those computed columns first." ) - catalog = self._storage.load_index_catalog() + catalog = self._get_index_catalog() rebuild_kwargs = None if old in catalog: descriptor = catalog.pop(old) @@ -6977,6 +7091,7 @@ def rename_column(self, old: str, new: str) -> None: rebuild_kwargs = self._index_create_kwargs_from_descriptor(descriptor) self._drop_index_descriptor(old, descriptor) self._storage.save_index_catalog(catalog) + self._invalidate_index_catalog_cache() if isinstance(self._storage, FileTableStorage): self._cols[new] = self._storage.rename_column(old, new) @@ -7092,6 +7207,9 @@ def _fetch_col_at_positions(self, name: str, positions: np.ndarray): def _schema_dict_with_computed(self) -> dict: """Return the schema dict extended with computed/materialized metadata.""" d = schema_to_dict(self._schema) + n_rows = self._known_n_rows() + if n_rows is not None: + d["n_rows"] = n_rows if self._computed_cols: d["computed_columns"] = [ { @@ -7120,6 +7238,37 @@ def _schema_dict_with_computed(self) -> dict: d["materialized_columns"] = materialized return d + def _save_n_rows_to_meta(self) -> None: + """Persist the cached row count into the _meta SChunk's vlmeta. + + Updates the vlmeta of the existing _meta SChunk directly and writes + it back to its backing store. This avoids going through save_schema() + which can route through the embed store where SChunk slice writes may + fail when the backing store has chunksize=-1. 
+ """ + n_rows = self._known_n_rows() + if n_rows is None: + return + storage = self._storage + if not hasattr(storage, "_open_meta"): + return + try: + meta = storage._open_meta() + schema_raw = meta.vlmeta.get("schema") + if schema_raw is None: + return + schema_dict = json.loads(schema_raw) + schema_dict["n_rows"] = n_rows + meta.vlmeta["schema"] = json.dumps(schema_dict) + # Persist: for FileTableStorage, rewrite the external _meta.b2f file. + if hasattr(storage, "_meta_path"): + meta.save(urlpath=storage._meta_path, mode="w") + elif hasattr(storage, "_write_leaf"): + # TreeStoreTableStorage + storage._write_leaf("/_meta", meta, ".b2f") + except Exception: + pass # best-effort; failure must not prevent close() + def _load_computed_cols_from_schema(self, schema_dict: dict) -> None: """Reconstruct ``_computed_cols`` from persisted metadata. @@ -7915,10 +8064,6 @@ def drop_computed_column(self, name: str) -> None: def _all_strings(seq) -> bool: return all(isinstance(v, str) for v in seq) - @staticmethod - def _all_ints(seq) -> bool: - return all(isinstance(v, (int, np.integer)) and not isinstance(v, (bool, np.bool_)) for v in seq) - def _getitem_arraylike(self, key): if len(key) == 0: return self._run_row_logic(key) @@ -8155,7 +8300,7 @@ def compact(self): continue if self._is_dictionary_column(col): # Keep dictionary values intact; just compact the codes. - live_codes = np.asarray(v.codes[real_poss[: self._n_rows]], dtype=np.int32) + live_codes = v.codes[real_poss[: self._n_rows]] v.codes[: self._n_rows] = live_codes continue start = 0 @@ -8241,7 +8386,7 @@ def _sorted_positions_from_full_index(self, name: str, ascending: bool) -> np.nd queries and is much slower for full-table streaming. 
""" root = self._root_table - catalog = root._storage.load_index_catalog() + catalog = root._get_index_catalog() descriptor = None if name in root._cols: @@ -8451,7 +8596,7 @@ def _sorted_small_copy_from_live_positions( for col in self._schema.columns: arr = self._cols[col.name] if self._is_dictionary_column(col): - gathered[col.name] = np.asarray(arr.codes[live_pos], dtype=np.int32) + gathered[col.name] = arr.codes[live_pos] else: gathered[col.name] = arr[live_pos] @@ -8507,7 +8652,7 @@ def _sort_by_inplace(self, sorted_pos: np.ndarray, n: int) -> None: new_arr.flush() self._cols[col.name] = new_arr elif self._is_dictionary_column(col): - sorted_codes = np.asarray(arr.codes[sorted_pos], dtype=np.int32) + sorted_codes = arr.codes[sorted_pos] arr.codes[:n] = sorted_codes else: arr[:n] = arr[sorted_pos] @@ -8530,7 +8675,7 @@ def _sorted_copy_from_positions(self, sorted_pos: np.ndarray, n: int) -> CTable: # Copy dictionary values, then sorted codes. for v in arr.dictionary: result._cols[col_name].encode(v) - sorted_codes = np.asarray(arr.codes[sorted_pos], dtype=np.int32) + sorted_codes = arr.codes[sorted_pos] result._cols[col_name].codes[:n] = sorted_codes else: result._cols[col_name][:n] = arr[sorted_pos] @@ -8614,7 +8759,7 @@ def copy( for v in arr.dictionary: result._cols[col_name].encode(v) pos_slice = live_pos if compact else np.arange(n, dtype=np.int64) - raw_codes = np.asarray(arr.codes[pos_slice], dtype=np.int32) + raw_codes = arr.codes[pos_slice] result._cols[col_name].codes[:n] = raw_codes else: result._cols[col_name][:n] = arr[live_pos] if compact else arr[:n] @@ -8754,6 +8899,56 @@ def schema(self) -> CompiledSchema: """The compiled schema that drives this table's columns and validation.""" return self._schema + @property + def vlmeta(self): + """Variable-length metadata attached to this table. + + Returns a mapping-like proxy that supports item access, iteration, + and the ``[:]`` bulk getter. 
Values are serialised via msgpack, so + all standard types (int, float, str, bool, list, dict) are supported. + The metadata is stored separately from the internal schema metadata + and persists through ``close()`` / reopen for disk-backed tables. + + Examples + -------- + >>> import blosc2 + >>> import dataclasses + >>> @dataclasses.dataclass + ... class Row: + ... x: int = 0 + >>> t = blosc2.CTable(Row) + >>> t.vlmeta["author"] = "Alice" + >>> t.vlmeta["tags"] = ["alpha", "beta"] + >>> t.vlmeta["count"] = 42 + >>> print(t.vlmeta["author"]) + Alice + >>> print(t.vlmeta[:]) + {'author': 'Alice', 'tags': ['alpha', 'beta'], 'count': 42} + >>> del t.vlmeta["count"] + >>> for name in t.vlmeta: + ... print(name, t.vlmeta[name]) + ... + author Alice + tags ['alpha', 'beta'] + """ + storage = getattr(self, "_storage", None) + if storage is None: + raise AttributeError("CTable has no storage backend") + if not hasattr(storage, "_open_meta"): + # In-memory table: create a simple SChunk to hold vlmeta lazily + _tmp = getattr(storage, "_vlmeta_schunk", None) + if _tmp is None: + storage._vlmeta_schunk = blosc2.SChunk() + return storage._vlmeta_schunk.vlmeta + # Persistent table: use the dedicated user-vlmeta SChunk + meta = storage._open_vlmeta() + if meta is None: + # First access — create an in-memory SChunk; it will be saved + # to disk when the table is closed. + meta = blosc2.SChunk() + storage._vlmeta = meta + return meta.vlmeta + def column_schema(self, name: str) -> CompiledColumn: """Return the :class:`CompiledColumn` descriptor for *name*. 
@@ -8772,960 +8967,9 @@ def schema_dict(self) -> dict[str, Any]: return schema_to_dict(self._schema) # ------------------------------------------------------------------ - # Index management + # Info reporting # ------------------------------------------------------------------ - @property - def _root_table(self) -> CTable: - """Return the root (non-view) table; *self* if not a view.""" - t = self - while t.base is not None: - t = t.base - return t - - def _mark_all_indexes_stale(self) -> None: - """Bump value_epoch and mark every catalog entry stale on the root table.""" - root = self._root_table - root._storage.bump_value_epoch() - catalog = root._storage.load_index_catalog() - if not catalog: - return - changed = False - for desc in catalog.values(): - if not desc.get("stale", False): - desc["stale"] = True - changed = True - if changed: - root._storage.save_index_catalog(catalog) - - @staticmethod - def _validate_index_descriptor(col_name: str, descriptor: dict) -> None: - """Raise ValueError when an index catalog entry is malformed.""" - if not isinstance(descriptor, dict): - raise ValueError(f"Malformed index metadata for column {col_name!r}: descriptor must be a dict.") - token = descriptor.get("token") - if not isinstance(token, str) or not token: - raise ValueError(f"Malformed index metadata for column {col_name!r}: missing token.") - kind = descriptor.get("kind") - if kind not in {"summary", "bucket", "partial", "full", "opsi"}: - raise ValueError(f"Malformed index metadata for column {col_name!r}: invalid kind {kind!r}.") - if kind == "bucket" and not isinstance(descriptor.get("bucket"), dict): - raise ValueError(f"Malformed index metadata for column {col_name!r}: missing bucket payload.") - if kind == "partial" and not isinstance(descriptor.get("partial"), dict): - raise ValueError(f"Malformed index metadata for column {col_name!r}: missing partial payload.") - if kind == "full" and not isinstance(descriptor.get("full"), dict): - raise 
ValueError(f"Malformed index metadata for column {col_name!r}: missing full payload.") - - def _drop_index_descriptor(self, col_name: str, descriptor: dict) -> None: - """Delete sidecars/cache for a catalog descriptor without touching the column mapping.""" - from pathlib import Path - - from blosc2.indexing import ( - _IN_MEMORY_INDEXES, - _PERSISTENT_INDEXES, - _array_key, - _clear_cached_data, - _drop_descriptor_sidecars, - _is_persistent_array, - ) - - token = descriptor["token"] - col_arr = None - with contextlib.suppress(Exception): - col_arr = self._index_target_array(col_name, descriptor) - - if col_arr is not None: - _clear_cached_data(col_arr, token) - - if col_arr is not None and _is_persistent_array(col_arr): - arr_key = _array_key(col_arr) - store = _PERSISTENT_INDEXES.get(arr_key) - if store is not None: - store["indexes"].pop(token, None) - elif col_arr is not None: - store = _IN_MEMORY_INDEXES.get(id(col_arr)) - if store is not None: - store["indexes"].pop(token, None) - - _drop_descriptor_sidecars(descriptor) - self._root_table._expr_index_arrays.pop(token, None) - - expr_values_path = descriptor.get("expr_values_path") - if expr_values_path is not None: - with contextlib.suppress(OSError): - os.remove(expr_values_path) - - anchor = self._storage.index_anchor_path(col_name) - if anchor is not None: - proxy_key = ("persistent", str(Path(anchor).resolve())) - _PERSISTENT_INDEXES.pop(proxy_key, None) - with contextlib.suppress(OSError): - os.rmdir(os.path.dirname(anchor)) - - def _index_create_kwargs_from_descriptor(self, descriptor: dict) -> dict[str, Any]: - """Return create_index kwargs that rebuild an existing descriptor.""" - build = "ooc" if bool(descriptor.get("ooc", False)) else "memory" - kwargs = { - "kind": descriptor["kind"], - "optlevel": int(descriptor.get("optlevel", 5)), - "name": descriptor.get("name") or None, - "build": build, - "cparams": descriptor.get("cparams"), - } - if descriptor.get("kind") == "full": - kwargs["method"] = 
descriptor.get("full", {}).get("build_method", "global-sort") - if descriptor.get("kind") == "opsi": - kwargs["opsi_max_cycles"] = descriptor.get("opsi", {}).get("max_cycles") - target = descriptor.get("target") or {} - if target.get("source") == "expression": - kwargs["expression"] = target.get("expression") - return kwargs - - def _normalize_table_expression_target( - self, expression: str, operands: dict | None = None - ) -> tuple[dict, np.dtype]: - """Normalize a same-table expression target and infer its dtype.""" - if operands is None: - operands = self._cols - try: - ast.parse(expression, mode="eval") - except SyntaxError as exc: - raise ValueError("expression is not valid Python syntax") from exc - - owned_ids = {id(arr): name for name, arr in self._root_table._cols.items()} - dependencies: list[str] = [] - valid = True - - class _Canonicalizer(ast.NodeTransformer): - def visit_Name(self_inner, node: ast.Name) -> ast.AST: - nonlocal valid - operand = operands.get(node.id) - if operand is None or not isinstance(operand, blosc2.NDArray): - return node - cname = owned_ids.get(id(operand)) - if cname is None: - valid = False - return node - dependencies.append(cname) - return ast.copy_location(ast.Name(id=cname, ctx=node.ctx), node) - - normalized = _Canonicalizer().visit( - ast.fix_missing_locations(ast.parse(expression, mode="eval")).body - ) - if not valid or not dependencies: - raise ValueError("expression indexes require operands from stored columns of the same table") - dependencies = list(dict.fromkeys(dependencies)) - expression_key = ast.unparse(normalized) - lazy = blosc2.lazyexpr(expression_key, {dep: self._root_table._cols[dep] for dep in dependencies}) - sample_stop = min( - len(self._root_table._valid_rows), max(1, int(self._root_table._valid_rows.blocks[0])) - ) - sample = lazy[:sample_stop] - if isinstance(sample, blosc2.NDArray): - sample = sample[:] - sample = np.asarray(sample) - dtype = np.dtype(sample.dtype) - if sample.ndim != 1: - raise 
ValueError("expression indexes require expressions returning a 1-D scalar stream") - target = { - "source": "expression", - "expression": expression, - "expression_key": expression_key, - "dependencies": dependencies, - } - return target, dtype - - def _expression_index_values_path(self, token: str) -> str | None: - anchor = self._storage.index_anchor_path(token) - if anchor is None: - return None - return os.path.join(os.path.dirname(anchor), "values.b2nd") - - def _build_expression_values_array(self, target: dict, dtype: np.dtype, cparams=None) -> blosc2.NDArray: - """Build a physical 1-D values array for a table expression target.""" - from blosc2.indexing import _target_token - - root = self._root_table - capacity = len(root._valid_rows) - chunks, blocks = compute_chunks_blocks((capacity,), dtype=dtype) - urlpath = root._expression_index_values_path(_target_token(target)) - if urlpath is not None: - os.makedirs(os.path.dirname(urlpath), exist_ok=True) - arr = blosc2.zeros( - (capacity,), dtype=dtype, urlpath=urlpath, mode="w", chunks=chunks, blocks=blocks - ) - else: - arr = blosc2.zeros((capacity,), dtype=dtype, chunks=chunks, blocks=blocks) - lazy = blosc2.lazyexpr( - target["expression_key"], {dep: root._cols[dep] for dep in target["dependencies"]} - ) - step = int(root._valid_rows.chunks[0]) if root._valid_rows.chunks else 65536 - for start in range(0, capacity, step): - stop = min(start + step, capacity) - values = lazy[start:stop] - if isinstance(values, blosc2.NDArray): - values = values[:] - arr[start:stop] = np.asarray(values, dtype=dtype) - root._expr_index_arrays[_target_token(target)] = arr - return arr - - def _index_target_array(self, lookup_key: str, descriptor: dict) -> blosc2.NDArray: - """Return the physical array backing a column or expression index.""" - target = descriptor.get("target") or {} - if target.get("source") != "expression": - return self._root_table._cols[lookup_key] - token = descriptor["token"] - root = self._root_table - arr = 
root._expr_index_arrays.get(token) - if arr is not None: - return arr - path = descriptor.get("expr_values_path") - if path is None: - raise KeyError(f"No backing array found for expression index {token!r}.") - arr = blosc2.open(path, mode="r" if root._read_only else "a") - root._expr_index_arrays[token] = arr - return arr - - def _resolve_index_catalog_entry( - self, col_name: str | None = None, *, expression: str | None = None, name: str | None = None - ) -> tuple[str, dict]: - """Resolve an index catalog entry by column, expression, or label.""" - catalog = self._root_table._storage.load_index_catalog() - if col_name is not None and expression is not None: - raise ValueError("col_name and expression are mutually exclusive") - if col_name is not None: - col_name = self._logical_to_physical_name(col_name) - if col_name not in catalog: - raise KeyError(f"No index found for column {col_name!r}.") - return col_name, catalog[col_name] - if expression is not None: - from blosc2.indexing import _target_token - - target, _ = self._normalize_table_expression_target(expression) - token = _target_token(target) - if token not in catalog: - raise KeyError(f"No index found for expression {expression!r}.") - return token, catalog[token] - if name is not None: - matches = [(key, desc) for key, desc in catalog.items() if desc.get("name") == name] - if not matches: - raise KeyError(f"No index found with name {name!r}.") - if len(matches) > 1: - raise ValueError(f"Multiple indexes found with name {name!r}; specify a target explicitly.") - return matches[0] - raise TypeError("must specify col_name, expression, or name") - - def _build_index_persistent( - self, - col_name: str, - col_arr: blosc2.NDArray, - *, - kind: str, - optlevel: int, - name_hint: str | None, - build: str, - tmpdir: str | None, - cparams_obj, - method: str | None = None, - opsi_max_cycles: int | None = None, - ) -> dict: - """Build index sidecar files for a persistent-table column; return the descriptor.""" - 
import tempfile - from pathlib import Path - - from blosc2.indexing import ( - _PERSISTENT_INDEXES, - _array_key, - _build_bucket_descriptor, - _build_bucket_descriptor_ooc, - _build_descriptor, - _build_full_descriptor, - _build_full_descriptor_ooc, - _build_levels_descriptor, - _build_levels_descriptor_ooc, - _build_opsi_descriptor, - _build_partial_descriptor, - _build_partial_descriptor_ooc, - _copy_descriptor, - _field_target_descriptor, - _resolve_full_index_tmpdir, - _resolve_ooc_mode, - _target_token, - _values_for_target, - ) - - anchor = self._storage.index_anchor_path(col_name) - os.makedirs(os.path.dirname(anchor), exist_ok=True) - proxy = _CTableBuildProxy(col_arr, anchor) - proxy_key = _array_key(proxy) - _PERSISTENT_INDEXES.pop(proxy_key, None) # clear any stale cache entry - - target = _field_target_descriptor(None) - token = _target_token(target) - persistent = True - dtype = col_arr.dtype - use_ooc = _resolve_ooc_mode(kind, build) - if opsi_max_cycles is None: - opsi_max_cycles = max(1, optlevel if optlevel < 8 else optlevel * 2) - - if use_ooc: - resolved_tmpdir = _resolve_full_index_tmpdir(proxy, tmpdir) - levels = _build_levels_descriptor_ooc(proxy, target, token, kind, dtype, persistent, cparams_obj) - bucket = ( - _build_bucket_descriptor_ooc( - proxy, target, token, kind, dtype, optlevel, persistent, cparams_obj - ) - if kind == "bucket" - else None - ) - partial = ( - _build_partial_descriptor_ooc( - proxy, target, token, kind, dtype, optlevel, persistent, cparams_obj - ) - if kind == "partial" - else None - ) - full = None - opsi = None - if kind == "full": - with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-", dir=resolved_tmpdir) as td: - full = _build_full_descriptor_ooc( - proxy, target, token, kind, dtype, persistent, Path(td), cparams_obj, optlevel - ) - full["build_method"] = "global-sort" - if kind == "opsi": - opsi = _build_opsi_descriptor( - proxy, target, token, kind, dtype, persistent, cparams_obj, opsi_max_cycles, 
optlevel - ) - descriptor = _build_descriptor( - proxy, - target, - token, - kind, - optlevel, - persistent, - True, - name_hint, - dtype, - levels, - bucket, - partial, - full, - cparams_obj, - opsi, - ) - else: - values = _values_for_target(proxy, target) - levels = _build_levels_descriptor( - proxy, target, token, kind, dtype, values, persistent, cparams_obj - ) - bucket = ( - _build_bucket_descriptor(proxy, token, kind, values, optlevel, persistent, cparams_obj) - if kind == "bucket" - else None - ) - partial = ( - _build_partial_descriptor(proxy, token, kind, values, optlevel, persistent, cparams_obj) - if kind == "partial" - else None - ) - full = None - opsi = None - if kind == "full": - full = _build_full_descriptor(proxy, token, kind, values, persistent, cparams_obj, optlevel) - full["build_method"] = "global-sort" - if kind == "opsi": - opsi = _build_opsi_descriptor( - proxy, target, token, kind, dtype, persistent, cparams_obj, opsi_max_cycles, optlevel - ) - descriptor = _build_descriptor( - proxy, - target, - token, - kind, - optlevel, - persistent, - False, - name_hint, - dtype, - levels, - bucket, - partial, - full, - cparams_obj, - opsi, - ) - - result = _copy_descriptor(descriptor) - _PERSISTENT_INDEXES.pop(proxy_key, None) # evict proxy to avoid memory leak - return result - - def create_index( # noqa: C901 - self, - col_name: str | None = None, - *, - field: str | None = None, - expression: str | None = None, - operands: dict | None = None, - kind: blosc2.IndexKind = blosc2.IndexKind.BUCKET, - optlevel: int = 5, - name: str | None = None, - build: str = "auto", - tmpdir: str | None = None, - **kwargs, - ) -> blosc2.Index: - """Build and register an index for a stored column or table expression. - - For tables with **nested (dotted) column names**, pass the dotted leaf - name directly:: - - t.create_index("trip.begin.lon") - t.where("trip.begin.lon > -87.7").nrows # index is used automatically - - .. 
rubric:: Choosing an index kind - - ``BUCKET`` (the default) is the cheapest to build and store. - It accelerates single‑column ``where`` queries and ``sort_by`` - reuse with approximate ordering derived from value - quantization. Sufficient for most workloads. - - ``FULL`` builds a globally sorted index that returns exact - row positions for any range predicate. It enables the - **cross‑column refinement** planner path: when a multi‑column - conjunction such as ``(tips > 100) & (km > 0) & (sec > 0)`` - indexes only the most selective column, the planner obtains - compact exact positions from ``FULL`` and evaluates the - remaining predicates on just those rows. ``FULL`` is also - ideal for ``sort_by`` reuse because it carries a complete - sort order. - - ``PARTIAL`` builds a chunk‑local sorted payload with segment - navigation. It is cheaper to build than ``FULL`` (roughly - half the raw storage) while still providing exact positions - for cross‑column refinement. Its exact positions are most - compact for equality or narrow range queries; wide ranges - may scan proportionally more candidate segments. - - ``OPSI`` is a specialised tier for approximate ordering; - prefer ``FULL`` when a globally sorted ordered index is - needed to accelerate ``sort_by``. - - ``SUMMARY`` stores only per‑segment min/max and is the - lightest kind; it may still skip chunks for broad range - queries but cannot accelerate ``sort_by``. 
- """ - if self.base is not None: - raise ValueError("Cannot create an index on a view.") - if col_name is not None and field is not None: - raise ValueError("col_name and field are mutually exclusive") - if expression is not None and (col_name is not None or field is not None): - raise ValueError("column targets and expression are mutually exclusive") - if operands is not None and expression is None: - raise ValueError("operands can only be provided together with expression") - col_name = field if field is not None else col_name - if col_name is not None: - col_name = self._logical_to_physical_name(col_name) - - from blosc2.indexing import ( - _IN_MEMORY_INDEXES, - _copy_descriptor, - _normalize_build_mode, - _normalize_full_build_method, - _normalize_index_cparams, - _normalize_index_kind, - _target_token, - ) - from blosc2.indexing import create_index as _ix_create_index - - cparams_obj = _normalize_index_cparams(kwargs.pop("cparams", None)) - method = kwargs.pop("method", None) - opsi_max_cycles = kwargs.pop("opsi_max_cycles", None) - if opsi_max_cycles is not None: - opsi_max_cycles = max(1, int(opsi_max_cycles)) - if kwargs: - raise TypeError(f"unexpected keyword argument(s): {', '.join(sorted(kwargs))}") - - kind_str = _normalize_index_kind(kind) - build_str = _normalize_build_mode(build) - method_str = _normalize_full_build_method(method) if kind_str == "full" else None - if method is not None and kind_str != "full": - raise ValueError("method is only supported for kind=IndexKind.FULL") - catalog = self._storage.load_index_catalog() - - if expression is not None: - target, dtype = self._normalize_table_expression_target(expression, operands) - token = _target_token(target) - if token in catalog: - raise ValueError( - f"Index already exists for expression {expression!r}. " - "Call rebuild_index() to replace it or drop_index() first." 
- ) - expr_arr = self._build_expression_values_array(target, dtype, cparams=cparams_obj) - _ix_create_index( - expr_arr, - kind=blosc2.IndexKind(kind_str), - optlevel=optlevel, - name=name, - build=build, - tmpdir=tmpdir, - cparams=cparams_obj, - method=method_str, - opsi_max_cycles=opsi_max_cycles, - ) - store = _IN_MEMORY_INDEXES.get(id(expr_arr)) - if store is None: - from blosc2.indexing import _load_store - - store = _load_store(expr_arr) - descriptor = _copy_descriptor(store["indexes"]["__self__"]) - descriptor["target"] = target - descriptor["token"] = token - descriptor["dtype"] = str(np.dtype(dtype)) - descriptor["expr_values_path"] = getattr(expr_arr, "urlpath", None) - value_epoch, _ = self._storage.get_epoch_counters() - descriptor["built_value_epoch"] = value_epoch - catalog[token] = descriptor - self._storage.save_index_catalog(catalog) - return blosc2.Index._from_table(self, token, descriptor) - - if col_name is None: - raise TypeError("must specify col_name/field or expression") - if col_name in self._computed_cols: - raise ValueError( - f"Cannot create an index on computed column {col_name!r}: " - "computed columns have no physical storage." - ) - if col_name not in self._cols: - raise KeyError(f"No column named {col_name!r}. Available: {self.col_names}") - self._ensure_generated_column_not_stale(col_name) - if col_name in catalog: - raise ValueError( - f"Index already exists for column {col_name!r}. " - "Call rebuild_index() to replace it or drop_index() first." - ) - - col_arr = self._cols[col_name] - if isinstance(self._schema.columns_by_name[col_name].spec, NDArraySpec): - spec = self._schema.columns_by_name[col_name].spec - raise ValueError( - f"Cannot create an index on ndarray column {col_name!r} with per-row shape {spec.item_shape}. " - "Materialize a scalar generated column first, e.g. embedding_norm or embedding_max." 
- ) - if isinstance(self._schema.columns_by_name[col_name].spec, ListSpec): - raise ValueError(f"Cannot create an index on list column {col_name!r} in V1.") - if isinstance( - self._schema.columns_by_name[col_name].spec, (VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec) - ): - raise NotImplementedError( - f"Cannot create an index on variable-length scalar column {col_name!r}: " - "indexing for vlstring/vlbytes/struct/object columns is not supported yet." - ) - # Dictionary columns: index the underlying int32 codes array. - is_dictionary = isinstance(self._schema.columns_by_name[col_name].spec, DictionarySpec) - if is_dictionary: - col_arr = col_arr.codes # index the int32 codes NDArray - is_persistent = self._storage.index_anchor_path(col_name) is not None - - if is_persistent: - descriptor = self._build_index_persistent( - col_name, - col_arr, - kind=kind_str, - optlevel=optlevel, - name_hint=name, - build=build_str, - tmpdir=tmpdir, - cparams_obj=cparams_obj, - method=method_str, - opsi_max_cycles=opsi_max_cycles, - ) - else: - _ix_create_index( - col_arr, - field=None, - kind=blosc2.IndexKind(kind_str), - optlevel=optlevel, - name=name, - build=build, - tmpdir=tmpdir, - cparams=cparams_obj, - method=method_str, - opsi_max_cycles=opsi_max_cycles, - ) - store = _IN_MEMORY_INDEXES[id(col_arr)] - descriptor = _copy_descriptor(store["indexes"]["__self__"]) - - value_epoch, _ = self._storage.get_epoch_counters() - descriptor["built_value_epoch"] = value_epoch - - catalog = self._storage.load_index_catalog() - catalog[col_name] = descriptor - self._storage.save_index_catalog(catalog) - return blosc2.Index._from_table(self, col_name, descriptor) - - def drop_index( - self, col_name: str | None = None, *, expression: str | None = None, name: str | None = None - ) -> None: - """Remove an index and delete any sidecar files.""" - if self.base is not None: - raise ValueError("Cannot drop an index from a view.") - - lookup_key, descriptor = 
self._resolve_index_catalog_entry( - col_name, expression=expression, name=name - ) - catalog = self._storage.load_index_catalog() - catalog.pop(lookup_key, None) - self._validate_index_descriptor(lookup_key, descriptor) - self._drop_index_descriptor(lookup_key, descriptor) - self._storage.save_index_catalog(catalog) - - def rebuild_index( - self, col_name: str | None = None, *, expression: str | None = None, name: str | None = None - ) -> blosc2.Index: - """Drop and recreate an index with the same parameters.""" - if self.base is not None: - raise ValueError("Cannot rebuild an index on a view.") - - lookup_key, old_desc = self._resolve_index_catalog_entry(col_name, expression=expression, name=name) - self._validate_index_descriptor(lookup_key, old_desc) - create_kwargs = self._index_create_kwargs_from_descriptor(old_desc) - - self.drop_index(col_name, expression=expression, name=name) - if "expression" in create_kwargs: - return self.create_index(expression=create_kwargs.pop("expression"), **create_kwargs) - return self.create_index(lookup_key, **create_kwargs) - - def compact_index( - self, col_name: str | None = None, *, expression: str | None = None, name: str | None = None - ) -> blosc2.Index: - """Compact an index, merging any incremental append runs.""" - if self.base is not None: - raise ValueError("Cannot compact an index on a view.") - - from blosc2.indexing import ( - _IN_MEMORY_INDEXES, - _PERSISTENT_INDEXES, - _array_key, - _copy_descriptor, - _default_index_store, - _is_persistent_array, - ) - from blosc2.indexing import compact_index as _ix_compact_index - - lookup_key, descriptor = self._resolve_index_catalog_entry( - col_name, expression=expression, name=name - ) - col_arr = self._index_target_array(lookup_key, descriptor) - catalog = self._storage.load_index_catalog() - - if _is_persistent_array(col_arr): - anchor = self._storage.index_anchor_path(lookup_key) - proxy = _CTableBuildProxy(col_arr, anchor) - proxy_key = _array_key(proxy) - store = 
_default_index_store() - store["indexes"][descriptor["token"]] = descriptor - _PERSISTENT_INDEXES[proxy_key] = store - try: - _ix_compact_index(proxy) - updated_store = _PERSISTENT_INDEXES.get(proxy_key) or store - updated_desc = _copy_descriptor(updated_store["indexes"][descriptor["token"]]) - finally: - _PERSISTENT_INDEXES.pop(proxy_key, None) - updated_desc["built_value_epoch"] = descriptor.get("built_value_epoch", 0) - catalog[lookup_key] = updated_desc - self._storage.save_index_catalog(catalog) - return blosc2.Index._from_table(self, lookup_key, updated_desc) - else: - _ix_compact_index(col_arr) - store = _IN_MEMORY_INDEXES.get(id(col_arr)) - if store: - token = descriptor["token"] - updated_desc = _copy_descriptor(store["indexes"].get(token, descriptor)) - updated_desc["built_value_epoch"] = descriptor.get("built_value_epoch", 0) - catalog[lookup_key] = updated_desc - self._storage.save_index_catalog(catalog) - return blosc2.Index._from_table(self, lookup_key, updated_desc) - return blosc2.Index._from_table(self, lookup_key, descriptor) - - def index( - self, col_name: str | None = None, *, expression: str | None = None, name: str | None = None - ) -> blosc2.Index: - """Return the index handle for a stored-column or expression target.""" - lookup_key, descriptor = self._resolve_index_catalog_entry( - col_name, expression=expression, name=name - ) - return blosc2.Index._from_table(self, lookup_key, descriptor) - - @property - def indexes(self) -> list[blosc2.Index]: - """Return a list of :class:`blosc2.Index` handles for all active indexes.""" - catalog = self._root_table._storage.load_index_catalog() - return [blosc2.Index._from_table(self, col_name, desc) for col_name, desc in catalog.items()] - - def _rewrite_expression_query_for_index( - self, expression: str, operands: dict, target: dict - ) -> str | None: - """Rewrite matching table-expression subtrees to ``_where_x`` for planning.""" - try: - tree = ast.parse(expression, mode="eval") - except 
SyntaxError: - return None - - class _Rewriter(ast.NodeTransformer): - def __init__(self, outer): - self.outer = outer - self.changed = False - - def generic_visit(self, node): - normalized = None - with contextlib.suppress(Exception): - normalized, _ = self.outer._normalize_table_expression_target( - ast.unparse(node), operands - ) - if normalized is not None and normalized.get("expression_key") == target.get( - "expression_key" - ): - self.changed = True - return ast.copy_location(ast.Name(id="_where_x", ctx=ast.Load()), node) - return super().generic_visit(node) - - rewriter = _Rewriter(self) - new_body = rewriter.visit(tree.body) - if not rewriter.changed: - return None - return ast.unparse(new_body) - - def _try_expression_index_where(self, expr_result: blosc2.LazyExpr, catalog: dict) -> np.ndarray | None: - """Attempt to resolve *expr_result* via a direct table expression index.""" - from blosc2.indexing import evaluate_bucket_query, evaluate_segment_query, plan_query - - expression = expr_result.expression - operands = dict(expr_result.operands) - for lookup_key, descriptor in catalog.items(): - target = descriptor.get("target") or {} - if target.get("source") != "expression" or descriptor.get("stale", False): - continue - rewritten = self._rewrite_expression_query_for_index(expression, operands, target) - if rewritten is None: - continue - expr_arr = self._index_target_array(lookup_key, descriptor) - where_dict = {"_where_x": expr_arr} - merged_operands = {"_where_x": expr_arr} - plan = plan_query(rewritten, merged_operands, where_dict) - if not plan.usable: - continue - if plan.exact_positions is not None: - return np.asarray(plan.exact_positions, dtype=np.int64) - if plan.bucket_masks is not None: - _, positions = evaluate_bucket_query( - rewritten, merged_operands, {}, where_dict, plan, return_positions=True - ) - return np.asarray(positions, dtype=np.int64) - if plan.candidate_units is not None and plan.segment_len is not None: - _, positions = 
evaluate_segment_query( - rewritten, merged_operands, {}, where_dict, plan, return_positions=True - ) - return np.asarray(positions, dtype=np.int64) - return None - - @staticmethod - def _evaluate_refine_predicate(col_values, refine_plan) -> np.ndarray: - """Evaluate a single comparison predicated on *col_values*. - - ``refine_plan`` is an :class:`~blosc2.indexing.ExactPredicatePlan` - that carries ``lower`` / ``upper`` bounds and their inclusiveness. - Returns a boolean mask of the same length as *col_values*. - """ - mask = np.ones(len(col_values), dtype=bool) - if refine_plan.lower is not None: - if refine_plan.lower_inclusive: - mask &= col_values >= refine_plan.lower - else: - mask &= col_values > refine_plan.lower - if refine_plan.upper is not None: - if refine_plan.upper_inclusive: - mask &= col_values <= refine_plan.upper - else: - mask &= col_values < refine_plan.upper - return mask - - @staticmethod - def _evaluate_expression_at(expr_result, candidates): - """Evaluate *expr_result* on the operand rows at *candidates*. - - Returns a boolean ``numpy.ndarray`` the same length as *candidates*, - or ``None`` if evaluation fails. - """ - try: - operands = {} - for var_name, arr in expr_result.operands.items(): - sliced = arr[candidates] - if hasattr(sliced, "__array__"): - sliced = np.asarray(sliced) - operands[var_name] = sliced - return blosc2.evaluate(expr_result.expression, operands) - except Exception: - return None - - @staticmethod - def _find_indexed_columns(root_cols, catalog, operands): - """Return live indexed columns referenced by *operands* in expression order. - - Avoid iterating over ``root_cols.items()`` here: for lazy persistent tables - that would open every column just to find the indexed operands. 
- """ - indexed = [] - seen = set() - indexed_arrays = {} - for col_name, descriptor in catalog.items(): - if col_name in root_cols: - indexed_arrays[col_name] = (root_cols[col_name], descriptor) - - for operand in operands.values(): - if not isinstance(operand, blosc2.NDArray): - continue - for col_name, (col_arr, descriptor) in indexed_arrays.items(): - if col_name in seen or col_arr is not operand: - continue - CTable._validate_index_descriptor(col_name, descriptor) - if descriptor.get("stale", False): - continue - indexed.append((col_name, col_arr, descriptor)) - seen.add(col_name) - return indexed - - def _try_index_where(self, expr_result: blosc2.LazyExpr) -> np.ndarray | None: # noqa: C901 - """Attempt to resolve *expr_result* via a column index. - - Returns a 1-D int64 array of physical row positions that satisfy the - predicate, or ``None`` if no usable index was found (caller falls back - to a full scan). - """ - from blosc2.indexing import ( - _IN_MEMORY_INDEXES, - _PERSISTENT_INDEXES, - _array_key, - _default_index_store, - _is_persistent_array, - evaluate_bucket_query, - evaluate_segment_query, - plan_query, - ) - - root = self._root_table - catalog = root._storage.load_index_catalog() - if not catalog: - return None - - positions = self._try_expression_index_where(expr_result, catalog) - if positions is not None: - return positions - - expression = expr_result.expression - operands = dict(expr_result.operands) - - indexed_columns = self._find_indexed_columns(root._cols, catalog, operands) - if not indexed_columns: - return None - - primary_col_name, primary_col_arr, _ = indexed_columns[0] - nullable_indexed = [ - name - for name, _arr, _descriptor in indexed_columns - if getattr(root._schema.columns_by_name[name].spec, "null_value", None) is not None - ] - - # Global null post-filtering is not correct for OR expressions. 
- if nullable_indexed and ("|" in expr_result.expression or " or " in expr_result.expression): - return None - - # Inject every usable table-owned descriptor so plan_query can combine them. - # In .b2z read mode all columns share the same urlpath, so _array_key() - # returns the same key for every column — causing _SIDECAR_HANDLE_CACHE - # collisions across queries. Clear stale handles before each injection so - # the upcoming query always loads the correct sidecar for this column. - from blosc2.indexing import _clear_cached_data - - for _col_name, col_arr, descriptor in indexed_columns: - arr_key = _array_key(col_arr) - if _is_persistent_array(col_arr): - store = _PERSISTENT_INDEXES.get(arr_key) or _default_index_store() - if store["indexes"].get(descriptor["token"]) is not descriptor: - _clear_cached_data(col_arr, descriptor["token"]) - store["indexes"][descriptor["token"]] = descriptor - _PERSISTENT_INDEXES[arr_key] = store - else: - store = _IN_MEMORY_INDEXES.get(id(col_arr)) or _default_index_store() - store["indexes"][descriptor["token"]] = descriptor - _IN_MEMORY_INDEXES[id(col_arr)] = store - - where_dict = {"_where_x": primary_col_arr} - merged_operands = {**operands, "_where_x": primary_col_arr} - - plan = plan_query(expression, merged_operands, where_dict) - if not plan.usable: - return None - - def _exclude_null_positions(positions): - positions = np.asarray(positions, dtype=np.int64) - for name in nullable_indexed: - col = root._schema.columns_by_name[name] - raw = root._cols[name][positions] - nv = getattr(col.spec, "null_value", None) - if isinstance(nv, float) and np.isnan(nv): - keep = ~np.isnan(raw) - else: - keep = raw != nv - positions = positions[keep] - return positions - - if plan.exact_positions is not None: - return _exclude_null_positions(plan.exact_positions) - - if plan.partial_exact_positions is not None: - # Cross-column refinement: the FULL index on one column gave us - # exact positions, but the expression has additional predicates 
on - # other columns. Refinement reads every operand column at those - # candidate positions using sparse/fancy indexing. For compressed - # columns this can touch many chunks and be slower than the regular - # sequential miniexpr scan, which is very fast for simple predicates. - # Keep this intentionally conservative until sparse gathers become - # cheaper or the planner has a richer cost model. - max_sparse_refine_candidates = 1024 - candidates = np.asarray(plan.partial_exact_positions, dtype=np.int64) - if len(candidates) > max_sparse_refine_candidates: - return None - candidates = _exclude_null_positions(candidates) - restricted = self._evaluate_expression_at(expr_result, candidates) - if restricted is not None and restricted.dtype == np.bool_: - refined = candidates[np.asarray(restricted, dtype=bool)] - return _exclude_null_positions(refined) - # Fall through to full scan if refinement fails - - if plan.bucket_masks is not None: - # When bucket pruning covers all units (100 % of chunks are - # candidates), the per‑chunk evaluation overhead outweighs the - # benefit over a plain scan. Fall back to the scan path. - if plan.total_units > 0 and plan.selected_units >= plan.total_units: - return None - _, positions = evaluate_bucket_query( - expression, merged_operands, {}, where_dict, plan, return_positions=True - ) - return _exclude_null_positions(positions) - - if plan.candidate_units is not None and plan.segment_len is not None: - # When segment summaries prune fewer than half the candidate - # units, the per‑segment evaluation overhead outweighs a plain - # scan. Fall back to the scan path. 
- if plan.total_units > 0 and plan.selected_units / plan.total_units > 0.5: - return None - _, positions = evaluate_segment_query( - expression, merged_operands, {}, where_dict, plan, return_positions=True - ) - return _exclude_null_positions(positions) - - return None - @property def info_items(self) -> list[tuple[str, object]]: """Structured summary items used by :meth:`info`.""" @@ -10308,12 +9552,7 @@ def where( if isinstance(expr_result, blosc2.LazyExpr): positions = self._try_index_where(expr_result) if positions is not None: - total = len(self._valid_rows) - mask = np.zeros(total, dtype=bool) - valid_pos = positions[(positions >= 0) & (positions < total)] - mask[valid_pos] = True - mask &= self._valid_rows[:] - result = self.view(blosc2.asarray(mask)) + result = self._view_from_positions(positions) return result if columns is None else result.select(list(columns)) target_len = len(self._valid_rows) @@ -10321,7 +9560,12 @@ def where( all_rows_valid = known_n_rows == target_len filter_intersected = False - filter = expr_result.compute() if isinstance(expr_result, blosc2.LazyExpr) else expr_result + # Prefer a compressed boolean mask for LazyExpr filters so temporary + # mask materialization stays compact even for medium-sized selections. 
+ if isinstance(expr_result, blosc2.LazyExpr): + filter = expr_result.compute() + else: + filter = expr_result if getattr(filter, "ndim", 1) != 1: raise ValueError( @@ -10347,7 +9591,10 @@ def where( filter_intersected = False if not filter_intersected and not all_rows_valid: - filter = (filter & self._valid_rows).compute() + if isinstance(filter, np.ndarray): + filter &= self._valid_rows[:] + else: + filter = (filter & self._valid_rows).compute() result = self.view(filter) return result if columns is None else result.select(list(columns)) diff --git a/src/blosc2/ctable_indexing.py b/src/blosc2/ctable_indexing.py new file mode 100644 index 000000000..b53324837 --- /dev/null +++ b/src/blosc2/ctable_indexing.py @@ -0,0 +1,1089 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### + +"""Indexing support mixed into :class:`blosc2.CTable`.""" + +from __future__ import annotations + +import ast +import contextlib +import os +from typing import TYPE_CHECKING, Any + +import numpy as np + +import blosc2 +from blosc2 import compute_chunks_blocks +from blosc2.schema import ( + DictionarySpec, + ListSpec, + NDArraySpec, + ObjectSpec, + StructSpec, + VLBytesSpec, + VLStringSpec, +) + +if TYPE_CHECKING: + from blosc2.ctable import CTable + + +class _FakeVlMeta: + """Minimal vlmeta stand-in that accepts writes without touching a real SChunk.""" + + def __init__(self): + self._data: dict = {} + + def __getitem__(self, key): + return self._data[key] + + def __setitem__(self, key, value): + self._data[key] = value + + def get(self, key, default=None): + return self._data.get(key, default) + + +class _FakeSchunk: + """Minimal SChunk stand-in whose vlmeta stores in memory.""" + + def 
__init__(self): + self.vlmeta = _FakeVlMeta() + + +class _CTableBuildProxy: + """Minimal shim that lets the ``indexing`` module build sidecars for a + CTable column without touching the column's own ``schunk.vlmeta``. + + Attributes mirror those required by the internal build functions: + ``urlpath``, ``schunk``, ``shape``, ``ndim``, ``dtype``, ``chunks``, + ``blocks``, and item access via ``__getitem__``. + """ + + def __init__(self, col_array: blosc2.NDArray, anchor_urlpath: str | None) -> None: + self._col_array = col_array + self.urlpath = anchor_urlpath # controls sidecar placement + self.schunk = _FakeSchunk() + self.shape = col_array.shape + self.ndim = col_array.ndim + self.dtype = col_array.dtype + self.chunks = col_array.chunks + self.blocks = col_array.blocks + + def __getitem__(self, key): + return self._col_array[key] + + +class _CTableIndexingMixin: + # Cost-model constants for cross-column index refinement. + # Calibrated from profiling with sparse-gather optimisations. + # _GATHER_COST_MS_PER_1K_ITEMS_PER_OP ≈ ms to sparse-gather 1000 items from one operand column + # _SCAN_COST_MS_PER_1M_ROWS ≈ ms to miniexpr-scan 1 million rows + # If refinement cost exceeds scan cost, fall back to a full scan. 
+ _GATHER_COST_MS_PER_1K_ITEMS_PER_OP: float = 3.5 + _SCAN_COST_MS_PER_1M_ROWS: float = 4.3 + + @property + def _root_table(self) -> CTable: + """Return the root (non-view) table; *self* if not a view.""" + t = self + while t.base is not None: + t = t.base + return t + + def _invalidate_index_catalog_cache(self) -> None: + root = self._root_table + root._cached_index_catalog = None + root._cached_index_catalog_revision = None + + def _get_index_catalog(self) -> dict: + root = self._root_table + revision = root._storage.index_catalog_revision() + catalog = getattr(root, "_cached_index_catalog", None) + if catalog is None or getattr(root, "_cached_index_catalog_revision", None) != revision: + catalog = root._storage.load_index_catalog() + root._cached_index_catalog = catalog + root._cached_index_catalog_revision = revision + return catalog + + def _mark_all_indexes_stale(self) -> None: + """Bump value_epoch and mark every catalog entry stale on the root table.""" + root = self._root_table + root._storage.bump_value_epoch() + catalog = root._get_index_catalog() + if not catalog: + return + changed = False + for desc in catalog.values(): + if not desc.get("stale", False): + desc["stale"] = True + changed = True + if changed: + root._storage.save_index_catalog(catalog) + root._invalidate_index_catalog_cache() + + @staticmethod + def _validate_index_descriptor(col_name: str, descriptor: dict) -> None: + """Raise ValueError when an index catalog entry is malformed.""" + if not isinstance(descriptor, dict): + raise ValueError(f"Malformed index metadata for column {col_name!r}: descriptor must be a dict.") + token = descriptor.get("token") + if not isinstance(token, str) or not token: + raise ValueError(f"Malformed index metadata for column {col_name!r}: missing token.") + kind = descriptor.get("kind") + if kind not in {"summary", "bucket", "partial", "full", "opsi"}: + raise ValueError(f"Malformed index metadata for column {col_name!r}: invalid kind {kind!r}.") + if kind 
== "bucket" and not isinstance(descriptor.get("bucket"), dict): + raise ValueError(f"Malformed index metadata for column {col_name!r}: missing bucket payload.") + if kind == "partial" and not isinstance(descriptor.get("partial"), dict): + raise ValueError(f"Malformed index metadata for column {col_name!r}: missing partial payload.") + if kind == "full" and not isinstance(descriptor.get("full"), dict): + raise ValueError(f"Malformed index metadata for column {col_name!r}: missing full payload.") + + def _drop_index_descriptor(self, col_name: str, descriptor: dict) -> None: + """Delete sidecars/cache for a catalog descriptor without touching the column mapping.""" + from pathlib import Path + + from blosc2.indexing import ( + _IN_MEMORY_INDEXES, + _PERSISTENT_INDEXES, + _array_key, + _clear_cached_data, + _drop_descriptor_sidecars, + _is_persistent_array, + ) + + token = descriptor["token"] + col_arr = None + with contextlib.suppress(Exception): + col_arr = self._index_target_array(col_name, descriptor) + + if col_arr is not None: + _clear_cached_data(col_arr, token) + + if col_arr is not None and _is_persistent_array(col_arr): + arr_key = _array_key(col_arr) + store = _PERSISTENT_INDEXES.get(arr_key) + if store is not None: + store["indexes"].pop(token, None) + elif col_arr is not None: + store = _IN_MEMORY_INDEXES.get(id(col_arr)) + if store is not None: + store["indexes"].pop(token, None) + + _drop_descriptor_sidecars(descriptor) + self._root_table._expr_index_arrays.pop(token, None) + + expr_values_path = descriptor.get("expr_values_path") + if expr_values_path is not None: + with contextlib.suppress(OSError): + os.remove(expr_values_path) + + anchor = self._storage.index_anchor_path(col_name) + if anchor is not None: + proxy_key = ("persistent", str(Path(anchor).resolve())) + _PERSISTENT_INDEXES.pop(proxy_key, None) + with contextlib.suppress(OSError): + os.rmdir(os.path.dirname(anchor)) + + def _index_create_kwargs_from_descriptor(self, descriptor: dict) -> 
dict[str, Any]: + """Return create_index kwargs that rebuild an existing descriptor.""" + build = "ooc" if bool(descriptor.get("ooc", False)) else "memory" + kwargs = { + "kind": descriptor["kind"], + "optlevel": int(descriptor.get("optlevel", 5)), + "name": descriptor.get("name") or None, + "build": build, + "cparams": descriptor.get("cparams"), + } + if descriptor.get("kind") == "full": + kwargs["method"] = descriptor.get("full", {}).get("build_method", "global-sort") + if descriptor.get("kind") == "opsi": + kwargs["opsi_max_cycles"] = descriptor.get("opsi", {}).get("max_cycles") + target = descriptor.get("target") or {} + if target.get("source") == "expression": + kwargs["expression"] = target.get("expression") + return kwargs + + def _normalize_table_expression_target( + self, expression: str, operands: dict | None = None + ) -> tuple[dict, np.dtype]: + """Normalize a same-table expression target and infer its dtype.""" + if operands is None: + operands = self._cols + try: + ast.parse(expression, mode="eval") + except SyntaxError as exc: + raise ValueError("expression is not valid Python syntax") from exc + + owned_ids = {id(arr): name for name, arr in self._root_table._cols.items()} + dependencies: list[str] = [] + valid = True + + class _Canonicalizer(ast.NodeTransformer): + def visit_Name(self_inner, node: ast.Name) -> ast.AST: + nonlocal valid + operand = operands.get(node.id) + if operand is None or not isinstance(operand, blosc2.NDArray): + return node + cname = owned_ids.get(id(operand)) + if cname is None: + valid = False + return node + dependencies.append(cname) + return ast.copy_location(ast.Name(id=cname, ctx=node.ctx), node) + + normalized = _Canonicalizer().visit( + ast.fix_missing_locations(ast.parse(expression, mode="eval")).body + ) + if not valid or not dependencies: + raise ValueError("expression indexes require operands from stored columns of the same table") + dependencies = list(dict.fromkeys(dependencies)) + expression_key = 
ast.unparse(normalized) + lazy = blosc2.lazyexpr(expression_key, {dep: self._root_table._cols[dep] for dep in dependencies}) + sample_stop = min( + len(self._root_table._valid_rows), max(1, int(self._root_table._valid_rows.blocks[0])) + ) + sample = lazy[:sample_stop] + if isinstance(sample, blosc2.NDArray): + sample = sample[:] + sample = np.asarray(sample) + dtype = np.dtype(sample.dtype) + if sample.ndim != 1: + raise ValueError("expression indexes require expressions returning a 1-D scalar stream") + target = { + "source": "expression", + "expression": expression, + "expression_key": expression_key, + "dependencies": dependencies, + } + return target, dtype + + def _expression_index_values_path(self, token: str) -> str | None: + anchor = self._storage.index_anchor_path(token) + if anchor is None: + return None + return os.path.join(os.path.dirname(anchor), "values.b2nd") + + def _build_expression_values_array(self, target: dict, dtype: np.dtype, cparams=None) -> blosc2.NDArray: + """Build a physical 1-D values array for a table expression target.""" + from blosc2.indexing import _target_token + + root = self._root_table + capacity = len(root._valid_rows) + chunks, blocks = compute_chunks_blocks((capacity,), dtype=dtype) + urlpath = root._expression_index_values_path(_target_token(target)) + if urlpath is not None: + os.makedirs(os.path.dirname(urlpath), exist_ok=True) + arr = blosc2.zeros( + (capacity,), dtype=dtype, urlpath=urlpath, mode="w", chunks=chunks, blocks=blocks + ) + else: + arr = blosc2.zeros((capacity,), dtype=dtype, chunks=chunks, blocks=blocks) + lazy = blosc2.lazyexpr( + target["expression_key"], {dep: root._cols[dep] for dep in target["dependencies"]} + ) + step = int(root._valid_rows.chunks[0]) if root._valid_rows.chunks else 65536 + for start in range(0, capacity, step): + stop = min(start + step, capacity) + values = lazy[start:stop] + if isinstance(values, blosc2.NDArray): + values = values[:] + arr[start:stop] = np.asarray(values, 
dtype=dtype) + root._expr_index_arrays[_target_token(target)] = arr + return arr + + def _index_target_array(self, lookup_key: str, descriptor: dict) -> blosc2.NDArray: + """Return the physical array backing a column or expression index.""" + target = descriptor.get("target") or {} + if target.get("source") != "expression": + return self._root_table._cols[lookup_key] + token = descriptor["token"] + root = self._root_table + arr = root._expr_index_arrays.get(token) + if arr is not None: + return arr + path = descriptor.get("expr_values_path") + if path is None: + raise KeyError(f"No backing array found for expression index {token!r}.") + arr = blosc2.open(path, mode="r" if root._read_only else "a") + root._expr_index_arrays[token] = arr + return arr + + def _resolve_index_catalog_entry( + self, col_name: str | None = None, *, expression: str | None = None, name: str | None = None + ) -> tuple[str, dict]: + """Resolve an index catalog entry by column, expression, or label.""" + catalog = self._root_table._get_index_catalog() + if col_name is not None and expression is not None: + raise ValueError("col_name and expression are mutually exclusive") + if col_name is not None: + col_name = self._logical_to_physical_name(col_name) + if col_name not in catalog: + raise KeyError(f"No index found for column {col_name!r}.") + return col_name, catalog[col_name] + if expression is not None: + from blosc2.indexing import _target_token + + target, _ = self._normalize_table_expression_target(expression) + token = _target_token(target) + if token not in catalog: + raise KeyError(f"No index found for expression {expression!r}.") + return token, catalog[token] + if name is not None: + matches = [(key, desc) for key, desc in catalog.items() if desc.get("name") == name] + if not matches: + raise KeyError(f"No index found with name {name!r}.") + if len(matches) > 1: + raise ValueError(f"Multiple indexes found with name {name!r}; specify a target explicitly.") + return matches[0] + raise 
TypeError("must specify col_name, expression, or name") + + def _build_index_persistent( + self, + col_name: str, + col_arr: blosc2.NDArray, + *, + kind: str, + optlevel: int, + name_hint: str | None, + build: str, + tmpdir: str | None, + cparams_obj, + method: str | None = None, + opsi_max_cycles: int | None = None, + ) -> dict: + """Build index sidecar files for a persistent-table column; return the descriptor.""" + import tempfile + from pathlib import Path + + from blosc2.indexing import ( + _PERSISTENT_INDEXES, + _array_key, + _build_bucket_descriptor, + _build_bucket_descriptor_ooc, + _build_descriptor, + _build_full_descriptor, + _build_full_descriptor_ooc, + _build_levels_descriptor, + _build_levels_descriptor_ooc, + _build_opsi_descriptor, + _build_partial_descriptor, + _build_partial_descriptor_ooc, + _copy_descriptor, + _field_target_descriptor, + _resolve_full_index_tmpdir, + _resolve_ooc_mode, + _target_token, + _values_for_target, + ) + + anchor = self._storage.index_anchor_path(col_name) + os.makedirs(os.path.dirname(anchor), exist_ok=True) + proxy = _CTableBuildProxy(col_arr, anchor) + proxy_key = _array_key(proxy) + _PERSISTENT_INDEXES.pop(proxy_key, None) # clear any stale cache entry + + target = _field_target_descriptor(None) + token = _target_token(target) + persistent = True + dtype = col_arr.dtype + use_ooc = _resolve_ooc_mode(kind, build) + if opsi_max_cycles is None: + opsi_max_cycles = max(1, optlevel if optlevel < 8 else optlevel * 2) + + if use_ooc: + resolved_tmpdir = _resolve_full_index_tmpdir(proxy, tmpdir) + levels = _build_levels_descriptor_ooc(proxy, target, token, kind, dtype, persistent, cparams_obj) + bucket = ( + _build_bucket_descriptor_ooc( + proxy, target, token, kind, dtype, optlevel, persistent, cparams_obj + ) + if kind == "bucket" + else None + ) + partial = ( + _build_partial_descriptor_ooc( + proxy, target, token, kind, dtype, optlevel, persistent, cparams_obj + ) + if kind == "partial" + else None + ) + full = None + 
opsi = None + if kind == "full": + with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-", dir=resolved_tmpdir) as td: + full = _build_full_descriptor_ooc( + proxy, target, token, kind, dtype, persistent, Path(td), cparams_obj, optlevel + ) + full["build_method"] = "global-sort" + if kind == "opsi": + opsi = _build_opsi_descriptor( + proxy, target, token, kind, dtype, persistent, cparams_obj, opsi_max_cycles, optlevel + ) + descriptor = _build_descriptor( + proxy, + target, + token, + kind, + optlevel, + persistent, + True, + name_hint, + dtype, + levels, + bucket, + partial, + full, + cparams_obj, + opsi, + ) + else: + values = _values_for_target(proxy, target) + levels = _build_levels_descriptor( + proxy, target, token, kind, dtype, values, persistent, cparams_obj + ) + bucket = ( + _build_bucket_descriptor(proxy, token, kind, values, optlevel, persistent, cparams_obj) + if kind == "bucket" + else None + ) + partial = ( + _build_partial_descriptor(proxy, token, kind, values, optlevel, persistent, cparams_obj) + if kind == "partial" + else None + ) + full = None + opsi = None + if kind == "full": + full = _build_full_descriptor(proxy, token, kind, values, persistent, cparams_obj, optlevel) + full["build_method"] = "global-sort" + if kind == "opsi": + opsi = _build_opsi_descriptor( + proxy, target, token, kind, dtype, persistent, cparams_obj, opsi_max_cycles, optlevel + ) + descriptor = _build_descriptor( + proxy, + target, + token, + kind, + optlevel, + persistent, + False, + name_hint, + dtype, + levels, + bucket, + partial, + full, + cparams_obj, + opsi, + ) + + result = _copy_descriptor(descriptor) + _PERSISTENT_INDEXES.pop(proxy_key, None) # evict proxy to avoid memory leak + return result + + def create_index( # noqa: C901 + self, + col_name: str | None = None, + *, + field: str | None = None, + expression: str | None = None, + operands: dict | None = None, + kind: blosc2.IndexKind = blosc2.IndexKind.BUCKET, + optlevel: int = 5, + name: str | None = 
None, + build: str = "auto", + tmpdir: str | None = None, + **kwargs, + ) -> blosc2.Index: + """Build and register an index for a stored column or table expression. + + For tables with **nested (dotted) column names**, pass the dotted leaf + name directly:: + + t.create_index("trip.begin.lon") + t.where("trip.begin.lon > -87.7").nrows # index is used automatically + + .. rubric:: Choosing an index kind + + ``BUCKET`` (the default) is the cheapest to build and store. + It accelerates single‑column ``where`` queries and ``sort_by`` + reuse with approximate ordering derived from value + quantization. Sufficient for most workloads. + + ``FULL`` builds a globally sorted index that returns exact + row positions for any range predicate. It enables the + **cross‑column refinement** planner path: when a multi‑column + conjunction such as ``(tips > 100) & (km > 0) & (sec > 0)`` + indexes only the most selective column, the planner obtains + compact exact positions from ``FULL`` and evaluates the + remaining predicates on just those rows. ``FULL`` is also + ideal for ``sort_by`` reuse because it carries a complete + sort order. + + ``PARTIAL`` builds a chunk‑local sorted payload with segment + navigation. It is cheaper to build than ``FULL`` (roughly + half the raw storage) while still providing exact positions + for cross‑column refinement. Its exact positions are most + compact for equality or narrow range queries; wide ranges + may scan proportionally more candidate segments. + + ``OPSI`` is a specialised tier for approximate ordering; + prefer ``FULL`` when a globally sorted ordered index is + needed to accelerate ``sort_by``. + + ``SUMMARY`` stores only per‑segment min/max and is the + lightest kind; it may still skip chunks for broad range + queries but cannot accelerate ``sort_by``. 
+ """ + if self.base is not None: + raise ValueError("Cannot create an index on a view.") + if col_name is not None and field is not None: + raise ValueError("col_name and field are mutually exclusive") + if expression is not None and (col_name is not None or field is not None): + raise ValueError("column targets and expression are mutually exclusive") + if operands is not None and expression is None: + raise ValueError("operands can only be provided together with expression") + col_name = field if field is not None else col_name + if col_name is not None: + col_name = self._logical_to_physical_name(col_name) + + from blosc2.indexing import ( + _IN_MEMORY_INDEXES, + _copy_descriptor, + _normalize_build_mode, + _normalize_full_build_method, + _normalize_index_cparams, + _normalize_index_kind, + _target_token, + ) + from blosc2.indexing import create_index as _ix_create_index + + cparams_obj = _normalize_index_cparams(kwargs.pop("cparams", None)) + method = kwargs.pop("method", None) + opsi_max_cycles = kwargs.pop("opsi_max_cycles", None) + if opsi_max_cycles is not None: + opsi_max_cycles = max(1, int(opsi_max_cycles)) + if kwargs: + raise TypeError(f"unexpected keyword argument(s): {', '.join(sorted(kwargs))}") + + kind_str = _normalize_index_kind(kind) + build_str = _normalize_build_mode(build) + method_str = _normalize_full_build_method(method) if kind_str == "full" else None + if method is not None and kind_str != "full": + raise ValueError("method is only supported for kind=IndexKind.FULL") + catalog = self._get_index_catalog() + + if expression is not None: + target, dtype = self._normalize_table_expression_target(expression, operands) + token = _target_token(target) + if token in catalog: + raise ValueError( + f"Index already exists for expression {expression!r}. " + "Call rebuild_index() to replace it or drop_index() first." 
+ ) + expr_arr = self._build_expression_values_array(target, dtype, cparams=cparams_obj) + _ix_create_index( + expr_arr, + kind=blosc2.IndexKind(kind_str), + optlevel=optlevel, + name=name, + build=build, + tmpdir=tmpdir, + cparams=cparams_obj, + method=method_str, + opsi_max_cycles=opsi_max_cycles, + ) + store = _IN_MEMORY_INDEXES.get(id(expr_arr)) + if store is None: + from blosc2.indexing import _load_store + + store = _load_store(expr_arr) + descriptor = _copy_descriptor(store["indexes"]["__self__"]) + descriptor["target"] = target + descriptor["token"] = token + descriptor["dtype"] = str(np.dtype(dtype)) + descriptor["expr_values_path"] = getattr(expr_arr, "urlpath", None) + value_epoch, _ = self._storage.get_epoch_counters() + descriptor["built_value_epoch"] = value_epoch + catalog[token] = descriptor + self._storage.save_index_catalog(catalog) + self._invalidate_index_catalog_cache() + return blosc2.Index._from_table(self, token, descriptor) + + if col_name is None: + raise TypeError("must specify col_name/field or expression") + if col_name in self._computed_cols: + raise ValueError( + f"Cannot create an index on computed column {col_name!r}: " + "computed columns have no physical storage." + ) + if col_name not in self._cols: + raise KeyError(f"No column named {col_name!r}. Available: {self.col_names}") + self._ensure_generated_column_not_stale(col_name) + if col_name in catalog: + raise ValueError( + f"Index already exists for column {col_name!r}. " + "Call rebuild_index() to replace it or drop_index() first." + ) + + col_arr = self._cols[col_name] + if isinstance(self._schema.columns_by_name[col_name].spec, NDArraySpec): + spec = self._schema.columns_by_name[col_name].spec + raise ValueError( + f"Cannot create an index on ndarray column {col_name!r} with per-row shape {spec.item_shape}. " + "Materialize a scalar generated column first, e.g. embedding_norm or embedding_max." 
+ ) + if isinstance(self._schema.columns_by_name[col_name].spec, ListSpec): + raise ValueError(f"Cannot create an index on list column {col_name!r} in V1.") + if isinstance( + self._schema.columns_by_name[col_name].spec, (VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec) + ): + raise NotImplementedError( + f"Cannot create an index on variable-length scalar column {col_name!r}: " + "indexing for vlstring/vlbytes/struct/object columns is not supported yet." + ) + # Dictionary columns: index the underlying int32 codes array. + is_dictionary = isinstance(self._schema.columns_by_name[col_name].spec, DictionarySpec) + if is_dictionary: + col_arr = col_arr.codes # index the int32 codes NDArray + is_persistent = self._storage.index_anchor_path(col_name) is not None + + if is_persistent: + descriptor = self._build_index_persistent( + col_name, + col_arr, + kind=kind_str, + optlevel=optlevel, + name_hint=name, + build=build_str, + tmpdir=tmpdir, + cparams_obj=cparams_obj, + method=method_str, + opsi_max_cycles=opsi_max_cycles, + ) + else: + _ix_create_index( + col_arr, + field=None, + kind=blosc2.IndexKind(kind_str), + optlevel=optlevel, + name=name, + build=build, + tmpdir=tmpdir, + cparams=cparams_obj, + method=method_str, + opsi_max_cycles=opsi_max_cycles, + ) + store = _IN_MEMORY_INDEXES[id(col_arr)] + descriptor = _copy_descriptor(store["indexes"]["__self__"]) + + value_epoch, _ = self._storage.get_epoch_counters() + descriptor["built_value_epoch"] = value_epoch + + catalog = self._get_index_catalog() + catalog[col_name] = descriptor + self._storage.save_index_catalog(catalog) + self._invalidate_index_catalog_cache() + return blosc2.Index._from_table(self, col_name, descriptor) + + def drop_index( + self, col_name: str | None = None, *, expression: str | None = None, name: str | None = None + ) -> None: + """Remove an index and delete any sidecar files.""" + if self.base is not None: + raise ValueError("Cannot drop an index from a view.") + + lookup_key, descriptor = 
self._resolve_index_catalog_entry( + col_name, expression=expression, name=name + ) + catalog = self._get_index_catalog() + catalog.pop(lookup_key, None) + self._validate_index_descriptor(lookup_key, descriptor) + self._drop_index_descriptor(lookup_key, descriptor) + self._storage.save_index_catalog(catalog) + self._invalidate_index_catalog_cache() + + def rebuild_index( + self, col_name: str | None = None, *, expression: str | None = None, name: str | None = None + ) -> blosc2.Index: + """Drop and recreate an index with the same parameters.""" + if self.base is not None: + raise ValueError("Cannot rebuild an index on a view.") + + lookup_key, old_desc = self._resolve_index_catalog_entry(col_name, expression=expression, name=name) + self._validate_index_descriptor(lookup_key, old_desc) + create_kwargs = self._index_create_kwargs_from_descriptor(old_desc) + + self.drop_index(col_name, expression=expression, name=name) + if "expression" in create_kwargs: + return self.create_index(expression=create_kwargs.pop("expression"), **create_kwargs) + return self.create_index(lookup_key, **create_kwargs) + + def compact_index( + self, col_name: str | None = None, *, expression: str | None = None, name: str | None = None + ) -> blosc2.Index: + """Compact an index, merging any incremental append runs.""" + if self.base is not None: + raise ValueError("Cannot compact an index on a view.") + + from blosc2.indexing import ( + _IN_MEMORY_INDEXES, + _PERSISTENT_INDEXES, + _array_key, + _copy_descriptor, + _default_index_store, + _is_persistent_array, + ) + from blosc2.indexing import compact_index as _ix_compact_index + + lookup_key, descriptor = self._resolve_index_catalog_entry( + col_name, expression=expression, name=name + ) + col_arr = self._index_target_array(lookup_key, descriptor) + catalog = self._get_index_catalog() + + if _is_persistent_array(col_arr): + anchor = self._storage.index_anchor_path(lookup_key) + proxy = _CTableBuildProxy(col_arr, anchor) + proxy_key = 
_array_key(proxy) + store = _default_index_store() + store["indexes"][descriptor["token"]] = descriptor + _PERSISTENT_INDEXES[proxy_key] = store + try: + _ix_compact_index(proxy) + updated_store = _PERSISTENT_INDEXES.get(proxy_key) or store + updated_desc = _copy_descriptor(updated_store["indexes"][descriptor["token"]]) + finally: + _PERSISTENT_INDEXES.pop(proxy_key, None) + updated_desc["built_value_epoch"] = descriptor.get("built_value_epoch", 0) + catalog[lookup_key] = updated_desc + self._storage.save_index_catalog(catalog) + self._invalidate_index_catalog_cache() + return blosc2.Index._from_table(self, lookup_key, updated_desc) + else: + _ix_compact_index(col_arr) + store = _IN_MEMORY_INDEXES.get(id(col_arr)) + if store: + token = descriptor["token"] + updated_desc = _copy_descriptor(store["indexes"].get(token, descriptor)) + updated_desc["built_value_epoch"] = descriptor.get("built_value_epoch", 0) + catalog[lookup_key] = updated_desc + self._storage.save_index_catalog(catalog) + self._invalidate_index_catalog_cache() + return blosc2.Index._from_table(self, lookup_key, updated_desc) + return blosc2.Index._from_table(self, lookup_key, descriptor) + + def index( + self, col_name: str | None = None, *, expression: str | None = None, name: str | None = None + ) -> blosc2.Index: + """Return the index handle for a stored-column or expression target.""" + lookup_key, descriptor = self._resolve_index_catalog_entry( + col_name, expression=expression, name=name + ) + return blosc2.Index._from_table(self, lookup_key, descriptor) + + @property + def indexes(self) -> list[blosc2.Index]: + """Return a list of :class:`blosc2.Index` handles for all active indexes.""" + catalog = self._root_table._get_index_catalog() + return [blosc2.Index._from_table(self, col_name, desc) for col_name, desc in catalog.items()] + + def _rewrite_expression_query_for_index( + self, expression: str, operands: dict, target: dict + ) -> str | None: + """Rewrite matching table-expression subtrees 
to ``_where_x`` for planning.""" + try: + tree = ast.parse(expression, mode="eval") + except SyntaxError: + return None + + class _Rewriter(ast.NodeTransformer): + def __init__(self, outer): + self.outer = outer + self.changed = False + + def generic_visit(self, node): + normalized = None + with contextlib.suppress(Exception): + normalized, _ = self.outer._normalize_table_expression_target( + ast.unparse(node), operands + ) + if normalized is not None and normalized.get("expression_key") == target.get( + "expression_key" + ): + self.changed = True + return ast.copy_location(ast.Name(id="_where_x", ctx=ast.Load()), node) + return super().generic_visit(node) + + rewriter = _Rewriter(self) + new_body = rewriter.visit(tree.body) + if not rewriter.changed: + return None + return ast.unparse(new_body) + + def _try_expression_index_where(self, expr_result: blosc2.LazyExpr, catalog: dict) -> np.ndarray | None: + """Attempt to resolve *expr_result* via a direct table expression index.""" + from blosc2.indexing import evaluate_bucket_query, evaluate_segment_query, plan_query + + expression = expr_result.expression + operands = dict(expr_result.operands) + for lookup_key, descriptor in catalog.items(): + target = descriptor.get("target") or {} + if target.get("source") != "expression" or descriptor.get("stale", False): + continue + rewritten = self._rewrite_expression_query_for_index(expression, operands, target) + if rewritten is None: + continue + expr_arr = self._index_target_array(lookup_key, descriptor) + where_dict = {"_where_x": expr_arr} + merged_operands = {"_where_x": expr_arr} + plan = plan_query(rewritten, merged_operands, where_dict) + if not plan.usable: + continue + if plan.exact_positions is not None: + return np.asarray(plan.exact_positions, dtype=np.int64) + if plan.bucket_masks is not None: + _, positions = evaluate_bucket_query( + rewritten, merged_operands, {}, where_dict, plan, return_positions=True + ) + return np.asarray(positions, dtype=np.int64) + if 
plan.candidate_units is not None and plan.segment_len is not None: + _, positions = evaluate_segment_query( + rewritten, merged_operands, {}, where_dict, plan, return_positions=True + ) + return np.asarray(positions, dtype=np.int64) + return None + + @staticmethod + def _evaluate_expression_at(expr_result, candidates, *, prefetched: dict | None = None): + """Evaluate *expr_result* on the operand rows at *candidates*. + + Returns a boolean ``numpy.ndarray`` the same length as *candidates*, + or ``None`` if evaluation fails. + + Parameters + ---------- + prefetched: + Optional dict mapping operand variable names to already-gathered + NumPy arrays. When provided, those operands are reused instead of + re-read from storage. + """ + try: + operands = {} + for var_name, arr in expr_result.operands.items(): + if prefetched is not None and var_name in prefetched: + sliced = prefetched[var_name] + else: + sliced = arr[candidates] + if hasattr(sliced, "__array__"): + sliced = np.asarray(sliced) + operands[var_name] = sliced + return blosc2.evaluate(expr_result.expression, operands) + except Exception: + return None + + @staticmethod + def _find_indexed_columns(root_cols, catalog, operands): + """Return live indexed columns referenced by *operands* in expression order. + + Avoid iterating over ``root_cols.items()`` here: for lazy persistent tables + that would open every column just to find the indexed operands. 
+ """ + indexed = [] + seen = set() + indexed_arrays = {} + for col_name, descriptor in catalog.items(): + if col_name in root_cols: + indexed_arrays[col_name] = (root_cols[col_name], descriptor) + + for operand in operands.values(): + if not isinstance(operand, blosc2.NDArray): + continue + for col_name, (col_arr, descriptor) in indexed_arrays.items(): + if col_name in seen or col_arr is not operand: + continue + _CTableIndexingMixin._validate_index_descriptor(col_name, descriptor) + if descriptor.get("stale", False): + continue + indexed.append((col_name, col_arr, descriptor)) + seen.add(col_name) + return indexed + + def _try_index_where(self, expr_result: blosc2.LazyExpr) -> np.ndarray | None: # noqa: C901 + """Attempt to resolve *expr_result* via a column index. + + Returns a 1-D int64 array of physical row positions that satisfy the + predicate, or ``None`` if no usable index was found (caller falls back + to a full scan). + """ + from blosc2.indexing import ( + _IN_MEMORY_INDEXES, + _PERSISTENT_INDEXES, + _array_key, + _default_index_store, + _is_persistent_array, + evaluate_bucket_query, + evaluate_segment_query, + plan_query, + ) + + root = self._root_table + catalog = root._get_index_catalog() + if not catalog: + return None + + positions = self._try_expression_index_where(expr_result, catalog) + if positions is not None: + return positions + + expression = expr_result.expression + operands = dict(expr_result.operands) + + indexed_columns = self._find_indexed_columns(root._cols, catalog, operands) + if not indexed_columns: + return None + + primary_col_name, primary_col_arr, _ = indexed_columns[0] + nullable_indexed = [ + name + for name, _arr, _descriptor in indexed_columns + if getattr(root._schema.columns_by_name[name].spec, "null_value", None) is not None + ] + + # Global null post-filtering is not correct for OR expressions. 
+ if nullable_indexed and ("|" in expr_result.expression or " or " in expr_result.expression): + return None + + # Inject every usable table-owned descriptor so plan_query can combine them. + # In .b2z read mode all columns share the same urlpath, so _array_key() + # returns the same key for every column — causing _SIDECAR_HANDLE_CACHE + # collisions across queries. Clear stale handles before each injection so + # the upcoming query always loads the correct sidecar for this column. + from blosc2.indexing import _clear_cached_data + + for _col_name, col_arr, descriptor in indexed_columns[:1]: + arr_key = _array_key(col_arr) + if _is_persistent_array(col_arr): + store = _PERSISTENT_INDEXES.get(arr_key) or _default_index_store() + if store["indexes"].get(descriptor["token"]) is not descriptor: + _clear_cached_data(col_arr, descriptor["token"]) + store["indexes"][descriptor["token"]] = descriptor + _PERSISTENT_INDEXES[arr_key] = store + else: + store = _IN_MEMORY_INDEXES.get(id(col_arr)) or _default_index_store() + store["indexes"][descriptor["token"]] = descriptor + _IN_MEMORY_INDEXES[id(col_arr)] = store + + where_dict = {"_where_x": primary_col_arr} + merged_operands = {**operands, "_where_x": primary_col_arr} + + plan = plan_query(expression, merged_operands, where_dict) + if not plan.usable: + return None + + def _exclude_null_positions(positions): + positions = np.asarray(positions, dtype=np.int64) + for name in nullable_indexed: + col = root._schema.columns_by_name[name] + raw = root._cols[name][positions] + nv = getattr(col.spec, "null_value", None) + if isinstance(nv, float) and np.isnan(nv): + keep = ~np.isnan(raw) + else: + keep = raw != nv + positions = positions[keep] + return positions + + if plan.exact_positions is not None: + return _exclude_null_positions(plan.exact_positions) + + if plan.partial_exact_positions is not None: + # Cross-column refinement: the FULL index on one column gave us + # exact positions, but the expression has additional 
predicates on + # other columns. Refinement reads every operand column at those + # candidate positions using sparse/fancy indexing. For compressed + # columns this can touch many chunks and be slower than the regular + # sequential miniexpr scan, which is very fast for simple predicates. + # Use a cost model to compare refinement vs full scan. + candidates = np.asarray(plan.partial_exact_positions, dtype=np.int64) + n_candidates = len(candidates) + n_operands = len(expr_result.operands) + target_len = len(root._valid_rows) + + estimated_refine_ms = ( + (n_candidates / 1000.0) * self._GATHER_COST_MS_PER_1K_ITEMS_PER_OP * n_operands + ) + estimated_scan_ms = (target_len / 1_000_000.0) * self._SCAN_COST_MS_PER_1M_ROWS + if estimated_refine_ms > estimated_scan_ms: + return None + + # Read the primary column once and reuse for both null filtering + # and refinement, avoiding a second sparse gather later. + primary_op_name = next( + (vn for vn, va in expr_result.operands.items() if va is primary_col_arr), None + ) + prefetched = None + if nullable_indexed and primary_op_name is not None: + raw = primary_col_arr[candidates] + raw = np.asarray(raw) if hasattr(raw, "__array__") else raw + pos = candidates + for name in nullable_indexed: + if name == primary_col_name: + nv = getattr(root._schema.columns_by_name[name].spec, "null_value", None) + if isinstance(nv, float) and np.isnan(nv): + keep = ~np.isnan(raw) + else: + keep = raw != nv + pos = pos[keep] + raw = raw[keep] # already filtered for refinement reuse + else: + col = root._schema.columns_by_name[name] + vals = root._cols[name][pos] + nv = getattr(col.spec, "null_value", None) + if isinstance(nv, float) and np.isnan(nv): + keep = ~np.isnan(vals) + else: + keep = vals != nv + pos = pos[keep] + candidates = pos + prefetched = {primary_op_name: raw} + else: + candidates = _exclude_null_positions(candidates) + + restricted = self._evaluate_expression_at(expr_result, candidates, prefetched=prefetched) + if restricted is 
not None and restricted.dtype == np.bool_: + refined = candidates[np.asarray(restricted, dtype=bool)] + return _exclude_null_positions(refined) + # Fall through to full scan if refinement fails + + if plan.bucket_masks is not None: + # When bucket pruning covers all units (100 % of chunks are + # candidates), the per‑chunk evaluation overhead outweighs the + # benefit over a plain scan. Fall back to the scan path. + if plan.total_units > 0 and plan.selected_units >= plan.total_units: + return None + _, positions = evaluate_bucket_query( + expression, merged_operands, {}, where_dict, plan, return_positions=True + ) + return _exclude_null_positions(positions) + + if plan.candidate_units is not None and plan.segment_len is not None: + # When segment summaries prune fewer than half the candidate + # units, the per‑segment evaluation overhead outweighs a plain + # scan. Fall back to the scan path. + if plan.total_units > 0 and plan.selected_units / plan.total_units > 0.5: + return None + _, positions = evaluate_segment_query( + expression, merged_operands, {}, where_dict, plan, return_positions=True + ) + return _exclude_null_positions(positions) + + return None diff --git a/src/blosc2/ctable_storage.py b/src/blosc2/ctable_storage.py index fa17a4e68..f6313b77a 100644 --- a/src/blosc2/ctable_storage.py +++ b/src/blosc2/ctable_storage.py @@ -158,6 +158,13 @@ def save_index_catalog(self, catalog: dict) -> None: """Persist *catalog* (column_name → descriptor dict).""" raise NotImplementedError + def index_catalog_revision(self) -> int: + """Return a process-local revision for cache invalidation.""" + return int(getattr(self, "_index_catalog_revision", 0)) + + def _bump_index_catalog_revision(self) -> None: + self._index_catalog_revision = self.index_catalog_revision() + 1 + def get_epoch_counters(self) -> tuple[int, int]: """Return ``(value_epoch, visibility_epoch)``.""" raise NotImplementedError @@ -268,6 +275,7 @@ def load_index_catalog(self) -> dict: def 
save_index_catalog(self, catalog: dict) -> None: self._index_catalog = copy.deepcopy(catalog) + self._bump_index_catalog_revision() def get_epoch_counters(self) -> tuple[int, int]: return self._value_epoch, self._visibility_epoch @@ -291,6 +299,7 @@ def index_anchor_path(self, col_name: str) -> str | None: _META_KEY = "/_meta" _VALID_ROWS_KEY = "/_valid_rows" _COLS_DIR = "_cols" +_VLMETA_KEY = "/_vlmeta" def split_field_path(path: str) -> tuple[str, ...]: @@ -369,6 +378,13 @@ def __init__(self, urlpath: str, mode: str, store: blosc2.TreeStore | None = Non self._root = urlpath self._mode = mode self._meta: blosc2.SChunk | None = None + self._vlmeta: blosc2.SChunk | None = None + # CTable internals must always use external-file storage (never the + # embed store) so that small SChunk overwrites (e.g. _meta with + # nbytes=0) are reliably persisted. Normalise a pre-existing store + # that was opened by generic dispatch without this setting. + if store is not None and store.threshold != 0: + store.threshold = 0 self._store: blosc2.TreeStore | None = store # ------------------------------------------------------------------ @@ -383,6 +399,10 @@ def _meta_path(self) -> str: def _valid_rows_path(self) -> str: return self._key_to_path(_VALID_ROWS_KEY) + @property + def _vlmeta_path(self) -> str: + return self._key_to_path(_VLMETA_KEY) + def _col_path(self, name: str) -> str: return self._key_to_path(self._col_key(name)) @@ -402,7 +422,7 @@ def _col_key(self, name: str) -> str: def _key_to_path(self, key: str) -> str: rel_key = key.lstrip("/") - suffix = ".b2f" if key == _META_KEY else ".b2nd" + suffix = ".b2f" if key in (_META_KEY, _VLMETA_KEY) else ".b2nd" if self._root.endswith(".b2d"): return os.path.join(self._root, rel_key + suffix) return os.path.join(self._root, rel_key + suffix) @@ -557,6 +577,40 @@ def save_schema(self, schema_dict: dict[str, Any]) -> None: raise ValueError("CTable manifest '/_meta' must materialize as an SChunk.") self._meta = opened + def 
save_vlmeta(self, schunk: blosc2.SChunk) -> None: + """Persist the user vlmeta SChunk to the storage.""" + if self._mode == "r": + return + self._vlmeta = schunk + if self._store is not None: + self._store[_VLMETA_KEY] = schunk + + def _open_vlmeta(self) -> blosc2.SChunk | None: + """Open (or return cached) the ``/_vlmeta`` SChunk. + + Returns ``None`` if the file does not exist (read-only open of a + table that never had user vlmeta written). + """ + uv = getattr(self, "_vlmeta", None) + if uv is not None: + return uv + # Try TreeStore first + try: + opened = self._open_store()[_VLMETA_KEY] + if isinstance(opened, blosc2.SChunk): + self._vlmeta = opened + return opened + except (KeyError, FileNotFoundError): + pass + # Fallback: try opening the filesystem path directly + uv_path = self._vlmeta_path + if os.path.exists(uv_path): + opened = blosc2.open(uv_path, mode="r") + if isinstance(opened, blosc2.SChunk): + self._vlmeta = opened + return opened + return None + def _open_meta(self) -> blosc2.SChunk: """Open (or return cached) the ``/_meta`` SChunk.""" if self._meta is None: @@ -712,6 +766,7 @@ def save_index_catalog(self, catalog: dict) -> None: working_dir = self._open_store().working_dir relativized = {col: self._relativize_descriptor(desc, working_dir) for col, desc in catalog.items()} meta.vlmeta["index_catalog"] = relativized + self._bump_index_catalog_revision() def get_epoch_counters(self) -> tuple[int, int]: meta = self._open_meta() @@ -773,6 +828,7 @@ def __init__( self._mode = mode self._owns_store = owns_store self._meta: blosc2.SChunk | None = None + self._vlmeta: blosc2.SChunk | None = None # ------------------------------------------------------------------ # Key / path helpers @@ -1059,6 +1115,31 @@ def check_kind(self) -> None: if kind != "ctable": raise ValueError(f"Object at {self._root_key!r} is not a CTable (kind={kind!r})") + def save_vlmeta(self, schunk: blosc2.SChunk) -> None: + """Persist the user vlmeta SChunk to the outer TreeStore.""" 
+ if self._mode == "r": + return + self._vlmeta = schunk + self._write_leaf("/_vlmeta", schunk, ".b2f") + + def _open_vlmeta(self) -> blosc2.SChunk | None: + """Open (or return cached) the ``/_vlmeta`` SChunk. + + Returns ``None`` if the leaf does not exist (read-only open of a + table that never had user vlmeta written). + """ + uv = getattr(self, "_vlmeta", None) + if uv is not None: + return uv + try: + opened = self._open_leaf("/_vlmeta") + except (KeyError, FileNotFoundError): + return None + if not isinstance(opened, blosc2.SChunk): + return None + self._vlmeta = opened + return opened + def column_names_from_schema(self) -> list[str]: return [c["name"] for c in self.load_schema()["columns"]] @@ -1145,6 +1226,7 @@ def save_index_catalog(self, catalog: dict) -> None: col: FileTableStorage._relativize_descriptor(desc, working_dir) for col, desc in catalog.items() } meta.vlmeta["index_catalog"] = relativized + self._bump_index_catalog_revision() def get_epoch_counters(self) -> tuple[int, int]: meta = self._open_meta() diff --git a/src/blosc2/dict_store.py b/src/blosc2/dict_store.py index 7a7b38ab3..03f977a14 100644 --- a/src/blosc2/dict_store.py +++ b/src/blosc2/dict_store.py @@ -71,16 +71,15 @@ class DictStore: If None, the default Blosc2 storage properties are used. threshold : int or None, optional Threshold (in bytes of uncompressed data) under which values are kept - in the embedded store. If None, in-memory arrays are stored in the - embedded store and on-disk arrays are stored as separate files. - C2Array objects will always be stored in the embedded store, - regardless of their size. + in the embedded store. Default is 0, meaning all values are persisted + as external files by default. C2Array objects are always stored in + the embedded store regardless of this setting. 
Examples -------- >>> dstore = DictStore(localpath="my_dstore.b2z", mode="w") - >>> dstore["/node1"] = np.array([1, 2, 3]) # goes to embed store - >>> dstore["/node2"] = blosc2.ones(2) # goes to embed store + >>> dstore["/node1"] = np.array([1, 2, 3]) + >>> dstore["/node2"] = blosc2.ones(2) >>> arr_external = blosc2.arange(3, urlpath="ext_node3.b2nd", mode="w") >>> dstore["/dir1/node3"] = arr_external # external file in dir1 (.b2nd) >>> schunk = blosc2.SChunk(chunksize=32) @@ -110,7 +109,7 @@ def __init__( cparams: blosc2.CParams | None = None, dparams: blosc2.DParams | None = None, storage: blosc2.Storage | None = None, - threshold: int | None = 2**13, + threshold: int | None = 0, *, mmap_mode: str | None = None, _storage_meta: dict | None = None, @@ -465,6 +464,13 @@ def __setitem__( rel_path = rel_path.replace(os.sep, "/") self.map_tree[key] = rel_path else: + # Remove any old external file so it doesn't shadow the embed-stored + # value on read (map_tree is checked first in __getitem__). 
+ if key in self.map_tree: + old_filepath = self.map_tree.pop(key) + old_full_path = os.path.join(self.working_dir, old_filepath) + if os.path.exists(old_full_path): + os.remove(old_full_path) if external_file: # Embed a copy by using cframe value = blosc2.from_cframe(value.to_cframe()) diff --git a/src/blosc2/embed_store.py b/src/blosc2/embed_store.py index 20d32d142..a06de896a 100644 --- a/src/blosc2/embed_store.py +++ b/src/blosc2/embed_store.py @@ -98,11 +98,19 @@ def __init__( self.mmap_mode = mmap_mode if _from_schunk is not None: + self.urlpath = _from_schunk.urlpath self.cparams = _from_schunk.cparams self.dparams = _from_schunk.dparams - self.mode = mode + self.mode = _from_schunk.mode + self.mmap_mode = getattr(_from_schunk, "mmap_mode", None) self._store = _from_schunk - self.storage = blosc2.Storage() + self.storage = blosc2.Storage( + contiguous=_from_schunk.contiguous, + urlpath=_from_schunk.urlpath, + mode=self.mode, + mmap_mode=self.mmap_mode, + initial_mapping_size=getattr(_from_schunk, "initial_mapping_size", None), + ) self.storage.meta = _from_schunk.meta self._load_metadata() return diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 5c53d694b..b86c98d15 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -63,13 +63,28 @@ QUERY_CACHE_MAX_MEM_NBYTES = 131_072 # 128 KB for the in-process hot cache QUERY_CACHE_MAX_PERSISTENT_NBYTES = 4 * 1024 * 1024 # 4 MB of logical int64 positions in the payload store -# In-process hot cache: (array-scope, digest) -> decoded np.ndarray of coordinates. -_HOT_CACHE: dict[tuple[tuple[str, str | int], str], np.ndarray] = {} + +@dataclass(frozen=True, slots=True) +class _CompressedHotCoords: + dtype: str + nrows: int + compressed: bool + data: bytes + + @property + def nbytes(self) -> int: + return len(self.data) + + +# In-process hot cache: (array-scope, digest) -> compressed coordinate payload. 
+_HOT_CACHE: dict[tuple[tuple[str, str | int], str], _CompressedHotCoords] = {} # Insertion-order list for LRU eviction. _HOT_CACHE_ORDER: list[tuple[tuple[str, str | int], str]] = [] # Total bytes of arrays currently in the hot cache. _HOT_CACHE_BYTES: int = 0 -# Persistent ObjectArray handles: resolved urlpath -> open ObjectArray object. +# Legacy query-cache sidecar handles: resolved urlpath -> open ObjectArray object. +# Query caches are hot-cache-only now, but we keep this state so invalidation can +# still drop stale artifacts produced by older versions. _QUERY_CACHE_STORE_HANDLES: dict[str, object] = {} # Cached mmap handles for data arrays used in full-query gather: urlpath -> NDArray. _GATHER_MMAP_HANDLES: dict[str, object] = {} @@ -435,45 +450,18 @@ def _normalize_query_cache_catalog(catalog: dict) -> dict: def _load_query_cache_catalog(array: blosc2.NDArray) -> dict | None: - """Read the query-cache catalog from *array* vlmeta, or return None.""" - if not _is_persistent_array(array): - return None - try: - cat = array.schunk.vlmeta[QUERY_CACHE_VLMETA_KEY] - except KeyError: - return None - if not isinstance(cat, dict) or cat.get("version") != QUERY_CACHE_FORMAT_VERSION: - return None - return _normalize_query_cache_catalog(cat) + """Return ``None`` because query caches are intentionally not persisted.""" + return None def _save_query_cache_catalog(array: blosc2.NDArray, catalog: dict) -> None: - """Write *catalog* back to *array* vlmeta.""" - array.schunk.vlmeta[QUERY_CACHE_VLMETA_KEY] = catalog + """No-op: query caches are intentionally not persisted.""" + return def _open_query_cache_store(array: blosc2.NDArray, *, create: bool = False): - """Return an open (writable) ObjectArray for the persistent payload store. - - Returns ``None`` if the array is not persistent. When *create* is True the - store is created if it does not yet exist. 
- """ - _purge_stale_persistent_caches() - if not _is_persistent_array(array): - return None - path = _query_cache_payload_path(array) - cached = _QUERY_CACHE_STORE_HANDLES.get(path) - if cached is not None: - return cached - if Path(path).exists(): - vla = blosc2.ObjectArray(storage=blosc2.Storage(urlpath=path, mode="a")) - _QUERY_CACHE_STORE_HANDLES[path] = vla - return vla - if not create: - return None - vla = blosc2.ObjectArray(storage=blosc2.Storage(urlpath=path, mode="w")) - _QUERY_CACHE_STORE_HANDLES[path] = vla - return vla + """Return ``None`` because query caches are intentionally not persisted.""" + return def _close_query_cache_store(path: str) -> None: @@ -543,24 +531,49 @@ def _hot_cache_key( return (_HOT_CACHE_GLOBAL_SCOPE if scope is None else scope, digest) +def _compress_hot_coords(coords: np.ndarray) -> _CompressedHotCoords: + payload = _encode_coords_payload(np.asarray(coords)) + raw = payload["data"] + compressed = False + data = raw + if len(raw) != 0: + dtype = np.dtype(payload["dtype"]) + candidate = blosc2.compress2(raw, typesize=dtype.itemsize, codec=blosc2.Codec.LZ4, clevel=5) + if len(candidate) < len(raw): + data = candidate + compressed = True + return _CompressedHotCoords( + dtype=payload["dtype"], nrows=int(payload["nrows"]), compressed=compressed, data=data + ) + + +def _decompress_hot_coords(entry: _CompressedHotCoords) -> np.ndarray: + dtype = np.dtype(entry.dtype) + if entry.nrows == 0: + return np.empty((0,), dtype=dtype) + raw = blosc2.decompress2(entry.data) if entry.compressed else entry.data + return np.frombuffer(raw, dtype=dtype, count=entry.nrows).copy() + + def _hot_cache_get(digest: str, scope: tuple[str, str | int] | None = None) -> np.ndarray | None: """Return the cached coordinate array for *digest*, or ``None``.""" key = _hot_cache_key(digest, scope) - arr = _HOT_CACHE.get(key) - if arr is None: + entry = _HOT_CACHE.get(key) + if entry is None: return None # Move to most-recently-used position. 
with contextlib.suppress(ValueError): _HOT_CACHE_ORDER.remove(key) _HOT_CACHE_ORDER.append(key) - return arr + return _decompress_hot_coords(entry) def _hot_cache_put(digest: str, coords: np.ndarray, scope: tuple[str, str | int] | None = None) -> None: """Insert *coords* into the hot cache, evicting LRU entries if needed.""" global _HOT_CACHE_BYTES key = _hot_cache_key(digest, scope) - entry_bytes = coords.nbytes + entry = _compress_hot_coords(coords) + entry_bytes = entry.nbytes if entry_bytes > QUERY_CACHE_MAX_MEM_NBYTES: # Single entry too large; skip. return @@ -575,7 +588,7 @@ def _hot_cache_put(digest: str, coords: np.ndarray, scope: tuple[str, str | int] evicted = _HOT_CACHE.pop(oldest, None) if evicted is not None: _HOT_CACHE_BYTES -= evicted.nbytes - _HOT_CACHE[key] = coords + _HOT_CACHE[key] = entry _HOT_CACHE_ORDER.append(key) _HOT_CACHE_BYTES += entry_bytes @@ -595,25 +608,8 @@ def _hot_cache_clear(scope: tuple[str, str | int] | None = None) -> None: def _persistent_cache_lookup(array: blosc2.NDArray, digest: str) -> np.ndarray | None: - """Return coordinates from the persistent cache for *digest*, or ``None``.""" - catalog = _load_query_cache_catalog(array) - if catalog is None: - return None - entry = catalog.get("entries", {}).get(digest) - if entry is None: - return None - slot = entry["slot"] - store = _open_query_cache_store(array) - if store is None or slot >= len(store): - return None - payload = store[slot] - if not isinstance(payload, dict) or payload.get("version") != QUERY_CACHE_FORMAT_VERSION: - return None - try: - coords = _decode_coords_payload(payload) - except Exception: - return None - return coords + """Return ``None`` because query caches are intentionally not persisted.""" + return None def _query_cache_entry_nbytes(coords: np.ndarray) -> int: @@ -644,51 +640,8 @@ def _persistent_cache_insert( coords: np.ndarray, query_descriptor: dict, ) -> bool: - """Append *coords* to the persistent cache and update the catalog. 
- - Returns ``True`` on success, ``False`` if the entry is too large or the - persistent budget is exceeded. - """ - catalog = _load_query_cache_catalog(array) - payload_path = _query_cache_payload_path(array) - if catalog is None: - catalog = _default_query_cache_catalog(payload_path) - elif digest in catalog.get("entries", {}): - return True - - payload_mapping = _encode_coords_payload(coords) - nbytes = _query_cache_entry_nbytes(coords) - - max_entry = catalog.get("max_entry_nbytes", QUERY_CACHE_MAX_ENTRY_NBYTES) - if nbytes > max_entry: - return False - - max_persistent = catalog.get("max_persistent_nbytes", QUERY_CACHE_MAX_PERSISTENT_NBYTES) - current_persistent = int(catalog.get("persistent_nbytes", 0)) - if current_persistent + nbytes > max_persistent: - if nbytes > max_persistent: - return False - catalog = _reset_persistent_query_cache_catalog(array, catalog) - current_persistent = 0 - - store = _open_query_cache_store(array, create=True) - if store is None: - return False - - slot = len(store) - store.append(payload_mapping) - - catalog["entries"][digest] = { - "slot": slot, - "nbytes": nbytes, - "nrows": len(coords), - "dtype": payload_mapping["dtype"], - "query": query_descriptor, - } - catalog["persistent_nbytes"] = current_persistent + nbytes - catalog["next_slot"] = slot + 1 - _save_query_cache_catalog(array, catalog) - return True + """Return ``False`` because query caches are intentionally not persisted.""" + return False # --------------------------------------------------------------------------- @@ -731,17 +684,7 @@ def get_cached_coords( scope = _query_cache_scope(owner) descriptor = _normalize_query_descriptor(expression, tokens, order) digest = _query_cache_digest(descriptor) - # 1. In-process hot cache. - coords = _hot_cache_get(digest, scope=scope) - if coords is not None: - return coords - # 2. Persistent cache (persistent arrays only). 
- if _is_persistent_array(owner): - coords = _persistent_cache_lookup(owner, digest) - if coords is not None: - _hot_cache_put(digest, coords, scope=scope) - return coords - return None + return _hot_cache_get(digest, scope=scope) def store_cached_coords( @@ -751,14 +694,12 @@ def store_cached_coords( order: list[str] | None, coords: np.ndarray, ) -> None: - """Store *coords* in both the hot cache and (if persistent) the payload store.""" + """Store *coords* in the in-process hot cache only.""" owner = _query_cache_owner(array) scope = _query_cache_scope(owner) descriptor = _normalize_query_descriptor(expression, tokens, order) digest = _query_cache_digest(descriptor) _hot_cache_put(digest, coords, scope=scope) - if _is_persistent_array(owner): - _persistent_cache_insert(owner, digest, coords, descriptor) def _supported_index_dtype(dtype: np.dtype) -> bool: diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 7858f7a1a..250b7ec05 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -142,19 +142,23 @@ def ne_evaluate(expression, local_dict=None, **kwargs): def _get_result(expression, chunk_operands, ne_args, where=None, indices=None, _order=None): chunk_indices = None - if expression in {"o0", "(o0)"} and where is None: - # We don't have an actual expression, so avoid a copy except to make contiguous (later) - return chunk_operands["o0"], None - # Apply the where condition (in result) + + # Apply the where condition (in result) — fusion path, evaluate before shortcut if where is not None and len(where) == 2: # x = chunk_operands["_where_x"] # y = chunk_operands["_where_y"] - # result = np.where(result, x, y) # numexpr is a bit faster than np.where, and we can fuse operations in this case new_expr = f"where({expression}, _where_x, _where_y)" return ne_evaluate(new_expr, chunk_operands, **ne_args), None - result = ne_evaluate(expression, chunk_operands, **ne_args) + # If the expression is a simple operand reference (e.g. 
"key", "o0"), + # grab it directly from chunk_operands instead of calling ne_evaluate. + # This avoids ~150 µs of numexpr parsing/setup overhead per chunk. + _expr = expression.strip("()") + if _expr in chunk_operands: + result = chunk_operands[_expr] + else: + result = ne_evaluate(expression, chunk_operands, **ne_args) if where is None: return result, None elif len(where) == 1: @@ -219,6 +223,7 @@ def _get_result(expression, chunk_operands, ne_args, where=None, indices=None, _ # functions that have to be evaluated before chunkwise lazyexpr machinery eager_funcs = linalg_funcs + reducers + ["slice"] + ["." + attr for attr in linalg_attrs] functions = blosc2_funcs +_TRANSIENT_MASK_CPARAMS = blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=5, filters=[blosc2.Filter.SHUFFLE]) _constructor_call_patterns = {name: re.compile(rf"\b{re.escape(name)}\s*\(") for name in constructors} @@ -392,6 +397,18 @@ def vlmeta(self) -> LazyArrayVLMeta: self._vlmeta_proxy = LazyArrayVLMeta(self) return self._vlmeta_proxy + def __enter__(self) -> LazyArray: + """Enter a context manager and return this lazy array.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + """Exit a context manager. + + Lazy arrays do not currently keep explicit closeable resources, so this + is a logical no-op kept for API consistency with :func:`blosc2.open`. + """ + return False + @abstractmethod def argsort(self, order: str | list[str] | None = None) -> blosc2.LazyArray: """ @@ -464,6 +481,29 @@ def compute( failures are raised instead of silently falling back to regular chunked eval for non-DSL expressions. + - ``jit`` (bool | None): enable (``True``) or disable (``False``) JIT compilation + of the expression via miniexpr. When ``None`` (default), JIT is only used + for DSL kernels; plain expressions are evaluated by the bytecode interpreter. + Setting ``jit=True`` forces auto-lift of plain expressions into JIT-compiled + kernels. 
+ + - ``jit_backend`` (str | None): select the JIT compiler backend. Valid + values are ``"tcc"`` (bundled Tiny C Compiler) and ``"cc"`` (system C + compiler, e.g. gcc or clang). ``None`` (default) defers to the miniexpr + default (``"tcc"``). + + - ``BLOSC_ME_JIT`` environment variable: when set to ``"1"``, ``"true"``, + ``"on"``, ``"tcc"``, or ``"cc"``, it forces ``jit=True`` for all + ``compute()`` and ``__getitem__`` calls where ``jit`` is not explicitly + passed. Setting it to ``"tcc"`` or ``"cc"`` also selects that backend + unless ``jit_backend`` is given explicitly. + + - ``BLOSC_ME_JIT_TRACE`` environment variable: when set to ``"1"``, + ``"true"``, or ``"on"``, prints a one-line diagnostic to stdout + showing which compute engine was selected (``miniexpr`` or + ``ne_evaluate``), the JIT mode and backend if applicable, and the + expression being evaluated. + Returns ------- out: :ref:`NDArray` @@ -664,6 +704,21 @@ def compute_broadcast_shape(arrays): return np.broadcast_shapes(*shapes) if shapes else None +def _jit_from_env(jit, jit_backend): + """Apply BLOSC_ME_JIT environment variable to jit/jit_backend defaults.""" + if jit is not None: + return jit, jit_backend + env_jit = os.environ.get("BLOSC_ME_JIT", "") + if not env_jit: + return jit, jit_backend + env_jit_lower = env_jit.lower() + if env_jit_lower in ("1", "true", "on", "tcc", "cc"): + jit = True + if jit_backend is None and env_jit_lower in ("tcc", "cc"): + jit_backend = env_jit_lower + return jit, jit_backend + + # Define the patterns for validation validation_patterns = [ r"[\;]", # Flow control characters @@ -1449,8 +1504,9 @@ def fast_eval( # noqa: C901 if strict_miniexpr is None: # Be strict by default for DSL kernels to avoid silently losing DSL fast-path regressions. strict_miniexpr = bool(is_dsl) - if where is not None: - # miniexpr does not support where(); use the regular path. 
+ if where is not None and len(where) != 2: + # miniexpr does not support cardinality-changing where (len==1); + # where(cond, x, y) with two args is element-wise and IS supported. use_miniexpr = False if is_dsl: dsl_disable_reason = "DSL kernels cannot be run without miniexpr." @@ -1579,15 +1635,28 @@ def fast_eval( # noqa: C901 if is_dsl and not use_miniexpr: _raise_dsl_miniexpr_required(dsl_disable_reason) + if os.environ.get("BLOSC_ME_JIT_TRACE", "").lower() in ("1", "true", "on"): + engine = ( + "miniexpr" if use_miniexpr else ("ne_evaluate" if isinstance(expr_string, str) else "python-udf") + ) + jit_info = f"jit={jit}, backend={jit_backend}" if use_miniexpr else "" + expr_short = str(expr_string)[:120].replace("\n", " ") + print(f"[blosc2] engine={engine} {jit_info} expr={expr_short}", flush=True) + if use_miniexpr: cparams = kwargs.pop("cparams", blosc2.CParams()) # All values will be overwritten, so we can use an uninitialized array res_eval = blosc2.uninit(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs) prefilter_set = False try: + # Fuse where(cond, x, y) into the expression for miniexpr + _pref_expr = expr_string_miniexpr + _pref_ops = operands_miniexpr + if where is not None and len(where) == 2: + _pref_expr = f"where({_pref_expr}, _where_x, _where_y)" res_eval._set_pref_expr( - expr_string_miniexpr, - operands_miniexpr, + _pref_expr, + _pref_ops, fp_accuracy=fp_accuracy, jit=jit, ) @@ -2023,9 +2092,12 @@ def slices_eval( # noqa: C901 if where is None or len(where) == 2: if behaved and result.shape == out.chunks and result.dtype == out.dtype: - # Fast path - # TODO: Check this only works when slice is () - out.schunk.update_data(nchunk, result, copy=False) + # Fast path: only use it when the output chunk index is valid + # (operand and output may have different chunk layouts when slicing) + if nchunk < out.schunk.nchunks: + out.schunk.update_data(nchunk, result, copy=False) + else: + out[cslice_subidx] = result else: try: 
out[cslice_subidx] = result @@ -2280,6 +2352,13 @@ def reduce_slices( # noqa: C901 # Compute the shape and chunks of the output array, including broadcasting shape = compute_broadcast_shape(operands.values()) + # Validate axis against operand dimensions before any computation. + if axis is not None and not np.isscalar(axis): + ndim = len(shape) + for ax in axis: + if ax < -ndim or ax >= ndim: + raise np.exceptions.AxisError(ax, ndim) + _slice = _slice.raw shape_slice = shape mask_slice = np.array([isinstance(i, int) for i in _slice], dtype=np.bool_) @@ -2466,7 +2545,11 @@ def reduce_slices( # noqa: C901 if reduce_op in {ReduceOp.ANY, ReduceOp.ALL}: result = reduce_op.value(aux_reduc, **reduce_args) else: - result = reduce_op.value.reduce(aux_reduc, **reduce_args) + # The accumulator is always 1-D (one slot per output block). + # The original axis may refer to dimensions that no longer + # exist after per-block reduction. Use axis=0 to combine + # all block results. + result = reduce_op.value.reduce(aux_reduc, axis=0) return result # Iterate over the operands and get the chunks @@ -3112,8 +3195,20 @@ def __init__(self, new_op): # noqa: C901 if not (isinstance(value2, blosc2.Operand | np.ndarray) or np.isscalar(value2)) else value2 ) - # Reset values represented as np.int64 etc. to be set as Python natives - value2 = value2.item() if np.isscalar(value2) and hasattr(value2, "item") else value2 + + # Reset values represented as np.int64 etc. to be set as Python natives, + # BUT preserve numpy integer scalars that require explicit typing (unsigned or + # 64-bit) so that dtype-sensitive backends (numexpr) don't downcast them to int32. + def _to_native_if_safe(v): + if not (np.isscalar(v) and hasattr(v, "item")): + return v + dt = np.dtype(type(v)) + # Keep typed when unsigned or itemsize >= 8 to avoid silent int32 truncation. 
+ if np.issubdtype(dt, np.unsignedinteger) or dt.itemsize >= 8: + return v + return v.item() + + value2 = _to_native_if_safe(value2) if isinstance(value1, LazyExpr) or isinstance(value2, LazyExpr): if isinstance(value1, LazyExpr): @@ -3146,14 +3241,22 @@ def __init__(self, new_op): # noqa: C901 self.expression = "o0" self.operands = {"o0": ne_evaluate(f"({value1!r} {op} {value2!r})")} # eager evaluation elif np.isscalar(value2): - self.operands = {"o0": value1} - self.expression = f"(o0 {op} {value2!r})" + if hasattr(value2, "dtype"): # typed numpy scalar — keep as named operand + self.operands = {"o0": value1, "o1": value2} + self.expression = f"(o0 {op} o1)" + else: + self.operands = {"o0": value1} + self.expression = f"(o0 {op} {value2!r})" elif hasattr(value2, "shape") and value2.shape == (): self.operands = {"o0": value1} self.expression = f"(o0 {op} {value2[()]})" elif np.isscalar(value1): - self.operands = {"o0": value2} - self.expression = f"({value1!r} {op} o0)" + if hasattr(value1, "dtype"): # typed numpy scalar — keep as named operand + self.operands = {"o0": value2, "o1": value1} + self.expression = f"(o1 {op} o0)" + else: + self.operands = {"o0": value2} + self.expression = f"({value1!r} {op} o0)" elif hasattr(value1, "shape") and value1.shape == (): self.operands = {"o0": value2} self.expression = f"({value1[()]} {op} o0)" @@ -3796,6 +3899,113 @@ def find_args(expr): return value, expression[idx:idx2] + @staticmethod + def _is_full_slice(lazy_item): + """Return True if *lazy_item* is a no-op full slice (() or slice(None)).""" + if isinstance(lazy_item, slice): + return lazy_item == slice(None) + if isinstance(lazy_item, tuple): + return lazy_item == () or all(isinstance(s, slice) and s == slice(None) for s in lazy_item) + return False + + @staticmethod + def _collect_flat_indices_from_bool_ndarray(bool_ndarray): + """Collect flat indices of True positions from a compressed boolean NDArray. 
+ + Uses :meth:`~blosc2.NDArray.iterchunks_info` to skip chunks that are + special values (e.g. all-False ``ZERO``), avoiding decompression and + scanning for those chunks. + + Parameters + ---------- + bool_ndarray: blosc2.NDArray + A 1D NDArray with boolean dtype. + + Returns + ------- + np.ndarray + Flat indices of True positions (int64). + """ + chunk_len = bool_ndarray.chunks[0] + all_indices = [] + + for info in bool_ndarray.iterchunks_info(): + # Skip special-value chunks that are entirely False + if info.special == blosc2.SpecialValue.ZERO: + continue + if info.special == blosc2.SpecialValue.VALUE: + if not info.repeated_value: # repeated_value is False/0 + continue + # repeated_value is True: all elements in this chunk are True + offset = info.nchunk * chunk_len + all_indices.append(np.arange(offset, offset + chunk_len, dtype=np.int64)) + continue + + # Normal chunk: decompress and scan for True positions + raw = bool_ndarray.schunk.decompress_chunk(info.nchunk) + arr = np.frombuffer(raw, dtype=np.bool_) + # Truncate to the logical chunk size (buffer may include padding) + if len(arr) > chunk_len: + arr = arr[:chunk_len] + idx = np.flatnonzero(arr) + if len(idx) > 0: + offset = info.nchunk * chunk_len + all_indices.append(idx + offset) + + if not all_indices: + return np.array([], dtype=np.int64) + return np.concatenate(all_indices) + + def _where_getitem_fastpath(self, item, kwargs): + """Fast path for where(cond, x) full-slice getitem calls. + + Returns ``None`` when the fast path does not apply. + """ + from . 
import indexing + + simple_operand_expr = self.expression.strip("() ") in self.operands + if not ( + hasattr(self, "_where_args") + and len(self._where_args) == 1 + and not hasattr(self, "_indices") + and not hasattr(self, "_order") + and "_reduce_args" not in kwargs + and isinstance(self._where_args["_where_x"], blosc2.NDArray) + and self._is_full_slice(item) + and not simple_operand_expr + ): + return None + + # Preserve index/caching behavior for indexed queries. + if kwargs.get("_use_index", True) and indexing.will_use_index(self): + return None + + cond_expr = blosc2.LazyExpr._new_expr(self.expression, self.operands, guess=False) + if not blosc2.isdtype(cond_expr.dtype, "bool"): + return None + + target = self._where_args["_where_x"] + if cond_expr.ndim != 1 or target.ndim != 1: + return None + + cache_tokens = [indexing.SELF_TARGET_NAME] + cached_coords = indexing.get_cached_coords(target, self.expression, cache_tokens, None) + if cached_coords is not None: + cached_plan = indexing.IndexPlan( + usable=True, reason="cache-hit", base=target, exact_positions=cached_coords + ) + return indexing.evaluate_full_query(self._where_args, cached_plan) + + # Evaluate the condition using the miniexpr prefilter (fastest first pass) + mask = cond_expr.compute((), cparams=_TRANSIENT_MASK_CPARAMS) + + # Collect flat indices by iterating the compressed bool chunks, + # avoiding a full-mask decompression + count_nonzero + flatnonzero. 
+ flat_indices = self._collect_flat_indices_from_bool_ndarray(mask) + indexing.store_cached_coords(target, self.expression, cache_tokens, None, flat_indices) + plan = indexing.IndexPlan(usable=True, reason="mask-scan", base=target, exact_positions=flat_indices) + return indexing.evaluate_full_query(self._where_args, plan) + def _compute_expr(self, item, kwargs): if any(method in self.expression for method in eager_funcs): # We have reductions in the expression (probably coming from a string lazyexpr) @@ -3857,6 +4067,12 @@ def _compute_expr(self, item, kwargs): return chunked_eval(lazy_expr.expression, lazy_expr.operands, item, **kwargs) + # Optimization: for where(cond, x) (1-arg) with a boolean condition, + # stream matching values chunk-by-chunk without materializing the full mask. + fastpath_result = self._where_getitem_fastpath(item, kwargs) + if fastpath_result is not None: + return fastpath_result + return chunked_eval(self.expression, self.operands, item, **kwargs) # TODO: argsort and sort are repeated in LazyUDF; refactor @@ -3941,6 +4157,7 @@ def compute( if hasattr(self, "_where_args"): kwargs["_where_args"] = self._where_args kwargs.setdefault("fp_accuracy", fp_accuracy) + jit, jit_backend = _jit_from_env(jit, jit_backend) if jit is not None: kwargs["jit"] = jit if jit_backend is not None: diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 87298f787..8e6186f08 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -113,6 +113,30 @@ } +def normalize_1d_sparse_indices(key, size: int) -> np.ndarray | None: + if isinstance(key, list): + indices = np.asarray(key) + elif isinstance(key, np.ndarray): + indices = key + else: + return None + + if indices.ndim != 1 or not np.issubdtype(indices.dtype, np.integer): + return None + + indices = np.ascontiguousarray(indices, dtype=np.int64) + if len(indices) == 0: + return indices + + negative = indices < 0 + if np.any(negative): + indices = indices.copy() + indices[negative] += size + if 
np.any((indices < 0) | (indices >= size)): + raise IndexError("index out of bounds for axis 0") + return indices + + @runtime_checkable class Array(Protocol): """ @@ -3776,7 +3800,11 @@ class NDArray(blosc2_ext.NDArray, Operand): """Compressed, chunked N-dimensional array with NumPy-like indexing.""" def __init__(self, **kwargs): - self._schunk = SChunk(_schunk=kwargs["_schunk"], _is_view=True) # SChunk Python instance + schunk_kwargs = {"_schunk": kwargs["_schunk"], "_is_view": True} + mode = kwargs.pop("mode", None) + if mode is not None: + schunk_kwargs["mode"] = mode + self._schunk = SChunk(**schunk_kwargs) # SChunk Python instance self._keep_last_read = False # Where to store the last read data self._last_read = {} @@ -3786,6 +3814,18 @@ def __init__(self, **kwargs): field_names = tuple(self.dtype.fields) if self.dtype.fields else () self._fields = FieldsAccessor(self, field_names) + def __enter__(self) -> NDArray: + """Enter a context manager and return this array.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + """Exit a context manager. + + For regular :func:`blosc2.open` handles this is a logical no-op kept for + API symmetry with higher-level persistent containers. 
+ """ + return False + @property def cparams(self) -> blosc2.CParams: """The compression parameters used by the array.""" @@ -4312,6 +4352,115 @@ def _get_set_nonunit_steps(self, _slice, out=None, value=None): out = super().set_slice((locstart, locstop), chunk) # load updated partial chunk into array return out + @staticmethod + def _normalize_take_indices(indices, size: int) -> np.ndarray: + if isinstance(indices, NDArray): + indices = indices[()] + indices = np.asarray(indices) + if indices.size == 0: + return np.ascontiguousarray(indices, dtype=np.int64) + if not np.issubdtype(indices.dtype, np.integer): + raise TypeError("take indices must be an integer array") + normalized = np.ascontiguousarray(indices, dtype=np.int64) + negative = normalized < 0 + if np.any(negative): + normalized = normalized.copy() + normalized[negative] += size + if np.any((normalized < 0) | (normalized >= size)): + raise IndexError("take index out of bounds") + return normalized + + @staticmethod + def _normalize_take_axis(axis: int, ndim: int) -> int: + if not isinstance(axis, (int, np.integer)): + raise TypeError("axis must be an integer or None") + axis = int(axis) + if axis < 0: + axis += ndim + if not 0 <= axis < ndim: + raise ValueError(f"axis {axis} is out of bounds for array of dimension {ndim}") + return axis + + def _take_sparse_normalized(self, indices: np.ndarray, out: np.ndarray | None = None) -> np.ndarray: + out = np.empty(indices.shape, dtype=self.dtype) if out is None else out + return super().get_sparse_numpy(out, indices) + + def _take_numpy(self, indices, /, *, axis: int | None = None) -> np.ndarray: + """Return a NumPy buffer for :meth:`take` and internal gather paths.""" + if axis is None: + normalized = self._normalize_take_indices(indices, self.size) + flat = normalized.reshape(-1) + return self._take_sparse_normalized(flat).reshape(normalized.shape) + + axis = self._normalize_take_axis(axis, self.ndim) + normalized = self._normalize_take_indices(indices, 
self.shape[axis]) + flat = normalized.reshape(-1) + result_shape = self.shape[:axis] + normalized.shape + self.shape[axis + 1 :] + if flat.size == 0: + return np.empty(result_shape, dtype=self.dtype) + if self.ndim == 1: + return self._take_sparse_normalized(flat).reshape(result_shape) + + # For ndim > 1 axis-based take, use orthogonal selection which + # decompresses each chunk once and copies contiguous row/slab + # slices. Per-element sparse gather is the wrong tool here + # because it would iterate over every individual element + # coordinate (n_indices × product of other dims). + selection = [np.arange(dim, dtype=np.int64) for dim in self.shape] + selection[axis] = flat + orthogonal_shape = self.shape[:axis] + (flat.size,) + self.shape[axis + 1 :] + out = np.empty(orthogonal_shape, dtype=self.dtype) + self.get_oindex_numpy(out, selection) + return out.reshape(result_shape) + + def take(self, indices, /, *, axis: int | None = None) -> NDArray: + """Return elements selected by integer indices. + + This follows the Array API ``take`` shape rules: when ``axis`` is + ``None`` the array is conceptually flattened and the result has the + same shape as ``indices``; otherwise the indexed axis is replaced by + ``indices.shape``. + """ + return blosc2.asarray(self._take_numpy(indices, axis=axis)) + + def _try_sparse_fancy_index(self, key) -> np.ndarray | None: + """Try to handle integer-array fancy indexing via the sparse gather path. + + If *key* is a single integer array (list or ndarray, any dimensionality) + route it through ``_take_numpy`` which uses ``b2nd_get_sparse_cbuffer``. + Return the result ndarray on success, or ``None`` to signal that the + caller should fall back to the regular fancy-indexing machinery. 
+ """ + if isinstance(key, (slice, tuple)): + return None + if not isinstance(key, (list, np.ndarray)): + return None + key_arr = np.asarray(key) + if not (np.issubdtype(key_arr.dtype, np.integer) and key_arr.ndim >= 1): + return None + # 1-D: axis=None (flat); ndim>1: axis=0 (row selection) + return self._take_numpy(key_arr, axis=None if self.ndim == 1 else 0) + + def _getitem_bool_mask(self, key): + """Handle boolean array key with optional sparse-gather fast path. + + Returns the result array or ``None`` if *key* is not a matching + boolean mask (caller should continue with regular indexing). + """ + if not (hasattr(key, "dtype") and np.issubdtype(key.dtype, np.bool_) and key.shape == self.shape): + return None + # For sparse boolean masks, converting to flat indices and using the + # sparse-gather path is faster than decompressing every data chunk. + try: + idx = _bool_mask_to_flat_indices(key, self.schunk.nchunks) + except _BoolMaskDense: + pass + else: + return blosc2.take(self, idx, axis=None)[:] + # Fall through to the LazyExpr path for dense masks + expr = blosc2.LazyExpr._new_expr("key", {"key": key}, guess=False).where(self) + return expr[:] + def __getitem__( self, key: None @@ -4381,6 +4530,17 @@ def __getitem__( key = key[()] if isinstance(key, NDArray) else key # key not iterable key = tuple(k[()] if isinstance(k, NDArray) else k for k in key) if isinstance(key, tuple) else key + # Check boolean array key early to avoid expensive process_key / nonzero + result = self._getitem_bool_mask(key) + if result is not None: + return result + + # Integer array fancy indexing -> route through the efficient sparse + # gather (b2nd_get_sparse_cbuffer) for all dimensionalities. 
+ result = self._try_sparse_fancy_index(key) + if result is not None: + return result + # decompress NDArrays key_, mask = process_key(key, self.shape) # internally handles key an integer key = key[()] if hasattr(key, "shape") and key.shape == () else key # convert to scalar @@ -4398,16 +4558,6 @@ def __getitem__( return np.expand_dims(self._get_set_findex_default(_slice, out=out), 0) else: # do nothing return np.empty((0,) + self.shape, dtype=self.dtype) - elif ( - hasattr(key, "dtype") and np.issubdtype(key.dtype, np.bool_) and key.shape == self.shape - ): # check ORIGINAL key - # This can be interpreted as a boolean expression but only for key shape same as self shape - expr = blosc2.LazyExpr._new_expr("key", {"key": key}, guess=False).where(self) - # Decorate with where and force a getitem operation to return actual values. - # This behavior is consistent with NumPy, although different from e.g. ['expr'] - # which returns a lazy expression. - # This is faster than the fancy indexing path - return expr[:] return self.get_fselection_numpy(key) # fancy index default, can be quite slow start, stop, step, none_mask = get_ndarray_start_stop(self.ndim, key_, self.shape) @@ -7152,43 +7302,46 @@ def full_like(x: blosc2.Array, fill_value: bool | int | float | complex, dtype=N return blosc2.full(shape=x.shape, fill_value=fill_value, dtype=dtype, **kwargs) -def take(x: blosc2.Array, indices: blosc2.Array, axis: int | None = None) -> NDArray: - """ - Returns elements of an array along an axis. +def take(x: blosc2.Array, indices: blosc2.Array, axis: int | None = None): + """Return elements selected by integer indices. + + For array inputs, this follows the Array API ``take`` shape rules: when + ``axis`` is ``None``, *x* is conceptually flattened and the output shape is + ``indices.shape``; otherwise the indexed axis is replaced by + ``indices.shape``. For :class:`CTable` and :class:`Column` inputs, indices + select logical rows/values and ``axis`` is not supported. 
Parameters ---------- - x: blosc2.Array - Input array. Should have one or more dimensions (axes). + x: blosc2.Array, CTable, Column, or array-like + Input object. ``NDArray`` inputs return an ``NDArray``; + ``CTable`` inputs return a ``CTable``; ``Column`` inputs return a + ``Column``. Other array-like inputs are converted to a Blosc2 + ``NDArray`` result. indices: array-like - Array indices. The array must be one-dimensional and have an integer data type. + Integer indices. Negative indices are normalized relative to the + selected axis (or to the flattened array when ``axis`` is ``None``). + For array inputs, indices may have any shape. axis: int | None - Axis over which to select values. - If x is a one-dimensional array, providing an axis is optional; however, if x - has more than one dimension, providing an axis is required. Default: None. + Axis over which to select values for array inputs. If ``None``, the + input array is flattened before selection. Must be ``None`` for + ``CTable`` and ``Column`` inputs. Returns ------- - out: NDArray - Selected indices of x. + out: NDArray | CTable | Column + Selected values, preserving the container type for ``NDArray``, + ``CTable`` and ``Column`` inputs. 
""" - if axis is None: - axis = 0 - if x.ndim != 1: - raise ValueError("Must specify axis parameter if x is not 1D.") - if axis < 0: - axis += x.ndim - if not isinstance(axis, int | np.integer): - raise ValueError("Axis must be integer.") - if isinstance(indices, list): - indices = np.asarray(indices) - if indices.ndim != 1: - raise ValueError("Indices must be 1D array.") - key = tuple(indices if i == axis else slice(None, None, 1) for i in range(x.ndim)) - # TODO: Implement fancy indexing in .slice so that this is more efficient - return blosc2.asarray(x[key]) + if isinstance(x, NDArray): + return x.take(indices, axis=axis) + if isinstance(x, (blosc2.CTable, blosc2.Column)): + if axis is not None: + raise ValueError("axis is not supported for CTable or Column") + return x.take(indices) + return blosc2.asarray(np.take(np.asarray(x), np.asarray(indices), axis=axis)) def take_along_axis(x: blosc2.Array, indices: blosc2.Array, axis: int = -1) -> NDArray: @@ -7301,3 +7454,48 @@ def mygen(i): for a in myarrs: out += (broadcast_to(a, shape),) return out + + +# --------------------------------------------------------------------------- +# Sparse boolean-mask helper (used by NDArray.__getitem__) +# --------------------------------------------------------------------------- + + +class _BoolMaskDense(Exception): + """Raised when a boolean mask is too dense for the sparse-gather fast path.""" + + +def _bool_mask_to_flat_indices(bool_arr, nchunks_data): + """Convert a sparse boolean mask to flat indices, or raise _BoolMaskDense. + + For numpy masks, uses ``np.count_nonzero`` / ``np.flatnonzero``. + For blosc2 NDArray masks, iterates chunks incrementally and bails out + early when the mask is too dense. + """ + # Threshold: if True values exceed the number of data chunks times a + # generous factor, the LazyExpr full-scan path is likely faster. 
+ threshold = builtins.max(nchunks_data * 500, 50_000) + + if isinstance(bool_arr, np.ndarray): + n_true = np.count_nonzero(bool_arr) + if n_true >= threshold: + raise _BoolMaskDense + return np.flatnonzero(bool_arr) + + # blosc2 NDArray: iterate chunks incrementally + total_true = 0 + flat_parts = [] + offset = 0 + for nchunk in range(bool_arr.schunk.nchunks): + raw = bool_arr.schunk.decompress_chunk(nchunk) + chunk = np.frombuffer(raw, dtype=np.bool_) + n_true = np.count_nonzero(chunk) + total_true += n_true + if total_true >= threshold: + raise _BoolMaskDense + if n_true > 0: + flat_parts.append(np.flatnonzero(chunk) + offset) + offset += len(chunk) + if not flat_parts: + return np.array([], dtype=np.int64) + return np.concatenate(flat_parts) diff --git a/src/blosc2/proxy.py b/src/blosc2/proxy.py index e12605df1..a80a6224c 100644 --- a/src/blosc2/proxy.py +++ b/src/blosc2/proxy.py @@ -266,6 +266,18 @@ def __init__( for key in vlmeta: self._schunk_cache.vlmeta[key] = vlmeta[key] + def __enter__(self) -> "Proxy": + """Enter a context manager and return this proxy.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + """Exit a context manager. + + ``Proxy`` does not currently expose an explicit close operation; the + underlying cache object manages its own lifetime. + """ + return False + def fetch(self, item: slice | list[slice] | None = ()) -> blosc2.NDArray | blosc2.schunk.SChunk: """ Get the container used as cache with the requested data updated. @@ -433,8 +445,14 @@ def __getitem__(self, item: slice | list[slice]) -> np.ndarray: [17 18 19] [22 23 24]] """ - # Populate the cache - self.fetch(item) + # Populate the cache when possible. Read-only reopens must remain + # observational, so fall back to the source without mutating the cache. 
+ try: + self.fetch(item) + except ValueError as exc: + if getattr(self._schunk_cache, "mode", None) != "r" or "reading mode" not in str(exc): + raise + return self.src[item] return self._cache[item] @property diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index f6aeb3200..bf1a0a2ed 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -9,6 +9,7 @@ import os import pathlib +import weakref import zipfile from collections import namedtuple from collections.abc import Iterator, Mapping, MutableMapping @@ -43,13 +44,49 @@ class vlmeta(MutableMapping, blosc2_ext.vlmeta): references only; purely in-memory operands are intentionally rejected. """ - def __init__(self, schunk, urlpath, mode, mmap_mode, initial_mapping_size): - self.urlpath = urlpath - self.mode = mode - self.mmap_mode = mmap_mode - self.initial_mapping_size = initial_mapping_size + def __init__(self, owner, schunk): + self._owner_ref = weakref.ref(owner) super().__init__(schunk) + @property + def _owner(self): + owner = self._owner_ref() + if owner is None: + raise ReferenceError("The parent SChunk for this vlmeta object no longer exists") + return owner + + @property + def urlpath(self): + return self._owner.urlpath + + @urlpath.setter + def urlpath(self, value): + self._owner.urlpath = value + + @property + def mode(self): + return self._owner.mode + + @mode.setter + def mode(self, value): + self._owner.mode = value + + @property + def mmap_mode(self): + return self._owner.mmap_mode + + @mmap_mode.setter + def mmap_mode(self, value): + self._owner.mmap_mode = value + + @property + def initial_mapping_size(self): + return self._owner.initial_mapping_size + + @initial_mapping_size.setter + def initial_mapping_size(self, value): + self._owner.initial_mapping_size = value + def __setitem__(self, name, content): blosc2_ext.check_access_mode(self.urlpath, self.mode) # If name is a slice, assume that content is a dictionary and copy all the items @@ -357,12 +394,22 @@ def __init__( # noqa: 
C901 chunksize = 2**28 super().__init__(_schunk=sc, chunksize=chunksize, data=data, **kwargs) - self._vlmeta = vlmeta( - super().c_schunk, self.urlpath, self.mode, self.mmap_mode, self.initial_mapping_size - ) + self._vlmeta = vlmeta(self, super().c_schunk) self._cparams = super().get_cparams() self._dparams = super().get_dparams() + def __enter__(self) -> SChunk: + """Enter a context manager and return this super-chunk.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + """Exit a context manager. + + For regular :func:`blosc2.open` handles this is a logical no-op kept for + API symmetry with higher-level persistent containers. + """ + return False + @property def cparams(self) -> blosc2.CParams: """ @@ -1667,12 +1714,9 @@ def process_opened_object(res): meta = getattr(res, "schunk", res).meta if "proxy-source" in meta: proxy_cache = res - cache_schunk = getattr(res, "schunk", res) - if getattr(cache_schunk, "urlpath", None) is not None and getattr(cache_schunk, "mode", None) == "r": - proxy_cache = blosc2_ext.open(cache_schunk.urlpath, "a", 0) proxy_src = meta["proxy-source"] if proxy_src["local_abspath"] is not None: - src = blosc2.open(proxy_src["local_abspath"], mode="a") + src = blosc2.open(proxy_src["local_abspath"], mode="r") return blosc2.Proxy(src, _cache=proxy_cache) elif proxy_src["urlpath"] is not None: src = blosc2.C2Array(proxy_src["urlpath"][0], proxy_src["urlpath"][1], proxy_src["urlpath"][2]) @@ -1788,6 +1832,14 @@ def open( 'a' means read/write (create if it doesn't exist); 'w' means create (overwrite if it exists). Defaults to 'r' ( read-only). + + Open modes also define the allowed persistence side effects: + + - ``'r'`` never writes to the persistent object or any sidecar/cache file. + Query acceleration and other execution caches remain process-local only. 
+ - ``'a'`` and ``'w'`` may persist explicit user-visible changes such as data, + metadata, and index maintenance, but execution caches and query memoization + still remain process-local only. offset: int, optional An offset in the file where super-chunk or array data is located (e.g. in a file containing several such objects). @@ -1816,12 +1868,21 @@ def open( Notes ----- - * This is just a 'logical' open, so there is no `close()` counterpart because - currently, there is no need for it. + * Returned objects can be used as context managers for API consistency. + For objects with an explicit ``close()`` implementation, exiting the + context will close/flush them; for logical handles such as regular + :class:`SChunk`, :class:`NDArray`, :class:`C2Array`, :class:`Proxy`, and + :class:`LazyArray`, exiting the context is currently a no-op. * If :paramref:`urlpath` is a :ref:`URLPath` instance, :paramref:`mode` must be 'r', :paramref:`offset` must be 0, and kwargs cannot be passed. + * Persistent data handling follows a strict no-hidden-writes rule: + + - ``mode='r'`` is observational only and never mutates the opened object. + - ``mode='a'`` / ``mode='w'`` only persist explicit mutations requested by the + caller; runtime caches are not serialized back to disk. + * If the original object saved in :paramref:`urlpath` is a :ref:`Proxy`, this function will only return a :ref:`Proxy` if its source is a local :ref:`SChunk`, :ref:`NDArray` or a remote :ref:`C2Array`. Otherwise, diff --git a/src/blosc2/tree_store.py b/src/blosc2/tree_store.py index 53a325889..52ed507c0 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -119,12 +119,9 @@ class TreeStore(DictStore): If None, the default Blosc2 storage properties are used. threshold : int, optional Threshold for the array size (bytes) to be kept in the embed store. - If the *compressed* array size is below this threshold, it will be - stored in the embed store instead of as a separate file. 
If None, - in-memory arrays are stored in the embed store and on-disk arrays - are stored as separate files. - C2Array objects will always be stored in the embed store, - regardless of their size. + Default is 0, meaning values are persisted as external files by + default. C2Array objects are always stored in the embed store + regardless of this setting. Examples -------- diff --git a/tests/ctable/test_ctable_indexing.py b/tests/ctable/test_ctable_indexing.py index d1005b5f6..3a7ffc4b4 100644 --- a/tests/ctable/test_ctable_indexing.py +++ b/tests/ctable/test_ctable_indexing.py @@ -99,6 +99,22 @@ def test_where_with_index_matches_scan_in_memory(): assert ids_idx == ids_scan +def test_indexed_where_view_sort_by_reuses_cached_live_positions(monkeypatch): + t = _make_table(200) + t.create_index("id", kind=blosc2.IndexKind.FULL) + + view = t.where(t["id"] > 100, columns=["id", "value"]) + assert view._cached_live_positions is not None + + def fail_iter_live_positions_chunks(): + raise AssertionError("sort_by() should reuse cached live positions") + + monkeypatch.setattr(view, "_iter_live_positions_chunks", fail_iter_live_positions_chunks) + sorted_view = view.sort_by("id") + + assert sorted_view["id"][:].tolist() == list(range(101, 200)) + + def test_create_expression_index_in_memory(): t = _make_table(50) idx = t.create_index(expression="value * category", kind=blosc2.IndexKind.FULL, name="vc") @@ -308,6 +324,33 @@ def test_catalog_survives_reopen(tmpdir): assert not idxs[0].stale +def test_index_catalog_cached_per_opened_ctable(tmpdir, monkeypatch): + path = str(tmpdir / "table.b2d") + t = _make_table(200, persistent_path=path) + t.create_index("id", kind=blosc2.IndexKind.FULL) + del t + + with blosc2.open(path, mode="r") as t2: + calls = 0 + original = t2._storage.load_index_catalog + + def wrapped_load_index_catalog(): + nonlocal calls + calls += 1 + return original() + + monkeypatch.setattr(t2._storage, "load_index_catalog", wrapped_load_index_catalog) + + first = 
t2.where(t2["id"] > 100, columns=["id", "value"]).sort_by("id") + second = t2.where(t2["id"] > 150, columns=["id", "value"]).sort_by("id") + idxs = t2.indexes + + assert first["id"][:].tolist() == list(range(101, 200)) + assert second["id"][:].tolist() == list(range(151, 200)) + assert len(idxs) == 1 + assert calls == 1 + + @pytest.mark.heavy def test_where_with_index_matches_scan_persistent(tmpdir): path = str(tmpdir / "table.b2d") diff --git a/tests/ctable/test_table_persistency.py b/tests/ctable/test_table_persistency.py index 2f10d493a..ec0e07bba 100644 --- a/tests/ctable/test_table_persistency.py +++ b/tests/ctable/test_table_persistency.py @@ -80,6 +80,107 @@ def test_schema_saved_in_meta_vlmeta(): assert col_names == ["id", "score", "active"] +# --------------------------------------------------------------------------- +# CTable.vlmeta property +# --------------------------------------------------------------------------- + + +def test_ctable_vlmeta_in_memory(): + """CTable.vlmeta works for in-memory tables.""" + t = CTable(Row) + # Initially empty + assert t.vlmeta[:] == {} + # Set and get + t.vlmeta["author"] = "test" + t.vlmeta["version"] = 2 + t.vlmeta["active"] = True + assert t.vlmeta["author"] == "test" + assert t.vlmeta[:]["author"] == "test" + assert t.vlmeta[:]["version"] == 2 + assert t.vlmeta[:]["active"] is True + + +def test_ctable_vlmeta_persistent(tmp_path): + """CTable.vlmeta round-trips through close/reopen.""" + path = str(tmp_path / "vlmeta.b2z") + t = CTable(Row, urlpath=path, mode="w", expected_size=16) + t.append((1, 10.0, True)) + t.vlmeta["description"] = "test table" + t.vlmeta["rows"] = 1 + t.vlmeta["tags"] = ["a", "b", "c"] + t.close() + + t2 = CTable(Row, urlpath=path, mode="a") + assert t2.vlmeta[:]["description"] == "test table" + assert t2.vlmeta[:]["rows"] == 1 + assert t2.vlmeta[:]["tags"] == ["a", "b", "c"] + + +def test_ctable_vlmeta_value_types(tmp_path): + """CTable.vlmeta supports various value types via msgpack.""" + 
path = str(tmp_path / "vlmeta_types.b2z") + t = CTable(Row, urlpath=path, mode="w", expected_size=16) + t.append((1, 10.0, True)) + t.vlmeta["bool_val"] = True + t.vlmeta["int_val"] = 42 + t.vlmeta["float_val"] = 3.14 + t.vlmeta["str_val"] = "hello" + t.vlmeta["list_val"] = [1, 2, 3] + t.vlmeta["dict_val"] = {"a": 1} + t.close() + + t2 = CTable(Row, urlpath=path, mode="a") + assert t2.vlmeta[:]["bool_val"] is True + assert t2.vlmeta[:]["int_val"] == 42 + assert t2.vlmeta[:]["float_val"] == 3.14 + assert t2.vlmeta[:]["str_val"] == "hello" + assert t2.vlmeta[:]["list_val"] == [1, 2, 3] + assert t2.vlmeta[:]["dict_val"] == {"a": 1} + + +def test_ctable_vlmeta_delete(tmp_path): + """CTable.vlmeta supports deletion of keys.""" + path = str(tmp_path / "vlmeta_del.b2z") + t = CTable(Row, urlpath=path, mode="w", expected_size=16) + t.append((1, 10.0, True)) + t.vlmeta["keep"] = "stay" + t.vlmeta["remove"] = "go" + del t.vlmeta["remove"] + assert "remove" not in t.vlmeta[:] + assert t.vlmeta[:]["keep"] == "stay" + t.close() + + t2 = CTable(Row, urlpath=path, mode="a") + assert "remove" not in t2.vlmeta[:] + assert t2.vlmeta[:]["keep"] == "stay" + + +def test_ctable_vlmeta_no_internal_keys(tmp_path): + """Internal schema keys are NOT in user vlmeta (separate storage).""" + path = str(tmp_path / "vlmeta_int.b2z") + t = CTable(Row, urlpath=path, mode="w", expected_size=16) + t.append((1, 10.0, True)) + t.close() + + t2 = CTable(Row, urlpath=path, mode="a") + # User vlmeta is separate from internal schema vlmeta + assert "kind" not in t2.vlmeta[:] + assert "schema" not in t2.vlmeta[:] + assert "version" not in t2.vlmeta[:] + + +def test_ctable_vlmeta_reopen_read_only(tmp_path): + """Vlmeta is readable in read-only mode.""" + path = str(tmp_path / "vlmeta_ro.b2z") + t = CTable(Row, urlpath=path, mode="w", expected_size=16) + t.append((1, 10.0, True)) + t.vlmeta["data"] = "secret" + t.close() + + t2 = CTable(Row, urlpath=path, mode="r") + assert t2.vlmeta[:]["data"] == "secret" + 
+ # --------------------------------------------------------------------------- # Round-trip: data survives reopen # --------------------------------------------------------------------------- diff --git a/tests/ndarray/test_getitem.py b/tests/ndarray/test_getitem.py index a2565561e..3092fa8f3 100644 --- a/tests/ndarray/test_getitem.py +++ b/tests/ndarray/test_getitem.py @@ -6,6 +6,7 @@ ####################################################################### import math +from pathlib import Path import numpy as np import pytest @@ -139,6 +140,119 @@ def test_bool_values(shape, chunks, blocks, idx): assert b2a[idx].ndim == npa[idx].ndim +def test_dense_bool_ndarray_mask_no_recursion(): + nitems = 60_000 + npa = np.arange(nitems, dtype=np.int32) + a = blosc2.asarray(npa, chunks=(20_000,)) + mask = blosc2.asarray(np.ones(nitems, dtype=np.bool_), chunks=(20_000,)) + + np.testing.assert_array_equal(a[mask], npa) + + +def test_lazyexpr_where_full_slice_no_recursion(): + nitems = 60_000 + a = blosc2.linspace(0, 1, nitems, chunks=(20_000,)) + expected = np.linspace(0, 1, nitems) + + np.testing.assert_allclose(a[a < 5][:], expected) + + +def test_lazyexpr_where_full_slice_persisted_reuses_shared_chunk_cache(tmp_path): + nitems = 60_000 + expected = np.linspace(0, 1, nitems) + a = blosc2.asarray( + expected, chunks=(20_000,), blocks=(2_000,), urlpath=str(tmp_path / "persisted.b2nd"), mode="w" + ) + old_nthreads = blosc2.nthreads + blosc2.set_nthreads(max(2, old_nthreads)) + try: + for _ in range(10): + np.testing.assert_allclose(a[a < 5][:], expected) + finally: + blosc2.set_nthreads(old_nthreads) + + +def test_lazyexpr_where_full_slice_cached_repeat_avoids_full_mask_scan(monkeypatch): + nitems = 60_000 + expected = np.arange(5, dtype=np.int64) + a = blosc2.asarray(np.arange(nitems, dtype=np.int64), chunks=(20_000,)) + + np.testing.assert_allclose(a[a < 5][:], expected) + monkeypatch.setattr( + blosc2.LazyExpr, + "_collect_flat_indices_from_bool_ndarray", + 
staticmethod(lambda _mask: (_ for _ in ()).throw(AssertionError("mask scan should be cached"))), + ) + + np.testing.assert_allclose(a[a < 5][:], expected) + + +@pytest.mark.parametrize("mode", ["r", "a"]) +def test_lazyexpr_where_full_slice_persistent_uses_hot_cache_without_persisting(tmp_path, monkeypatch, mode): + nitems = 60_000 + expected = np.arange(5, dtype=np.int64) + urlpath = tmp_path / "persisted_readonly.b2nd" + blosc2.asarray( + np.arange(nitems, dtype=np.int64), chunks=(20_000,), blocks=(2_000,), urlpath=urlpath, mode="w" + ) + persisted = blosc2.open(urlpath, mode=mode) + initial_size = urlpath.stat().st_size + indexing = __import__("blosc2.indexing", fromlist=["QUERY_CACHE_VLMETA_KEY", "_hot_cache_clear"]) + payload_path = Path(indexing._query_cache_payload_path(persisted)) + indexing._hot_cache_clear() + + np.testing.assert_allclose(persisted[persisted < 5][:], expected) + monkeypatch.setattr( + blosc2.LazyExpr, + "_collect_flat_indices_from_bool_ndarray", + staticmethod(lambda _mask: (_ for _ in ()).throw(AssertionError("mask scan should be cached"))), + ) + + np.testing.assert_allclose(persisted[persisted < 5][:], expected) + assert not payload_path.exists() + assert urlpath.stat().st_size == initial_size + assert indexing.QUERY_CACHE_VLMETA_KEY not in persisted.schunk.vlmeta + + +def test_sparse_bool_mask_routes_through_take_fastpath(monkeypatch): + nitems = 120_000 + npa = np.arange(nitems, dtype=np.int32) + a = blosc2.asarray(npa, chunks=(20_000,)) + mask = np.zeros(nitems, dtype=np.bool_) + mask[[1, 10, 11_111, 55_555, nitems - 1]] = True + + call_count = {"take": 0} + original_take = blosc2.take + + def wrapped_take(*args, **kwargs): + call_count["take"] += 1 + return original_take(*args, **kwargs) + + monkeypatch.setattr(blosc2, "take", wrapped_take) + + np.testing.assert_array_equal(a[mask], npa[mask]) + assert call_count["take"] == 1 + + +def test_dense_bool_mask_skips_take_fastpath(monkeypatch): + nitems = 60_000 + npa = np.arange(nitems, 
dtype=np.int32) + a = blosc2.asarray(npa, chunks=(20_000,)) + mask = np.ones(nitems, dtype=np.bool_) + + call_count = {"take": 0} + original_take = blosc2.take + + def wrapped_take(*args, **kwargs): + call_count["take"] += 1 + return original_take(*args, **kwargs) + + monkeypatch.setattr(blosc2, "take", wrapped_take) + + np.testing.assert_array_equal(a[mask], npa[mask]) + assert call_count["take"] == 0 + + @pytest.mark.parametrize( ("shape", "chunks", "blocks"), [ @@ -171,6 +285,87 @@ def test_ndarray(dtype): np.testing.assert_almost_equal(a_slice, na_slice) +def test_take_1d_uses_sparse_path_matches_numpy(tmp_path): + npa = np.arange(1000, dtype=np.int32) + a = blosc2.asarray(npa, chunks=(128,), urlpath=tmp_path / "take_sparse.b2nd", mode="w") + idx = np.array([999, 998, 997, 997, 500, 129, 128, 127, 126, 33, 32, 31, 31, 0], dtype=np.int64) + + np.testing.assert_array_equal(a.take(idx)[()], npa[idx]) + np.testing.assert_array_equal(a[idx], npa[idx]) + + +def test_take_1d_sparse_path_negative_indices(): + npa = np.arange(20, dtype=np.int32) + a = blosc2.asarray(npa, chunks=(8,)) + idx = np.array([-1, -5, 0, 3], dtype=np.int64) + + np.testing.assert_array_equal(a.take(idx)[()], npa[idx]) + np.testing.assert_array_equal(a[idx], npa[idx]) + + +def test_take_1d_sparse_path_structured_non_behaved_partitions(): + npa = np.empty((100,), dtype=[("a", np.int32), ("b", np.int32)]) + npa["a"] = np.arange(1, 101) + npa["b"] = np.arange(200, 100, -1) + a = blosc2.asarray(npa, chunks=(44,), blocks=(33,)) + + for idx in [ + np.arange(2, 100), + np.arange(99, 1, -1), + np.array([5, 1, 5, 99, 0, 44, 43], dtype=np.int64), + ]: + np.testing.assert_array_equal(a.take(idx)[()], npa[idx]) + np.testing.assert_array_equal(a[idx], npa[idx]) + + +def test_ndarray_take_1d_matches_numpy(): + npa = np.arange(20, dtype=np.int32) + a = blosc2.asarray(npa, chunks=(7,)) + idx = np.array([5, 1, -1, 5, 0], dtype=np.int64) + + result = a.take(idx) + assert isinstance(result, blosc2.NDArray) + 
np.testing.assert_array_equal(result[()], np.take(npa, idx)) + + +def test_ndarray_take_axis_with_nd_indices_matches_numpy(): + npa = np.arange(3 * 4 * 5, dtype=np.int32).reshape(3, 4, 5) + a = blosc2.asarray(npa, chunks=(2, 2, 3)) + idx = np.array([[3, 0], [1, -1]], dtype=np.int64) + + expected = np.take(npa, idx, axis=1) + result = a.take(idx, axis=1) + top_level_result = blosc2.take(a, idx, axis=1) + assert isinstance(result, blosc2.NDArray) + assert isinstance(top_level_result, blosc2.NDArray) + np.testing.assert_array_equal(result[()], expected) + np.testing.assert_array_equal(top_level_result[()], expected) + + +def test_ndarray_take_axis_none_nd_fallback_matches_numpy(): + npa = np.arange(3 * 4 * 5, dtype=np.int32).reshape(3, 4, 5) + a = blosc2.asarray(npa, chunks=(2, 2, 3)) + idx = np.array([[0, -1], [17, 5]], dtype=np.int64) + + expected = np.take(npa, idx, axis=None) + result = a.take(idx) + top_level_result = blosc2.take(a, idx) + assert isinstance(result, blosc2.NDArray) + assert isinstance(top_level_result, blosc2.NDArray) + np.testing.assert_array_equal(result[()], expected) + np.testing.assert_array_equal(top_level_result[()], expected) + + +def test_ndarray_take_rejects_bad_indices_and_axis(): + a = blosc2.asarray(np.arange(12, dtype=np.int32).reshape(3, 4)) + with pytest.raises(TypeError, match="integer"): + a.take(np.array([1.5]), axis=0) + with pytest.raises(ValueError, match="axis"): + a.take([0], axis=2) + with pytest.raises(IndexError, match="bounds"): + a.take([3], axis=0) + + @pytest.mark.parametrize( ("shape", "chunkshape", "axis", "indices"), [ @@ -225,3 +420,238 @@ def test_take_along_axis(shape, chunkshape, axis): # Compare np.testing.assert_array_equal(result[()], expected) + + +@pytest.mark.parametrize( + ("shape", "chunks", "blocks", "axis", "indices"), + [ + # 2D + ((6, 7), (4, 5), (3, 4), 0, [0, 3, 5]), + ((6, 7), (4, 5), (3, 4), 1, [0, 3, 6]), + ((20, 15), (6, 7), (3, 4), 0, [0, 10, 19]), + ((20, 15), (6, 7), (3, 4), 1, [0, 7, 
14]), + # 3D + ((5, 6, 7), (3, 4, 5), (2, 2, 3), 0, [0, 2, 4]), + ((5, 6, 7), (3, 4, 5), (2, 2, 3), 1, [0, 3, 5]), + ((5, 6, 7), (3, 4, 5), (2, 2, 3), 2, [0, 3, 6]), + ((9, 10, 11), (4, 5, 6), (2, 3, 3), 0, [0, 4, 8]), + ((9, 10, 11), (4, 5, 6), (2, 3, 3), 1, [0, 5, 9]), + ((9, 10, 11), (4, 5, 6), (2, 3, 3), 2, [0, 5, 10]), + # 4D + ((4, 5, 6, 7), (3, 3, 4, 5), (2, 2, 2, 3), 0, [0, 2, 3]), + ((4, 5, 6, 7), (3, 3, 4, 5), (2, 2, 2, 3), 2, [0, 3, 5]), + ((4, 5, 6, 7), (3, 3, 4, 5), (2, 2, 2, 3), 3, [0, 3, 6]), + ], +) +def test_ndarray_take_ndim(shape, chunks, blocks, axis, indices): + npa = np.arange(np.prod(shape), dtype=np.float64).reshape(shape) + a = blosc2.asarray(npa, chunks=chunks, blocks=blocks) + + expected = np.take(npa, indices, axis=axis) + result = a.take(indices, axis=axis) + top_result = blosc2.take(a, indices, axis=axis) + + assert isinstance(result, blosc2.NDArray) + assert isinstance(top_result, blosc2.NDArray) + np.testing.assert_array_equal(result[:], expected) + np.testing.assert_array_equal(top_result[:], expected) + + +@pytest.mark.parametrize( + ("shape", "chunks", "blocks", "indices"), + [ + # 2D, 3D, 4D with axis=None + ((6, 7), (4, 5), (3, 4), [0, 10, 41]), + ((5, 6, 7), (3, 4, 5), (2, 2, 3), [0, 50, 209]), + ((4, 5, 6, 7), (3, 3, 4, 5), (2, 2, 2, 3), [0, 100, 500, 839]), + ], +) +def test_ndarray_take_ndim_axis_none(shape, chunks, blocks, indices): + npa = np.arange(np.prod(shape), dtype=np.float64).reshape(shape) + a = blosc2.asarray(npa, chunks=chunks, blocks=blocks) + + expected = np.take(npa, indices, axis=None) + result = a.take(indices) + top_result = blosc2.take(a, indices) + + assert isinstance(result, blosc2.NDArray) + assert isinstance(top_result, blosc2.NDArray) + np.testing.assert_array_equal(result[:], expected) + np.testing.assert_array_equal(top_result[:], expected) + + +@pytest.mark.parametrize( + ("shape", "chunks", "blocks", "axis", "indices"), + [ + # 2D, 3D, 4D with multi-dim index arrays + ((6, 7), (4, 5), (3, 4), 1, 
np.array([[0, 3], [6, 2]])), + ((5, 6, 7), (3, 4, 5), (2, 2, 3), 0, np.array([[0, 2], [4, 1]])), + ((4, 5, 6, 7), (3, 3, 4, 5), (2, 2, 2, 3), 2, np.array([[0, 3], [5, 1]])), + ], +) +def test_ndarray_take_ndim_multidim_indices(shape, chunks, blocks, axis, indices): + npa = np.arange(np.prod(shape), dtype=np.float64).reshape(shape) + a = blosc2.asarray(npa, chunks=chunks, blocks=blocks) + + expected = np.take(npa, indices, axis=axis) + result = a.take(indices, axis=axis) + + assert isinstance(result, blosc2.NDArray) + np.testing.assert_array_equal(result[:], expected) + + +@pytest.mark.parametrize( + ("shape", "chunks", "blocks", "axis", "indices"), + [ + # Negative indices + ((6, 7), (4, 5), (3, 4), 0, [-1, -3, 0]), + ((6, 7), (4, 5), (3, 4), 1, [-1, -7, 3, 0]), + ((5, 6, 7), (3, 4, 5), (2, 2, 3), 2, [-1, -7, 3]), + # Duplicate indices + ((6, 7), (4, 5), (3, 4), 0, [0, 5, 0, 5, 3]), + ((5, 6, 7), (3, 4, 5), (2, 2, 3), 1, [3, 3, 5, 5, 0]), + # Single index (scalar-like list) + ((6, 7), (4, 5), (3, 4), 0, [3]), + ((6, 7), (4, 5), (3, 4), 1, [0]), + # Empty indices + ((6, 7), (4, 5), (3, 4), 0, []), + ((6, 7), (4, 5), (3, 4), 1, []), + ((5, 6, 7), (3, 4, 5), (2, 2, 3), 0, []), + ((5, 6, 7), (3, 4, 5), (2, 2, 3), 1, []), + ((5, 6, 7), (3, 4, 5), (2, 2, 3), 2, []), + ], +) +def test_ndarray_take_ndim_edge_cases(shape, chunks, blocks, axis, indices): + npa = np.arange(np.prod(shape), dtype=np.float64).reshape(shape) + a = blosc2.asarray(npa, chunks=chunks, blocks=blocks) + + expected = np.take(npa, indices, axis=axis) + result = a.take(indices, axis=axis) + + assert isinstance(result, blosc2.NDArray) + np.testing.assert_array_equal(result[:], expected) + + +@pytest.mark.parametrize( + ("shape", "chunks", "blocks", "axis"), + [ + # 2D with non-behaved (non-even) partitions + ((7, 11), (5, 7), (3, 5), 0), + ((7, 11), (5, 7), (3, 5), 1), + # 3D with non-behaved partitions + ((7, 11, 13), (5, 7, 8), (3, 4, 5), 0), + ((7, 11, 13), (5, 7, 8), (3, 4, 5), 1), + ((7, 11, 13), (5, 
7, 8), (3, 4, 5), 2), + ], +) +def test_ndarray_take_ndim_non_behaved_partitions(shape, chunks, blocks, axis): + npa = np.arange(np.prod(shape), dtype=np.int32).reshape(shape) + a = blosc2.asarray(npa, chunks=chunks, blocks=blocks) + + rng = np.random.default_rng(42) + indices = rng.integers(0, shape[axis], size=min(shape[axis], 8)).tolist() + + expected = np.take(npa, indices, axis=axis) + result = a.take(indices, axis=axis) + + assert isinstance(result, blosc2.NDArray) + np.testing.assert_array_equal(result[:], expected) + + +@pytest.mark.parametrize( + ("shape", "chunks", "blocks", "axis"), + [ + # Different dtypes + ((6, 7), (4, 5), (3, 4), 0), + ((5, 6, 7), (3, 4, 5), (2, 2, 3), 1), + ((4, 5, 6, 7), (3, 3, 4, 5), (2, 2, 2, 3), 2), + ], +) +def test_ndarray_take_ndim_dtypes(shape, chunks, blocks, axis): + for dtype in [np.int32, np.int64, np.float32, np.float64, np.complex128]: + npa = np.arange(np.prod(shape), dtype=dtype).reshape(shape) + a = blosc2.asarray(npa, chunks=chunks, blocks=blocks) + + rng = np.random.default_rng(42) + indices = rng.integers(0, shape[axis], size=min(shape[axis], 5)).tolist() + + expected = np.take(npa, indices, axis=axis) + result = a.take(indices, axis=axis) + + assert isinstance(result, blosc2.NDArray) + np.testing.assert_array_equal(result[:], expected) + + +# --- __getitem__ fancy indexing with integer arrays (uses b2nd_get_sparse_cbuffer) --- + + +@pytest.mark.parametrize( + ("shape", "chunks", "blocks", "indices"), + [ + # 1-D with 1-D index (was already sparse, regression check) + ((100,), (23,), (7,), [0, 5, 50, 99]), + # 1-D with 2-D index (was fancy indexing before, now sparse) + ((100,), (23,), (7,), [[1, 3], [5, 7]]), + # 2-D with 1-D index (was fancy indexing before, now sparse) + ((6, 7), (4, 5), (3, 4), [0, 3, 5]), + ((20, 15), (6, 7), (3, 4), [0, 10, 19]), + # 2-D with 2-D index (was fancy indexing before, now sparse) + ((6, 7), (4, 5), (3, 4), [[0, 3], [5, 2]]), + # 3-D with 1-D index + ((5, 6, 7), (3, 4, 5), (2, 2, 
3), [0, 2, 4]), + # 3-D with 2-D index + ((5, 6, 7), (3, 4, 5), (2, 2, 3), [[0, 2], [4, 1]]), + # 4-D with 1-D index + ((4, 5, 6, 7), (3, 3, 4, 5), (2, 2, 2, 3), [0, 2, 3]), + ], +) +def test_getitem_integer_array_fancy_index(shape, chunks, blocks, indices): + npa = np.arange(np.prod(shape), dtype=np.float64).reshape(shape) + a = blosc2.asarray(npa, chunks=chunks, blocks=blocks) + + expected = npa[indices] + result = a[indices] + + assert isinstance(result, np.ndarray) + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + ("shape", "indices"), + [ + ((6, 7), [-1, 0, 3, -3]), + ((6, 7), [0, 5, 0, 5, 3]), + ((6, 7), [3]), + ((6, 7), []), + ((5, 6, 7), [-1, 0, 4, -2]), + ((5, 6, 7), [0, 4, 0, 2]), + ((5, 6, 7), [2]), + ((5, 6, 7), []), + ], +) +def test_getitem_integer_array_edge_cases(shape, indices): + npa = np.arange(np.prod(shape), dtype=np.float64).reshape(shape) + a = blosc2.asarray(npa) + + expected = npa[indices] + result = a[indices] + + assert isinstance(result, np.ndarray) + np.testing.assert_array_equal(result, expected) + + +def test_getitem_integer_array_out_of_bounds(): + a = blosc2.asarray(np.arange(12, dtype=np.int32).reshape(3, 4)) + with pytest.raises(IndexError, match="bounds"): + _ = a[[3]] + with pytest.raises(IndexError, match="bounds"): + _ = a[[-4]] + + +def test_getitem_integer_array_still_uses_fancy_for_boolean(): + """Boolean arrays should NOT be routed through the sparse path.""" + a = blosc2.asarray(np.arange(12, dtype=np.int32).reshape(3, 4)) + mask = np.array([True, False, True]) + expected = np.arange(12, dtype=np.int32).reshape(3, 4)[mask] + result = a[mask] + np.testing.assert_array_equal(result, expected) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index c89bb3a2c..bbcfe43c9 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -1661,6 +1661,10 @@ def test_hot_cache_put_then_get(): _clear_caches() coords = np.array([1, 2, 3], dtype=np.int64) 
indexing._hot_cache_put("abc", coords) + entry = next(iter(indexing._HOT_CACHE.values())) + assert isinstance(entry, indexing._CompressedHotCoords) + assert isinstance(entry.data, bytes) + assert entry.nbytes == indexing._HOT_CACHE_BYTES result = indexing._hot_cache_get("abc") assert result is not None np.testing.assert_array_equal(result, coords) @@ -1678,18 +1682,21 @@ def test_hot_cache_scope_isolation(): def test_hot_cache_byte_limit_evicts_lru(): _clear_caches() - # Each entry is 100 * 8 = 800 bytes. Budget is 128 KB = 131072 bytes. - # Fill with 165 entries (165 * 800 = 132000 > 131072); expect oldest evicted. - entry_size = 100 - for i in range(165): - coords = np.arange(entry_size, dtype=np.int64) - indexing._hot_cache_put(f"key{i}", coords) - - # First keys should have been evicted. - assert indexing._hot_cache_get("key0") is None - # Most recent keys should still be present. - assert indexing._hot_cache_get("key164") is not None - assert indexing._HOT_CACHE_BYTES <= indexing.QUERY_CACHE_MAX_MEM_NBYTES + coords = np.arange(10_000, dtype=np.int64) + compressed = indexing._compress_hot_coords(coords) + original_budget = indexing.QUERY_CACHE_MAX_MEM_NBYTES + try: + indexing.QUERY_CACHE_MAX_MEM_NBYTES = compressed.nbytes * 2 + for i in range(3): + indexing._hot_cache_put(f"key{i}", coords) + + # First key should have been evicted, the two newest should remain. 
+ assert indexing._hot_cache_get("key0") is None + assert indexing._hot_cache_get("key1") is not None + assert indexing._hot_cache_get("key2") is not None + assert indexing._HOT_CACHE_BYTES <= indexing.QUERY_CACHE_MAX_MEM_NBYTES + finally: + indexing.QUERY_CACHE_MAX_MEM_NBYTES = original_budget def test_hot_cache_clear(): @@ -1725,12 +1732,11 @@ def test_in_memory_array_hot_cache_hit(): # --------------------------------------------------------------------------- -# Stage 4 – Persistent cache: cross-session hit +# Stage 4 – Persistent arrays still use hot cache only # --------------------------------------------------------------------------- -def test_persistent_cache_survives_reopen(tmp_path): - """After reopening the array the persistent cache should serve the result.""" +def test_persistent_arrays_do_not_create_query_cache_artifacts(tmp_path): arr, urlpath = _make_persistent_array(tmp_path) _clear_caches() @@ -1738,98 +1744,35 @@ def test_persistent_cache_survives_reopen(tmp_path): result1 = expr.argsort().compute() payload_path = indexing._query_cache_payload_path(arr) - assert Path(payload_path).exists(), "persistent payload store should be created" - - catalog = indexing._load_query_cache_catalog(arr) - assert catalog is not None - assert len(catalog["entries"]) == 1 + assert indexing._HOT_CACHE_BYTES > 0 + assert not Path(payload_path).exists() + assert indexing._load_query_cache_catalog(arr) is None - # Re-open the array in a fresh process-local state. 
_clear_caches() arr2 = blosc2.open(urlpath, mode="r") result2 = blosc2.lazyexpr("(id >= 20_000) & (id < 25_000)", arr2.fields).where(arr2).argsort().compute() np.testing.assert_array_equal(result1, result2) + assert not Path(indexing._query_cache_payload_path(arr2)).exists() + assert indexing._load_query_cache_catalog(arr2) is None -def test_persistent_cache_not_created_for_non_persistent_array(): - _clear_caches() - data = np.arange(10_000, dtype=np.int64) - arr = blosc2.asarray(data, chunks=(1_000,), blocks=(200,)) - arr.create_index(kind=blosc2.IndexKind.FULL) - result = indexing._persistent_cache_lookup(arr, "any_digest") - assert result is None - - -# --------------------------------------------------------------------------- -# Stage 3 – Per-entry logical-byte size limit -# --------------------------------------------------------------------------- - - -def test_persistent_entry_size_limit_rejected(tmp_path): - """Entries whose logical int64 position bytes exceed the entry limit must not be stored.""" +def test_persistent_cache_helpers_are_disabled(tmp_path): arr, _ = _make_persistent_array(tmp_path, n=50_000) _clear_caches() - # 10k coordinates imply 80 KB of logical int64 positions and should exceed the 64 KB limit. 
rng = np.random.default_rng(42) coords = np.sort(rng.choice(50_000, size=10_000, replace=False)).astype(np.int64) - - entry_nbytes = indexing._query_cache_entry_nbytes(coords) - assert entry_nbytes > indexing.QUERY_CACHE_MAX_ENTRY_NBYTES, ( - f"test setup error: logical size {entry_nbytes} must exceed " - f"{indexing.QUERY_CACHE_MAX_ENTRY_NBYTES} for this test to be meaningful" - ) - descriptor = indexing._normalize_query_descriptor("(id >= 0) & (id < 50000)", ["__self__"], None) digest = indexing._query_cache_digest(descriptor) - result = indexing._persistent_cache_insert(arr, digest, coords, descriptor) - assert result is False, "oversized entry must be rejected" - - -def test_persistent_cache_overflow_nukes_persistent_entries_and_keeps_newest(tmp_path, monkeypatch): - arr, urlpath = _make_persistent_array(tmp_path, n=8_000) - _clear_caches() - - rng = np.random.default_rng(123) - payloads = [] - for i in range(3): - coords = np.sort(rng.choice(8_000, size=256, replace=False)).astype(np.int64) - descriptor = indexing._normalize_query_descriptor( - f"(id >= {i}) & (id < {i + 1})", ["__self__"], None - ) - digest = indexing._query_cache_digest(descriptor) - nbytes = indexing._query_cache_entry_nbytes(coords) - payloads.append((digest, descriptor, coords, nbytes)) - - budget = max(payloads[0][3] + payloads[1][3], payloads[1][3] + payloads[2][3]) - monkeypatch.setattr(indexing, "QUERY_CACHE_MAX_PERSISTENT_NBYTES", budget) - - for digest, descriptor, coords, _ in payloads: - assert indexing._persistent_cache_insert(arr, digest, coords, descriptor) is True - - catalog = indexing._load_query_cache_catalog(arr) - assert catalog is not None - assert catalog["max_persistent_nbytes"] == budget - assert set(catalog["entries"]) == {payloads[2][0]} - assert catalog["entries"][payloads[2][0]]["slot"] == 0 - assert catalog["next_slot"] == 1 - assert catalog["persistent_nbytes"] == payloads[2][3] - - assert indexing._persistent_cache_lookup(arr, payloads[0][0]) is None - assert 
indexing._persistent_cache_lookup(arr, payloads[1][0]) is None - np.testing.assert_array_equal(indexing._persistent_cache_lookup(arr, payloads[2][0]), payloads[2][2]) - - _clear_caches() - reopened = blosc2.open(urlpath, mode="r") - assert indexing._persistent_cache_lookup(reopened, payloads[1][0]) is None - np.testing.assert_array_equal( - indexing._persistent_cache_lookup(reopened, payloads[2][0]), payloads[2][2] - ) + assert indexing._persistent_cache_insert(arr, digest, coords, descriptor) is False + assert indexing._persistent_cache_lookup(arr, digest) is None + assert indexing._load_query_cache_catalog(arr) is None + assert not Path(indexing._query_cache_payload_path(arr)).exists() -def test_persistent_cache_overflow_preserves_hot_cache(tmp_path, monkeypatch): +def test_store_cached_coords_for_persistent_array_uses_hot_cache_only(tmp_path): arr, _ = _make_persistent_array(tmp_path, n=8_000) _clear_caches() @@ -1838,27 +1781,18 @@ def test_persistent_cache_overflow_preserves_hot_cache(tmp_path, monkeypatch): expr1 = "(id >= 0) & (id < 256)" expr2 = "(id >= 256) & (id < 512)" - budget = indexing._query_cache_entry_nbytes(coords1) - monkeypatch.setattr(indexing, "QUERY_CACHE_MAX_PERSISTENT_NBYTES", budget) - indexing.store_cached_coords(arr, expr1, [indexing.SELF_TARGET_NAME], None, coords1) indexing.store_cached_coords(arr, expr2, [indexing.SELF_TARGET_NAME], None, coords2) - assert ( - indexing._persistent_cache_lookup( - arr, - indexing._query_cache_digest( - indexing._normalize_query_descriptor(expr1, [indexing.SELF_TARGET_NAME], None) - ), - ) - is None - ) + assert indexing._persistent_cache_lookup(arr, "unused") is None np.testing.assert_array_equal( indexing.get_cached_coords(arr, expr1, [indexing.SELF_TARGET_NAME], None), coords1 ) np.testing.assert_array_equal( indexing.get_cached_coords(arr, expr2, [indexing.SELF_TARGET_NAME], None), coords2 ) + assert indexing._load_query_cache_catalog(arr) is None + assert not 
Path(indexing._query_cache_payload_path(arr)).exists() # --------------------------------------------------------------------------- @@ -1874,7 +1808,8 @@ def test_invalidation_on_drop_index(tmp_path): expr.argsort().compute() payload_path = indexing._query_cache_payload_path(arr) - assert Path(payload_path).exists() + assert indexing._HOT_CACHE_BYTES > 0 + assert not Path(payload_path).exists() arr.drop_index() assert not Path(payload_path).exists(), "payload file should be removed after drop_index" @@ -1890,7 +1825,8 @@ def test_invalidation_on_rebuild_index(tmp_path): expr.argsort().compute() payload_path = indexing._query_cache_payload_path(arr) - assert Path(payload_path).exists() + assert indexing._HOT_CACHE_BYTES > 0 + assert not Path(payload_path).exists() arr.rebuild_index() assert not Path(payload_path).exists() @@ -1905,6 +1841,8 @@ def test_invalidation_on_compact_index(tmp_path): expr.argsort().compute() payload_path = indexing._query_cache_payload_path(arr) + assert indexing._HOT_CACHE_BYTES > 0 + assert not Path(payload_path).exists() arr.compact_index() assert not Path(payload_path).exists() assert indexing._HOT_CACHE_BYTES == 0 @@ -1918,7 +1856,8 @@ def test_invalidation_on_mark_indexes_stale(tmp_path): expr.argsort().compute() payload_path = indexing._query_cache_payload_path(arr) - assert Path(payload_path).exists() + assert indexing._HOT_CACHE_BYTES > 0 + assert not Path(payload_path).exists() indexing.mark_indexes_stale(arr) assert not Path(payload_path).exists() @@ -1933,7 +1872,8 @@ def test_invalidation_on_append(tmp_path): expr.argsort().compute() payload_path = indexing._query_cache_payload_path(arr) - assert Path(payload_path).exists() + assert indexing._HOT_CACHE_BYTES > 0 + assert not Path(payload_path).exists() dtype = np.dtype([("id", np.int64), ("val", np.float32)]) extra = np.empty(1_000, dtype=dtype) @@ -1950,8 +1890,8 @@ def test_invalidation_on_append(tmp_path): # 
--------------------------------------------------------------------------- -def test_ordered_query_indices_cached(tmp_path): - """Ordered .argsort(order=...).compute() results are cached and reused.""" +def test_ordered_query_indices_cached(tmp_path, monkeypatch): + """Ordered .argsort(order=...).compute() results are cached and reused in-process.""" arr, _ = _make_persistent_array(tmp_path) _clear_caches() @@ -1959,9 +1899,12 @@ def test_ordered_query_indices_cached(tmp_path): result1 = lazy.argsort(order="id").compute() assert indexing._HOT_CACHE_BYTES > 0 - - _clear_caches() arr2 = blosc2.open(arr.urlpath, mode="r") + monkeypatch.setattr( + indexing, + "ordered_query_indices", + lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("ordered query should be cached")), + ) result2 = ( blosc2.lazyexpr("(id >= 10_000) & (id < 20_000)", arr2.fields) .where(arr2) @@ -2004,15 +1947,15 @@ def test_multiple_distinct_queries_in_same_cache(tmp_path): r1 = expr1.argsort().compute() r2 = expr2.argsort().compute() - catalog = indexing._load_query_cache_catalog(arr) - assert catalog is not None - assert len(catalog["entries"]) == 2 - # Verify both results are consistent with scan. 
dtype = arr.dtype data = arr[:] - np.testing.assert_array_equal(r1, np.where((data["id"] >= 5_000) & (data["id"] < 10_000))[0]) - np.testing.assert_array_equal(r2, np.where((data["id"] >= 20_000) & (data["id"] < 25_000))[0]) + expected1 = np.where((data["id"] >= 5_000) & (data["id"] < 10_000))[0] + expected2 = np.where((data["id"] >= 20_000) & (data["id"] < 25_000))[0] + np.testing.assert_array_equal(r1, expected1) + np.testing.assert_array_equal(r2, expected2) + assert len(indexing._HOT_CACHE) == 2 + assert indexing._load_query_cache_catalog(arr) is None # --------------------------------------------------------------------------- @@ -2042,21 +1985,18 @@ def test_hot_cache_avoids_recompute(tmp_path): def test_value_path_cache_hit_persistent(tmp_path): - """arr[cond][:] on a persistent full-indexed array caches coords and serves warm calls.""" - arr, urlpath = _make_persistent_array(tmp_path) + """arr[cond][:] on a persistent full-indexed array caches coords in-process only.""" + arr, _ = _make_persistent_array(tmp_path) _clear_caches() cond = blosc2.lazyexpr("(id >= 10_000) & (id < 12_000)", arr.fields) result1 = arr[cond][:] - # After first call, cache should have an entry. - catalog = indexing._load_query_cache_catalog(arr) - assert catalog is not None - assert len(catalog["entries"]) == 1 + assert indexing._HOT_CACHE_BYTES > 0 + assert indexing._load_query_cache_catalog(arr) is None + assert not Path(indexing._query_cache_payload_path(arr)).exists() - # Warm call: serve from cache. 
- _clear_caches() # only clears hot cache; persistent ObjectArray remains - arr2 = blosc2.open(urlpath, mode="r") + arr2 = blosc2.open(arr.urlpath, mode="r") cond2 = blosc2.lazyexpr("(id >= 10_000) & (id < 12_000)", arr2.fields) result2 = arr2[cond2][:] @@ -2203,38 +2143,28 @@ def test_ondisk_value_path_correct(tmp_path, kind): def test_ondisk_value_path_full_warm_hits_cache(tmp_path): - """After the first on-disk full-index value query, warm calls use the cache.""" + """After the first on-disk full-index value query, warm calls use the in-process cache.""" arr = _make_structured_array(tmp_path, kind="full") - urlpath = arr.urlpath _clear_caches() - # Cold call – populates persistent cache r1 = _value_query(arr) - catalog = indexing._load_query_cache_catalog(arr) - assert catalog is not None - assert len(catalog["entries"]) == 1 - - # Warm call after clearing hot cache (simulates a new process re-opening the file) - _clear_caches() - arr2 = blosc2.open(urlpath, mode="r") + assert indexing._HOT_CACHE_BYTES > 0 + assert indexing._load_query_cache_catalog(arr) is None + arr2 = blosc2.open(arr.urlpath, mode="r") r2 = _value_query(arr2) np.testing.assert_array_equal(r1, r2) @pytest.mark.parametrize("kind", ["summary", "bucket"]) def test_ondisk_value_path_non_exact_warm_hits_cache(tmp_path, kind): - """Summary/bucket on-disk value queries should populate the coordinate cache.""" + """Summary/bucket on-disk value queries should populate the in-process coordinate cache.""" arr = _make_structured_array(tmp_path, kind=kind) - urlpath = arr.urlpath _clear_caches() r1 = _value_query(arr) - catalog = indexing._load_query_cache_catalog(arr) - assert catalog is not None - assert len(catalog["entries"]) == 1 - - _clear_caches() - arr2 = blosc2.open(urlpath, mode="r") + assert indexing._HOT_CACHE_BYTES > 0 + assert indexing._load_query_cache_catalog(arr) is None + arr2 = blosc2.open(arr.urlpath, mode="r") r2 = _value_query(arr2) np.testing.assert_array_equal(r1, r2) @@ -2261,16 
+2191,15 @@ def test_ondisk_value_path_non_full_correct(tmp_path, kind): @pytest.mark.parametrize("kind", ["full"]) def test_ondisk_indices_path_warm_hits_cache(tmp_path, kind): - """After the first on-disk .argsort().compute(), warm calls use the cache.""" + """After the first on-disk .argsort().compute(), warm calls use the in-process cache.""" arr = _make_structured_array(tmp_path, kind=kind) - urlpath = arr.urlpath _clear_caches() expr = blosc2.lazyexpr("(id >= 5_000) & (id < 7_000)", arr.fields).where(arr) r1 = expr.argsort().compute() - _clear_caches() - arr2 = blosc2.open(urlpath, mode="r") + assert indexing._HOT_CACHE_BYTES > 0 + arr2 = blosc2.open(arr.urlpath, mode="r") expr2 = blosc2.lazyexpr("(id >= 5_000) & (id < 7_000)", arr2.fields).where(arr2) r2 = expr2.argsort().compute() @@ -2345,20 +2274,22 @@ def test_ondisk_indices_path_no_cross_array_hot_cache_contamination(tmp_path): assert r2.size == 0 -def test_ondisk_empty_indices_result_cached(tmp_path): - arr, urlpath = _make_persistent_array(tmp_path) +def test_ondisk_empty_indices_result_cached(tmp_path, monkeypatch): + arr, _ = _make_persistent_array(tmp_path) _clear_caches() expr = blosc2.lazyexpr("(id >= 60_000) & (id < 61_000)", arr.fields).where(arr) result1 = expr.argsort().compute()[:] assert result1.size == 0 - catalog = indexing._load_query_cache_catalog(arr) - assert catalog is not None - assert len(catalog["entries"]) == 1 - - _clear_caches() - arr2 = blosc2.open(urlpath, mode="r") + assert len(indexing._HOT_CACHE) == 1 + assert indexing._load_query_cache_catalog(arr) is None + monkeypatch.setattr( + indexing, + "plan_query", + lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("empty result should be cached")), + ) + arr2 = blosc2.open(arr.urlpath, mode="r") result2 = ( blosc2.lazyexpr("(id >= 60_000) & (id < 61_000)", arr2.fields).where(arr2).argsort().compute()[:] ) diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py index 7b33cdff1..54615c10b 100644 --- 
a/tests/ndarray/test_lazyexpr.py +++ b/tests/ndarray/test_lazyexpr.py @@ -1760,6 +1760,10 @@ def test_save_proxy_operands_reopen_default_mode(tmp_path): assert isinstance(restored, blosc2.LazyExpr) np.testing.assert_array_equal(restored[:], np.arange(10, dtype=np.int64) * 2) + with blosc2.open(str(expr_path), mode="r") as restored_ctx: + assert isinstance(restored_ctx, blosc2.LazyExpr) + np.testing.assert_array_equal(restored_ctx[:], np.arange(10, dtype=np.int64) * 2) + def test_lazyexpr_vlmeta_in_memory_and_persisted(tmp_path): a = blosc2.asarray(np.arange(5, dtype=np.int64), urlpath=str(tmp_path / "a.b2nd"), mode="w") @@ -1778,7 +1782,11 @@ def test_lazyexpr_vlmeta_in_memory_and_persisted(tmp_path): assert restored.vlmeta["name"] == "sum" assert restored.vlmeta["config"] == {"scale": 1} - restored.vlmeta["note"] = "persisted" + with pytest.raises(ValueError, match="reading mode"): + restored.vlmeta["note"] = "persisted" + + writable = blosc2.open(str(expr_path), mode="a") + writable.vlmeta["note"] = "persisted" reopened = blosc2.open(str(expr_path), mode="r") assert reopened.vlmeta["note"] == "persisted" np.testing.assert_array_equal(reopened[:], np.arange(5, dtype=np.int64) * 2) diff --git a/tests/ndarray/test_lazyexpr_fields.py b/tests/ndarray/test_lazyexpr_fields.py index b0d1d8b8c..c89201d10 100644 --- a/tests/ndarray/test_lazyexpr_fields.py +++ b/tests/ndarray/test_lazyexpr_fields.py @@ -193,7 +193,7 @@ def test_reductions(array_fixture): expr = a1 + a2 - a3 * a4 nres = ne_evaluate("na1 + na2 - na3 * na4") # Use relative tolerance for mean and std - np.testing.assert_allclose(expr.sum()[()], nres.sum()) + np.testing.assert_allclose(expr.sum()[()], nres.sum(), rtol=1e-5) np.testing.assert_allclose(expr.mean()[()], nres.mean(), rtol=1e-5) np.testing.assert_allclose(expr.min()[()], nres.min()) np.testing.assert_allclose(expr.max()[()], nres.max()) diff --git a/tests/ndarray/test_lazyudf.py b/tests/ndarray/test_lazyudf.py index ab0cd814e..cbadd3c5f 100644 --- 
a/tests/ndarray/test_lazyudf.py +++ b/tests/ndarray/test_lazyudf.py @@ -517,6 +517,11 @@ def test_lazyudf_vlmeta_roundtrip(tmp_path): assert restored.vlmeta["name"] == "increment" assert restored.vlmeta["attrs"] == {"version": 1} + with blosc2.open(str(expr_path), mode="r") as restored_ctx: + assert isinstance(restored_ctx, blosc2.LazyUDF) + assert restored_ctx.vlmeta["name"] == "increment" + assert restored_ctx.vlmeta["attrs"] == {"version": 1} + # Test get_chunk method def test_get_chunk(): diff --git a/tests/ndarray/test_proxy.py b/tests/ndarray/test_proxy.py index fc4577d95..7886a9e12 100644 --- a/tests/ndarray/test_proxy.py +++ b/tests/ndarray/test_proxy.py @@ -105,6 +105,30 @@ def test_open(urlpath, shape, chunks, blocks, slices, dtype): blosc2.remove_urlpath(proxy_urlpath) +def test_open_readonly_proxy_keeps_cache_and_source_readonly(tmp_path): + source_path = tmp_path / "source.b2nd" + proxy_path = tmp_path / "proxy.b2nd" + data = np.arange(120, dtype=np.int32).reshape(12, 10) + + source = blosc2.asarray(data, chunks=(4, 5), blocks=(2, 5), urlpath=source_path, mode="w") + proxy = blosc2.Proxy(source, urlpath=proxy_path, mode="w") + proxy.fetch() + cached_size = proxy_path.stat().st_size + del proxy, source + + readonly = blosc2.open(proxy_path) + + assert readonly.schunk.mode == "r" + assert readonly.schunk.vlmeta.mode == "r" + assert readonly.src.schunk.mode == "r" + np.testing.assert_array_equal(readonly[:], data) + assert proxy_path.stat().st_size == cached_size + + with blosc2.open(proxy_path) as readonly_ctx: + assert isinstance(readonly_ctx, blosc2.Proxy) + np.testing.assert_array_equal(readonly_ctx[:], data) + + # Test the ProxyNDSources interface @pytest.mark.parametrize( ("shape", "chunks", "blocks"), diff --git a/tests/ndarray/test_reductions.py b/tests/ndarray/test_reductions.py index ee1f738a2..6c307d741 100644 --- a/tests/ndarray/test_reductions.py +++ b/tests/ndarray/test_reductions.py @@ -195,20 +195,30 @@ def test_fp_accuracy(accuracy, 
dtype): def test_reduce_params(array_fixture, axis, keepdims, dtype_out, reduce_op, kwargs): a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture reduce_args = {"axis": axis} - if reduce_op in {"cumulative_sum", "cumulative_prod"}: - if npcumprod.__name__ == "cumulative_prod": - reduce_args["include_initial"] = keepdims # include_initial only available in cumulative_ - else: + if reduce_op not in {"cumulative_sum", "cumulative_prod"}: reduce_args["keepdims"] = keepdims if reduce_op in ("mean", "std") and dtype_out == np.int16: # mean and std need float dtype as output dtype_out = np.float64 if reduce_op in ("sum", "prod", "mean", "std"): reduce_args["dtype"] = dtype_out - if axis is not None and np.isscalar(axis) and len(a1.shape) >= axis: - return - if isinstance(axis, tuple) and (len(a1.shape) < len(axis) or reduce_op in ("argmax", "argmin")): - return + if axis is not None: + if np.isscalar(axis): + if len(a1.shape) <= axis: + # axis out of bounds for this array + return + elif isinstance(axis, tuple): + if any(ax >= len(a1.shape) for ax in axis) or reduce_op in ("argmax", "argmin"): + return + if reduce_op in {"cumulative_sum", "cumulative_prod"}: + # numpy's cumsum/cumprod do not support tuple axes + return + if reduce_op in {"cumulative_sum", "cumulative_prod"}: + if axis is None and len(a1.shape) > 1: + # Blosc2 requires axis for cumulative ops on non-1D arrays + return + # NumPy uses cumsum/cumprod for these + np_op = "cumsum" if reduce_op == "cumulative_sum" else "cumprod" if reduce_op in {"prod", "cumulative_prod"}: # To avoid overflow, create a1 and a2 with small values na1 = np.linspace(0, 0.1, np.prod(a1.shape), dtype=np.float32).reshape(a1.shape) @@ -222,14 +232,28 @@ def test_reduce_params(array_fixture, axis, keepdims, dtype_out, reduce_op, kwar nres = eval("na1 + na2 - na3 * na4") res = getattr(expr, reduce_op)(**reduce_args, **kwargs) - nres = getattr(nres, reduce_op)(**reduce_args) - tol = 1e-15 if a1.dtype == "float64" else 1e-6 + if reduce_op in 
{"cumulative_sum", "cumulative_prod"}: + # NumPy uses cumsum/cumprod + nres_op = (npcumsum if reduce_op == "cumulative_sum" else npcumprod).__call__ + # Strip out include_initial from reduce_args for numpy (not supported) + np_reduce_args = {k: v for k, v in reduce_args.items() if k != "include_initial"} + nres = nres_op(nres, **np_reduce_args) + else: + nres = getattr(nres, reduce_op)(**reduce_args) + if reduce_op in {"cumulative_sum", "cumulative_prod"}: + # Cumulative ops through compressed chunks accumulate absolute error + # across chunk boundaries. Use atol only (error is absolute, not relative). + atol = 1e-8 if a1.dtype == "float64" else 1.0 + rtol = 0 + else: + atol = 1e-15 if a1.dtype == "float64" else 1e-6 + rtol = atol if kwargs != {}: if not np.isscalar(res): assert isinstance(res, blosc2.NDArray) - np.testing.assert_allclose(res[()], nres, atol=tol, rtol=tol) + np.testing.assert_allclose(res[()], nres, atol=atol, rtol=rtol) else: - np.testing.assert_allclose(res, nres, atol=tol, rtol=tol) + np.testing.assert_allclose(res, nres, atol=atol, rtol=rtol) # TODO: "prod" is not supported here because it overflows with current values @@ -296,6 +320,9 @@ def test_broadcast_params(axis, keepdims, reduce_op, shapes): if reduce_op in ("argmax", "argmin", "cumulative_sum", "cumulative_prod"): axis = 1 if isinstance(axis, tuple) else axis axis = 0 if reduce_op[:3] == "cum" else axis + # prod overflows for large array sizes; skip those cases + if reduce_op == "prod" and np.prod(np.prod(shapes[1])) >= 1e4: + return reduce_args = {"axis": axis} if reduce_op in {"cumulative_sum", "cumulative_prod"}: if npcumprod.__name__ == "cumulative_prod": @@ -372,8 +399,17 @@ def test_reduce_item(reduce_op, dtype, stripes, stripe_len, shape, chunks): with pytest.raises(ValueError): getattr(na[_slice], reduce_op)() else: - res = getattr(a, reduce_op)(item=_slice) - nres = getattr(na[_slice], reduce_op)() + if reduce_op in ("cumulative_sum", "cumulative_prod"): + # Blosc2 requires 
axis for cumulative ops on non-1D arrays. + # Use the dimension that the stripe iterates over (0 for rows, 1 for columns). + axis = 0 if stripes == "rows" else 1 + res = getattr(a, reduce_op)(item=_slice, axis=axis) + # NumPy uses cumsum/cumprod for these operations + np_op = "cumsum" if reduce_op == "cumulative_sum" else "cumprod" + nres = getattr(na[_slice], np_op)(axis=axis) + else: + res = getattr(a, reduce_op)(item=_slice) + nres = getattr(na[_slice], reduce_op)() np.testing.assert_allclose(res, nres, atol=tol, rtol=tol) diff --git a/tests/test_b2view_model.py b/tests/test_b2view_model.py new file mode 100644 index 000000000..5985af1f2 --- /dev/null +++ b/tests/test_b2view_model.py @@ -0,0 +1,229 @@ +from __future__ import annotations + +import dataclasses + +import numpy as np +import pytest + +import blosc2 +from blosc2.b2view.model import ( + StoreBrowser, + preview_array, + preview_array_1d, + preview_array_2d, + preview_ctable, +) +from blosc2.b2view.render import make_preview_renderables + + +@dataclasses.dataclass +class Row: + x: int = 0 + y: float = 0.0 + + +def make_ctable(n=5): + table = blosc2.CTable(Row) + for i in range(n): + table.append(Row(x=i, y=i * 1.5)) + return table + + +def make_store(path): + with blosc2.TreeStore(str(path), mode="w") as store: + store["/group/arr"] = np.arange(12).reshape(3, 4) + store["/table"] = make_ctable(6) + + +def test_store_browser_lists_children_and_kinds(tmp_path): + path = tmp_path / "bundle.b2z" + make_store(path) + + with StoreBrowser(str(path)) as browser: + root = browser.list_children("/") + assert [(node.path, node.kind, node.has_children) for node in root] == [ + ("/group", "group", True), + ("/table", "ctable", False), + ] + group = browser.list_children("/group") + assert [(node.path, node.kind) for node in group] == [("/group/arr", "ndarray")] + + +def test_store_browser_metadata_and_previews(tmp_path): + path = tmp_path / "bundle.b2d" + make_store(path) + + with StoreBrowser(str(path)) as browser: 
+ arr_info = browser.get_info("/group/arr") + assert arr_info.kind == "ndarray" + assert arr_info.metadata["shape"] == (3, 4) + assert arr_info.metadata["dtype"] == np.arange(12).dtype.name + arr_preview = browser.preview("/group/arr", max_rows=2, max_cols=3) + assert arr_preview["source_kind"] == "ndarray2d" + np.testing.assert_array_equal(arr_preview["data"]["0"], np.array([0, 4])) + np.testing.assert_array_equal(arr_preview["data"]["2"], np.array([2, 6])) + + table_info = browser.get_info("/table") + assert table_info.kind == "ctable" + assert table_info.metadata["rows"] == 6 + preview = browser.preview("/table", max_rows=3, max_cols=1) + assert preview["columns"] == ["x"] + assert preview["hidden_columns"] == 1 + np.testing.assert_array_equal(preview["data"]["x"], np.array([0, 1, 2])) + + +def test_store_browser_supports_standalone_ctable(tmp_path): + path = tmp_path / "table.b2z" + table = make_ctable(4) + persistent = blosc2.CTable(Row, urlpath=str(path), mode="w") + persistent.extend(table) + persistent.close() + + with StoreBrowser(str(path)) as browser: + assert browser.list_children("/") == [] + info = browser.get_info("/") + assert info.kind == "ctable" + assert info.metadata["rows"] == 4 + preview = browser.preview("/", max_rows=2) + np.testing.assert_array_equal(preview["data"]["x"], np.array([0, 1])) + + +def test_preview_array_1d_returns_grid_preview(): + arr = np.arange(10) + preview = preview_array_1d(arr, start=3, stop=7) + assert preview["start"] == 3 + assert preview["stop"] == 7 + assert preview["nrows"] == 10 + assert preview["columns"] == ["value"] + assert preview["source_kind"] == "ndarray1d" + np.testing.assert_array_equal(preview["data"]["value"], np.array([3, 4, 5, 6])) + + +def test_preview_array_2d_returns_grid_preview(): + arr = np.arange(30).reshape(5, 6) + preview = preview_array_2d(arr, start=1, stop=4, col_start=2, max_cols=3) + assert preview["start"] == 1 + assert preview["stop"] == 4 + assert preview["nrows"] == 5 + assert 
preview["columns"] == ["2", "3", "4"] + assert preview["hidden_columns"] == 3 + assert preview["col_start"] == 2 + assert preview["col_stop"] == 5 + assert preview["ncols"] == 6 + np.testing.assert_array_equal(preview["data"]["2"], np.array([8, 14, 20])) + np.testing.assert_array_equal(preview["data"]["4"], np.array([10, 16, 22])) + + +def test_store_browser_uses_grid_preview_for_2d_ndarray(tmp_path): + path = tmp_path / "bundle.b2z" + with blosc2.TreeStore(str(path), mode="w") as store: + store["/arr"] = np.arange(30).reshape(5, 6) + + with StoreBrowser(str(path)) as browser: + preview = browser.preview("/arr", start=2, stop=5, max_cols=2) + assert preview["source_kind"] == "ndarray2d" + assert preview["columns"] == ["0", "1"] + np.testing.assert_array_equal(preview["data"]["1"], np.array([13, 19, 25])) + + +def test_ctable_preview_buffer_reuses_loaded_rows(tmp_path): + pytest.importorskip("textual", reason="b2view TUI requires textual") + path = tmp_path / "table.b2z" + persistent = blosc2.CTable(Row, urlpath=str(path), mode="w") + for i in range(100): + persistent.append(Row(x=i, y=float(i))) + persistent.close() + + from blosc2.b2view.app import B2ViewApp + + app = B2ViewApp(str(path), preview_rows=5) + with StoreBrowser(str(path)) as browser: + app.browser = browser + app.table_buffer = None + app.query_one = lambda selector, cls=None: type( + "FakeTable", (), {"size": type("Size", (), {"height": 6, "width": 80})()} + )() + page0 = app._load_table_page("/", 0) + first_buffer = app.table_buffer + page1 = app._load_table_page("/", 5) + assert app.table_buffer is first_buffer + np.testing.assert_array_equal(page0["data"]["x"], np.arange(5)) + np.testing.assert_array_equal(page1["data"]["x"], np.arange(5, 10)) + + +def test_preview_ctable_skips_expensive_nested_columns_by_default(): + class Table: + def __init__(self): + self.col_names = ["path"] + + def __len__(self): + return 3 + + def __getitem__(self, name): + raise AssertionError("expensive column should not 
be read") + + def schema_dict(self): + return {"columns": [{"name": "path", "kind": "list", "item": {"kind": "struct"}}]} + + @property + def info_items(self): + return [("schema", {"path": "list[struct]"})] + + preview = preview_ctable(Table(), max_cols=1) + assert preview["skipped_columns"] == {"path": "list[struct]"} + assert preview["data"]["path"].tolist() == [""] * 3 + + +def test_ctable_preview_preserves_ragged_nested_values(): + class Column: + def __init__(self, values): + self.values = values + + def __getitem__(self, key): + return self.values[key] + + class Table: + def __init__(self): + self.col_names = ["path"] + self.columns = {"path": Column([[{"x": 1}], [{"x": 2}, {"x": 3}]])} + + def __len__(self): + return 2 + + def __getitem__(self, name): + return self.columns[name] + + preview = preview_ctable(Table(), max_cols=1) + assert preview["data"]["path"].dtype == object + assert preview["data"]["path"][1] == [{"x": 2}, {"x": 3}] + + +def test_ctable_preview_header_uses_column_names_without_dtype_labels(): + preview = { + "start": 0, + "stop": 1, + "nrows": 1, + "columns": ["when", "value"], + "hidden_columns": 0, + "data": { + "when": np.array(["2025-01-01"], dtype="datetime64[D]"), + "value": np.array([1], dtype=np.int64), + }, + } + pytest.importorskip("rich", reason="b2view rendering requires rich") + from rich.console import Console + + header, _ = make_preview_renderables(preview) + console = Console(width=80, record=True) + console.print(header) + rendered = console.export_text() + assert "when" in rendered + assert "value" in rendered + assert "datetime64" not in rendered + assert "int64" not in rendered + + +def test_preview_array_high_dimensional_slice(): + arr = np.arange(2 * 3 * 4).reshape(2, 3, 4) + preview = preview_array(arr, max_rows=2, max_cols=3) + np.testing.assert_array_equal(preview, arr[0, :2, :3]) diff --git a/tests/test_embed_store.py b/tests/test_embed_store.py index f66adf3df..30e1b966c 100644 --- a/tests/test_embed_store.py 
+++ b/tests/test_embed_store.py @@ -96,6 +96,16 @@ def test_with_compression(): assert value.cparams.codec == blosc2.Codec.BLOSCLZ +def test_from_schunk_preserves_mode(populate_nodes): + schunk = blosc2.blosc2_ext.open("test_estore.b2e", mode="r", offset=0) + estore = blosc2.EmbedStore(_from_schunk=schunk) + + assert estore.mode == "r" + assert estore.storage.mode == "r" + assert estore._store.mode == "r" + assert set(estore.keys()) == {"/node1", "/node2", "/node3"} + + def test_with_many_nodes(): # Create a estore with many nodes N = 200 diff --git a/tests/test_open.py b/tests/test_open.py index 06bc51966..a4b3671c9 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -180,14 +180,17 @@ def test_open_defaults_to_readonly(tmp_path): # Opening without explicit mode should work (read-only by default) obj = blosc2.open(urlpath) assert obj.schunk.mode == "r" + assert obj.schunk.vlmeta.mode == "r" def test_open_explicit_mode_no_warn(tmp_path): """No warnings are emitted when mode is explicitly given.""" urlpath = str(tmp_path / "test.b2nd") blosc2.asarray(np.arange(10), urlpath=urlpath, mode="w") - _ = blosc2.open(urlpath, mode="r") - _ = blosc2.open(urlpath, mode="a") + obj = blosc2.open(urlpath, mode="r") + assert obj.schunk.vlmeta.mode == "r" + obj = blosc2.open(urlpath, mode="a") + assert obj.schunk.vlmeta.mode == "a" def test_open_mmap_defaults_to_readonly(tmp_path): @@ -198,3 +201,25 @@ def test_open_mmap_defaults_to_readonly(tmp_path): urlpath = str(tmp_path / "test.b2nd") blosc2.asarray(np.arange(10), urlpath=urlpath, mode="w") obj = blosc2.open(urlpath, mmap_mode="r") + assert obj.schunk.mode == "r" + assert obj.schunk.vlmeta.mode == "r" + + +def test_open_ndarray_context_manager(tmp_path): + urlpath = tmp_path / "array.b2nd" + expected = np.arange(12).reshape(3, 4) + blosc2.asarray(expected, urlpath=urlpath, mode="w") + + with blosc2.open(urlpath) as arr: + assert isinstance(arr, blosc2.NDArray) + np.testing.assert_array_equal(arr[:], expected) + + +def 
test_open_schunk_context_manager(tmp_path): + urlpath = tmp_path / "schunk.b2frame" + data = np.arange(20, dtype=np.int32) + blosc2.SChunk(data=data, urlpath=urlpath, mode="w", cparams={"typesize": data.dtype.itemsize}) + + with blosc2.open(urlpath, mode="r") as schunk: + assert isinstance(schunk, blosc2.SChunk) + assert schunk[:] == data.tobytes() diff --git a/tests/test_open_c2array.py b/tests/test_open_c2array.py index 8d4458ac2..eb46e2bac 100644 --- a/tests/test_open_c2array.py +++ b/tests/test_open_c2array.py @@ -35,6 +35,10 @@ def test_open_c2array(cat2_context): a_open = blosc2.open(urlpath, mode="r") np.testing.assert_allclose(a1[:], a_open[:]) + with blosc2.open(urlpath, mode="r") as a_ctx: + assert isinstance(a_ctx, blosc2.C2Array) + np.testing.assert_allclose(a1[:], a_ctx[:]) + ## Test slicing np.testing.assert_allclose(a1[:10], a_open[:10]) np.testing.assert_allclose(a1.slice(slice(1, 10, 1))[:], a_open.slice(slice(1, 10, 1))[:]) diff --git a/tests/test_proxy_schunk.py b/tests/test_proxy_schunk.py index 7155834cf..dcd793ec5 100644 --- a/tests/test_proxy_schunk.py +++ b/tests/test_proxy_schunk.py @@ -77,6 +77,26 @@ def test_open(urlpath, chunksize, nchunks): blosc2.remove_urlpath(proxy_urlpath) +def test_open_readonly_proxy_keeps_schunk_cache_and_source_readonly(tmp_path): + source_path = tmp_path / "source.b2frame" + proxy_path = tmp_path / "proxy.b2frame" + data = np.arange(200, dtype="int32") + source = blosc2.SChunk(chunksize=40, data=data, urlpath=str(source_path), cparams={"typesize": 4}) + proxy = blosc2.Proxy(source, urlpath=str(proxy_path), mode="w") + proxy.fetch() + cached_size = proxy_path.stat().st_size + expected = data.tobytes() + del proxy, source + + readonly = blosc2.open(str(proxy_path)) + + assert readonly.schunk.mode == "r" + assert readonly.schunk.vlmeta.mode == "r" + assert readonly.src.mode == "r" + assert readonly[0 : len(data) * data.dtype.itemsize] == expected + assert proxy_path.stat().st_size == cached_size + + # Test the 
ProxySource class def test_proxy_source(): # Define an object that will be used as a source