From 87160070304e257920e163b4bd705ebdd7e54b3b Mon Sep 17 00:00:00 2001 From: zhangt Date: Wed, 20 May 2026 03:16:16 +0000 Subject: [PATCH 01/48] Replay distributed work onto users/qiazh/pre-merge-tikv-bugfix Branch users/zhangt/merge-onto-qiazh ports our shared remote/local pool + per-layer routing changes from users/zhangt/merge-distributed-to-tikv on top of qianxi's TiKV bugfix branch (lock ordering, splitAsync, version check, etc.). Avoids the 21-block ExtraDynamicSearcher.h merge conflict on the merged_spfresh side by replaying instead of merging. Pragmatic approach for heavy files (ExtraDynamicSearcher.h, SPFreshTest.cpp): take our HEAD versions wholesale (which already contain our distributed + MultiChunk logic), and patch only the compile-breaking deltas caused by qianxi's refactors: - PostingCountCache moved from ExtraDynamicSearcher.h to ExtraTiKVController.h - KeyValueIO grew MultiMerge + LogAsyncWaitStatsAndReset virtuals (qianxi version kept; our MultiPut/MultiDelete virtuals re-added on top) - Options/ParameterDefinitionList: kept qianxi version (adds m_globalIDPath) - ThreadPool: kept our add_high + added addfront alias for qianxi callers Index.h / IExtraSearcher.h / SPANNIndex.cpp: applied small additive hooks on top of qianxi (forward-decl WorkerNode, SetWorker/GetSharedSplitPool accessors, BuildIndexInternalLayer + AddIndex worker loop). qianxi bugfixes preserved in those files. Build system: - CMakeLists updated for absl_cord + cordz family (kvproto 25.3 uses absl 2308, anaconda's grpc bundles 2111; explicit linkage avoids DSO-missing-from-command-line) - cmake invoked with gRPC_DIR/Protobuf_DIR/absl_DIR pointing at /usr/local so generated kvproto + libabsl 2308 versions align Verified: SPTAGTest links cleanly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 3 +- AnnService/CMakeLists.txt | 8 +- AnnService/inc/Core/Common/FineGrainedLock.h | 25 +- AnnService/inc/Core/Common/IVersionMap.h | 12 + AnnService/inc/Core/Common/TiKVVersionMap.h | 52 + .../SPANN/Distributed/ConsistentHashRing.h | 93 ++ .../SPANN/Distributed/DispatchCoordinator.h | 364 +++++ .../Core/SPANN/Distributed/DispatcherNode.h | 293 ++++ .../SPANN/Distributed/DistributedProtocol.h | 651 ++++++++ .../inc/Core/SPANN/Distributed/NetworkNode.h | 319 ++++ .../Core/SPANN/Distributed/RemotePostingOps.h | 1325 ++++++++++++++++ .../inc/Core/SPANN/Distributed/WorkerNode.h | 616 ++++++++ .../inc/Core/SPANN/ExtraDynamicSearcher.h | 620 +++++++- .../inc/Core/SPANN/ExtraTiKVController.h | 1 + AnnService/inc/Core/SPANN/IExtraSearcher.h | 17 + AnnService/inc/Core/SPANN/Index.h | 40 + AnnService/inc/Core/VectorIndex.h | 9 + AnnService/inc/Helper/KeyValueIO.h | 14 + AnnService/inc/Helper/ThreadPool.h | 33 +- AnnService/inc/Socket/ConnectionManager.h | 6 +- AnnService/inc/Socket/Packet.h | 36 +- AnnService/inc/Socket/SimpleSerialization.h | 52 + .../src/Core/SPANN/ExtraFileController.cpp | 2 +- AnnService/src/Core/SPANN/SPANNIndex.cpp | 78 +- AnnService/src/Core/VectorIndex.cpp | 25 + AnnService/src/Socket/Connection.cpp | 30 +- AnnService/src/Socket/Server.cpp | 2 +- Test/CMakeLists.txt | 2 +- Test/inc/TestDataGenerator.h | 15 +- Test/src/SPFreshTest.cpp | 1071 +++++++++++-- Test/src/TestDataGenerator.cpp | 12 +- Test/src/main.cpp | 7 +- benchmark.ini | 19 + evaluation/distributed/README.md | 294 ++++ .../configs/benchmark_100m_1node.ini | 71 + .../configs/benchmark_100m_2node.ini | 71 + .../configs/benchmark_100m_template.ini | 71 + .../configs/benchmark_10m_1node.ini | 62 + .../configs/benchmark_10m_2node.ini | 62 + .../configs/benchmark_10m_template.ini | 62 + .../benchmark_insert_dominant_1node.ini | 58 + .../benchmark_insert_dominant_2node.ini | 58 + .../benchmark_insert_dominant_3node.ini | 59 + .../benchmark_insert_dominant_template.ini | 58 + .../distributed/configs/cluster_2node.conf | 31 + .../distributed/configs/cluster_3node.conf | 34 + evaluation/distributed/configs/tikv.toml | 74 + evaluation/distributed/run_distributed.sh | 1364 +++++++++++++++++ 48 files changed, 8050 insertions(+), 231 deletions(-) create mode 100644 AnnService/inc/Core/SPANN/Distributed/ConsistentHashRing.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/DispatcherNode.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/NetworkNode.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/WorkerNode.h create mode 100644 benchmark.ini create mode 100644 evaluation/distributed/README.md create mode 100644 evaluation/distributed/configs/benchmark_100m_1node.ini create mode 100644 evaluation/distributed/configs/benchmark_100m_2node.ini create mode 100644 evaluation/distributed/configs/benchmark_100m_template.ini create mode 100644 evaluation/distributed/configs/benchmark_10m_1node.ini create mode 100644 evaluation/distributed/configs/benchmark_10m_2node.ini create mode 100644 evaluation/distributed/configs/benchmark_10m_template.ini create mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_1node.ini create mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_2node.ini create mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_3node.ini create mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_template.ini create mode 100644 evaluation/distributed/configs/cluster_2node.conf create mode 100644 evaluation/distributed/configs/cluster_3node.conf create mode 100755 evaluation/distributed/configs/tikv.toml create mode 100755 evaluation/distributed/run_distributed.sh diff --git a/.gitignore b/.gitignore index 190ca29d3..e3dc9796a 100644 --- a/.gitignore +++ b/.gitignore @@ -464,4 +464,5 @@ FodyWeavers.xsd *.sln.iml # SPTAG benchmark generated artifacts -*perftest_* +/perftest_* +/evaluation/2026-04-23/output_distributed_hostname_*.json diff --git a/AnnService/CMakeLists.txt b/AnnService/CMakeLists.txt index cd23345fd..299faf3ed 100644 --- a/AnnService/CMakeLists.txt +++ b/AnnService/CMakeLists.txt @@ -10,6 +10,12 @@ include_directories(${Zstd}/lib) file(GLOB_RECURSE HDR_FILES ${AnnService}/inc/Core/*.h ${AnnService}/inc/Helper/*.h) file(GLOB_RECURSE SRC_FILES ${AnnService}/src/Core/*.cpp ${AnnService}/src/Helper/*.cpp) +# Include Socket sources in core lib for PostingRouter +file(GLOB SOCKET_HDR_FILES ${AnnService}/inc/Socket/*.h) +file(GLOB SOCKET_SRC_FILES ${AnnService}/src/Socket/*.cpp) +list(APPEND HDR_FILES ${SOCKET_HDR_FILES}) +list(APPEND SRC_FILES ${SOCKET_SRC_FILES}) + set(SPDK_LIBRARIES "") if (SPDK) set(Spdk ${PROJECT_SOURCE_DIR}/ThirdParty/spdk/build) @@ -73,7 +79,7 @@ endif() add_library (SPTAGLib SHARED ${SRC_FILES} ${HDR_FILES} ${TiKV_PROTO_SOURCES}) target_link_libraries (SPTAGLib DistanceUtils ${RocksDB_LIBRARIES} ${uring_LIBRARIES} libzstd_shared ${NUMA_LIBRARY} ${TBB_LIBRARIES} ${SPDK_LIBRARIES} ${TiKV_LIBRARIES}) add_library (SPTAGLibStatic STATIC ${SRC_FILES} ${HDR_FILES} ${TiKV_PROTO_SOURCES}) -target_link_libraries (SPTAGLibStatic DistanceUtils ${RocksDB_LIBRARIES} ${uring_LIBRARIES} libzstd_static ${NUMA_LIBRARY_STATIC} ${TBB_LIBRARIES} ${SPDK_LIBRARIES} ${TiKV_LIBRARIES}) +target_link_libraries (SPTAGLibStatic DistanceUtils ${RocksDB_LIBRARIES} ${uring_LIBRARIES} libzstd_static ${NUMA_LIBRARY_STATIC} ${TBB_LIBRARIES} ${SPDK_LIBRARIES} ${TiKV_LIBRARIES} ${Boost_LIBRARIES}) if (MSVC) # SPANNIndex.cpp can exceed COFF section limits in Debug without /bigobj. diff --git a/AnnService/inc/Core/Common/FineGrainedLock.h b/AnnService/inc/Core/Common/FineGrainedLock.h index 06c8f44d1..5cfad7ac6 100644 --- a/AnnService/inc/Core/Common/FineGrainedLock.h +++ b/AnnService/inc/Core/Common/FineGrainedLock.h @@ -56,10 +56,27 @@ namespace SPTAG return GetLock(idx); } + // Per-posting lock identity. Two indices share a lock iff they are + // the same posting, so external callers can use `hash_func(a) == + // hash_func(b)` as a self-lock guard (e.g. in Split, to skip + // re-locking the same head VID). static inline unsigned hash_func(unsigned idx) { return idx; } + + // Bucket index for the internal mutex-sharded unordered_map of + // per-posting locks. Exposed for callers that need an array sized + // to BucketCount and indexed by the same granularity as the lock + // pool (e.g. ExtraDynamicSearcher::m_remoteBucketLocked). + static inline unsigned BucketIndex(SizeType idx) + { + unsigned key = static_cast(idx); + return ((unsigned)(key * 99991) + _rotl(key, 2) + 101) & BucketMask; + } + + static const int BucketMask = 32767; + static const int BucketCount = BucketMask + 1; private: struct Bucket { std::mutex mutex; @@ -76,14 +93,6 @@ namespace SPTAG return *iter->second; } - static inline unsigned BucketIndex(SizeType idx) - { - unsigned key = static_cast(idx); - return ((unsigned)(key * 99991) + _rotl(key, 2) + 101) & BucketMask; - } - - static const int BucketMask = 32767; - static const int BucketCount = BucketMask + 1; mutable std::unique_ptr m_buckets; }; } diff --git a/AnnService/inc/Core/Common/IVersionMap.h b/AnnService/inc/Core/Common/IVersionMap.h index b939bd534..05d638cd9 100644 --- a/AnnService/inc/Core/Common/IVersionMap.h +++ b/AnnService/inc/Core/Common/IVersionMap.h @@ -43,6 +43,18 @@ namespace SPTAG virtual uint8_t GetVersion(const SizeType& key) = 0; virtual uint8_t GetVersion(const SizeType& key, VersionReadPolicy policy) { return GetVersion(key); } virtual void SetVersion(const SizeType& key, const uint8_t& version) = 0; + + /// Batch SetVersion: apply (vids[i] -> versions[i]) for all i. + /// Default impl is a per-VID loop. TiKV-backed maps override this + /// to group writes by chunk so N records in the same chunk only + /// trigger 1 ReadChunk + 1 WriteChunk RPC pair + virtual void SetVersionBatch(const std::vector& vids, const std::vector& versions) + { + size_t n = std::min(vids.size(), versions.size()); + for (size_t i = 0; i < n; i++) { + SetVersion(vids[i], versions[i]); + } + } /// Increment the version of a VID. /// @param expectedOld If not 0xff, the caller asserts the current version should be this value. /// If TiKV already holds (expectedOld+1)&0x7f, treat as success (another node did the same increment). diff --git a/AnnService/inc/Core/Common/TiKVVersionMap.h b/AnnService/inc/Core/Common/TiKVVersionMap.h index 0dce69ce8..69191fe1b 100644 --- a/AnnService/inc/Core/Common/TiKVVersionMap.h +++ b/AnnService/inc/Core/Common/TiKVVersionMap.h @@ -385,6 +385,58 @@ namespace SPTAG else if (oldVal != 0xfe && version == 0xfe) m_deleted++; } + // Group writes by chunk: 1 ReadChunk + N byte-modifications + 1 WriteChunk + // per chunk, instead of N × (ReadChunk + WriteChunk). + void SetVersionBatch(const std::vector& vids, const std::vector& versions) override + { + size_t n = std::min(vids.size(), versions.size()); + if (n == 0) return; + const SizeType localCount = m_count.load(); + + // Group (idx into vids/versions) by chunk id. + std::unordered_map> byChunk; + byChunk.reserve(n); + for (size_t i = 0; i < n; i++) { + SizeType vid = vids[i]; + if (vid < 0 || vid >= localCount) continue; + byChunk[ChunkId(vid)].push_back(i); + } + if (byChunk.empty()) return; + + long deletedDelta = 0; + for (auto& kv : byChunk) { + SizeType cid = kv.first; + auto& idxs = kv.second; + std::lock_guard lock(ChunkMutex(cid)); + std::string chunk = ReadChunkCached(cid); + if (chunk.empty()) { + chunk.assign(m_chunkSize, static_cast(0xff)); + } + bool dirty = false; + for (size_t i : idxs) { + SizeType vid = vids[i]; + uint8_t newVal = versions[i]; + int offset = ChunkOffset(vid); + if (offset < 0 || offset >= (int)chunk.size()) continue; + uint8_t oldVal = static_cast(chunk[offset]); + if (oldVal == newVal) continue; + if (oldVal == 0xfe && newVal != 0xfe) deletedDelta--; + else if (oldVal != 0xfe && newVal == 0xfe) deletedDelta++; + chunk[offset] = static_cast(newVal); + dirty = true; + } + if (dirty) { + auto ret = WriteChunk(cid, chunk); + if (ret != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "TiKVVersionMap::SetVersionBatch: WriteChunk failed chunk=%d layer=%d\n", + cid, m_layer); + } + } + } + if (deletedDelta != 0) m_deleted += deletedDelta; + } + bool IncVersion(const SizeType& key, uint8_t* newVersion, uint8_t expectedOld = 0xff) override { if (key < 0 || key >= m_count.load()) { diff --git a/AnnService/inc/Core/SPANN/Distributed/ConsistentHashRing.h b/AnnService/inc/Core/SPANN/Distributed/ConsistentHashRing.h new file mode 100644 index 000000000..ec5c7855c --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/ConsistentHashRing.h @@ -0,0 +1,93 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "inc/Core/Common.h" +#include +#include +#include + +namespace SPTAG::SPANN { + + /// Consistent hash ring for distributing headIDs across compute nodes. + /// Uses virtual nodes (vnodes) for balanced distribution. + /// When nodes are added/removed, only ~1/N of keys are remapped. + class ConsistentHashRing { + public: + explicit ConsistentHashRing(int vnodeCount = 150) + : m_vnodeCount(vnodeCount) {} + + /// Add a physical node to the ring with its virtual nodes. + void AddNode(int nodeIndex) { + for (int i = 0; i < m_vnodeCount; i++) { + uint32_t h = HashVNode(nodeIndex, i); + m_ring[h] = nodeIndex; + } + m_nodes.insert(nodeIndex); + } + + /// Remove a physical node and all its virtual nodes from the ring. + void RemoveNode(int nodeIndex) { + for (int i = 0; i < m_vnodeCount; i++) { + uint32_t h = HashVNode(nodeIndex, i); + m_ring.erase(h); + } + m_nodes.erase(nodeIndex); + } + + /// Find the owner node for a given key (headID). + /// Returns -1 if the ring is empty. + int GetOwner(SizeType headID) const { + if (m_ring.empty()) return -1; + uint32_t h = HashKey(headID); + auto it = m_ring.lower_bound(h); + if (it == m_ring.end()) it = m_ring.begin(); + return it->second; + } + + bool Empty() const { return m_ring.empty(); } + size_t NodeCount() const { return m_nodes.size(); } + bool HasNode(int nodeIndex) const { return m_nodes.count(nodeIndex) > 0; } + const std::set& GetNodes() const { return m_nodes; } + int GetVNodeCount() const { return m_vnodeCount; } + + private: + static uint32_t HashKey(SizeType headID) { + uint32_t hash = 2166136261u; // FNV-1a offset basis + uint32_t val = static_cast(headID); + for (int i = 0; i < 4; i++) { + hash ^= (val >> (i * 8)) & 0xFF; + hash *= 16777619u; // FNV prime + } + return hash; + } + + static uint32_t HashVNode(int nodeIndex, int vnodeIdx) { + // Raw FNV-1a on tiny nodeIndex (1, 2, 3) produces a + // pathologically biased ring (71.9% vs 28.1% for nodes 1/2 with + // 150 vnodes). Pre-mix nodeIndex through Knuth's golden-ratio + // multiplier so small node IDs become full-spectrum uint32 values + // before they hit FNV's accumulator. Validated to give ≈50/50 + // for K=2 and stay within ±15% of even split for K up to 8. + uint32_t saltedVnode = + static_cast(vnodeIdx) ^ + (static_cast(nodeIndex) * 2654435761u); + uint32_t hash = 2166136261u; + auto mix = [&](uint32_t v) { + for (int i = 0; i < 4; i++) { + hash ^= (v >> (i * 8)) & 0xFF; + hash *= 16777619u; + } + }; + mix(saltedVnode); + mix(static_cast(nodeIndex)); + return hash; + } + + int m_vnodeCount; + std::map m_ring; // hash position → nodeIndex + std::set m_nodes; // active physical node indices + }; + +} // namespace SPTAG::SPANN diff --git a/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h b/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h new file mode 100644 index 000000000..8bb32a7eb --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h @@ -0,0 +1,364 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "inc/Core/SPANN/Distributed/DistributedProtocol.h" +#include "inc/Socket/Client.h" +#include "inc/Socket/Packet.h" +#include "inc/Socket/SimpleSerialization.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace SPTAG::SPANN { + + /// Coordinates driver↔worker dispatch for distributed benchmarks. + /// + /// The driver broadcasts Insert/Search/Stop commands to all workers and + /// collects their results. Workers execute commands via a callback and + /// report results back. + /// + /// This class is independent of posting routing — it only needs a way to + /// send packets to peer nodes (provided via PeerNetwork interface). + class DispatchCoordinator { + public: + /// Abstract interface for sending packets to peer nodes. + /// NetworkNode implements this so DispatchCoordinator doesn't + /// depend on the full node class. + class PeerNetwork { + public: + virtual ~PeerNetwork() = default; + /// Get connection to a peer node (reconnecting if needed). + virtual Socket::ConnectionID GetPeerConnection(int nodeIndex) = 0; + /// Total number of nodes in the cluster. + virtual int GetNumNodes() const = 0; + /// Index of this node. + virtual int GetLocalNodeIndex() const = 0; + /// Send a packet via the client socket. + virtual void SendPacket(Socket::ConnectionID connID, Socket::Packet&& pkt, + std::function callback) = 0; + }; + + using DispatchCallback = std::function; + + DispatchCoordinator() = default; + + ~DispatchCoordinator() { + ClearDispatchCallback(); + } + + /// Attach to a peer network (must outlive this coordinator). + void SetNetwork(PeerNetwork* network) { + m_network = network; + } + + /// Mark a worker node as "local" — its work is done inline by the + /// driver so it should be skipped during broadcast/result collection. + void SetLocalWorkerIndex(int idx) { m_localWorkerIndex = idx; } + + /// Set the callback for executing dispatch commands (worker side). + void SetDispatchCallback(DispatchCallback cb) { + m_dispatchCallback = std::move(cb); + } + + /// Clear the dispatch callback and wait for in-flight dispatch + /// threads to complete. Call before destroying callback state. + void ClearDispatchCallback() { + m_dispatchCallback = nullptr; + std::unique_lock lock(m_activeDispatchMutex); + m_activeDispatchCV.wait(lock, [this]() { + return m_activeDispatchCount == 0; + }); + } + + // ---- Driver side ---- + + /// Broadcast a dispatch command to all worker nodes. + /// Returns the dispatchId assigned to this command. + std::uint64_t BroadcastDispatchCommand(DispatchCommand::Type type, std::uint32_t round) { + std::uint64_t dispatchId = m_nextDispatchId.fetch_add(1); + + DispatchCommand cmd; + cmd.m_type = type; + cmd.m_dispatchId = dispatchId; + cmd.m_round = round; + + int numNodes = m_network->GetNumNodes(); + int localIdx = m_network->GetLocalNodeIndex(); + + // Build list of nodes to skip (dispatcher + local worker if set) + auto shouldSkip = [&](int i) { + return i == localIdx || i == m_localWorkerIndex; + }; + + // Count remote workers (nodes we will actually dispatch to) + int remoteWorkers = 0; + for (int i = 0; i < numNodes; i++) { + if (!shouldSkip(i)) remoteWorkers++; + } + + // Set up pending state for collecting results (not for Stop / Heartbeat) + if (type != DispatchCommand::Type::Stop && + type != DispatchCommand::Type::Heartbeat && + remoteWorkers > 0) { + auto state = std::make_shared(); + state->remaining.store(remoteWorkers); + for (int i = 0; i < numNodes; i++) { + if (!shouldSkip(i)) state->pendingNodes.insert(i); + } + { + std::lock_guard lock(m_dispatchMutex); + m_pendingDispatches[dispatchId] = state; + } + } + + auto bodySize = static_cast(cmd.EstimateBufferSize()); + + for (int i = 0; i < numNodes; i++) { + if (shouldSkip(i)) continue; + + Socket::ConnectionID connID = m_network->GetPeerConnection(i); + if (connID == Socket::c_invalidConnectionID) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "DispatchCoordinator: Cannot dispatch to node %d (no connection)\n", i); + if (type != DispatchCommand::Type::Stop && + type != DispatchCommand::Type::Heartbeat) { + std::lock_guard lock(m_dispatchMutex); + auto it = m_pendingDispatches.find(dispatchId); + if (it != m_pendingDispatches.end()) { + it->second->errors++; + if (it->second->remaining.fetch_sub(1) == 1) { + it->second->done.set_value(); + } + } + } + continue; + } + + Socket::Packet pkt; + pkt.Header().m_packetType = Socket::PacketType::DispatchCommand; + pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + pkt.Header().m_connectionID = Socket::c_invalidConnectionID; + pkt.Header().m_resourceID = 0; + pkt.Header().m_bodyLength = bodySize; + pkt.AllocateBuffer(bodySize); + cmd.Write(pkt.Body()); + pkt.Header().WriteBuffer(pkt.HeaderBuffer()); + + m_network->SendPacket(connID, std::move(pkt), nullptr); + } + + // Heartbeats fire every interval seconds — keep logs clean. + if (type != DispatchCommand::Type::Heartbeat) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatchCoordinator: Dispatched %s (id=%llu round=%u) to %d workers\n", + type == DispatchCommand::Type::Search ? "Search" : + type == DispatchCommand::Type::Insert ? "Insert" : "Stop", + (unsigned long long)dispatchId, round, remoteWorkers); + } + + return dispatchId; + } + + /// Wait for all workers to report results for a dispatch. + /// Returns collected wall times from workers. Empty on timeout. + std::vector WaitForAllResults(std::uint64_t dispatchId, int timeoutSec = 300) { + std::shared_ptr state; + { + std::lock_guard lock(m_dispatchMutex); + auto it = m_pendingDispatches.find(dispatchId); + if (it == m_pendingDispatches.end()) return {}; + state = it->second; + } + + auto future = state->done.get_future(); + auto status = future.wait_for(std::chrono::seconds(timeoutSec)); + + { + std::lock_guard lock(m_dispatchMutex); + m_pendingDispatches.erase(dispatchId); + } + + if (status == std::future_status::timeout) { + std::string nodeList; + { + std::lock_guard lock(state->mutex); + for (int n : state->pendingNodes) { + if (!nodeList.empty()) nodeList += ","; + nodeList += std::to_string(n); + } + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "DispatchCoordinator: Timeout waiting for results (id=%llu, %d remaining, nodes=[%s])\n", + (unsigned long long)dispatchId, state->remaining.load(), nodeList.c_str()); + return {}; + } + + if (state->errors > 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "DispatchCoordinator: Dispatch %llu completed with %d errors\n", + (unsigned long long)dispatchId, (int)state->errors); + } + + std::lock_guard lock(state->mutex); + return state->wallTimes; + } + + // ---- Worker side ---- + + /// Send a dispatch result back to the driver (worker side). + void SendDispatchResult(const DispatchResult& result) { + int driverNode = 0; + if (driverNode == m_network->GetLocalNodeIndex()) return; + + Socket::ConnectionID connID = m_network->GetPeerConnection(driverNode); + if (connID == Socket::c_invalidConnectionID) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "DispatchCoordinator: Cannot send result to driver\n"); + return; + } + + Socket::Packet pkt; + auto bodySize = static_cast(result.EstimateBufferSize()); + pkt.Header().m_packetType = Socket::PacketType::DispatchResult; + pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + pkt.Header().m_connectionID = Socket::c_invalidConnectionID; + pkt.Header().m_resourceID = 0; + pkt.Header().m_bodyLength = bodySize; + pkt.AllocateBuffer(bodySize); + result.Write(pkt.Body()); + pkt.Header().WriteBuffer(pkt.HeaderBuffer()); + + m_network->SendPacket(connID, std::move(pkt), nullptr); + } + + // ---- Packet handlers (called by NetworkNode's server/client) ---- + + /// Handle an incoming dispatch command from the driver (worker side). + void HandleDispatchCommand(Socket::ConnectionID connID, Socket::Packet packet) { + if (packet.Header().m_bodyLength == 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "DispatchCoordinator: Empty DispatchCommand received\n"); + return; + } + + DispatchCommand cmd; + if (cmd.Read(packet.Body()) == nullptr) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "DispatchCoordinator: DispatchCommand parse failed\n"); + return; + } + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatchCoordinator: Received command type=%d id=%llu round=%u\n", + (int)cmd.m_type, (unsigned long long)cmd.m_dispatchId, cmd.m_round); + + auto callback = m_dispatchCallback; + if (!callback) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "DispatchCoordinator: No callback set, ignoring command\n"); + return; + } + + { + std::lock_guard lock(m_activeDispatchMutex); + m_activeDispatchCount++; + } + + auto self = this; + int localIdx = m_network->GetLocalNodeIndex(); + std::thread([self, callback, cmd, localIdx]() { + DispatchResult result = callback(cmd); + result.m_nodeIndex = localIdx; + result.m_dispatchId = cmd.m_dispatchId; + result.m_round = cmd.m_round; + + if (cmd.m_type != DispatchCommand::Type::Stop && + cmd.m_type != DispatchCommand::Type::Heartbeat) { + self->SendDispatchResult(result); + } + + { + std::lock_guard lock(self->m_activeDispatchMutex); + self->m_activeDispatchCount--; + } + self->m_activeDispatchCV.notify_all(); + }).detach(); + } + + /// Handle an incoming dispatch result from a worker (driver side). + void HandleDispatchResult(Socket::ConnectionID connID, Socket::Packet packet) { + if (packet.Header().m_bodyLength == 0) return; + + DispatchResult result; + if (result.Read(packet.Body()) == nullptr) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "DispatchCoordinator: DispatchResult parse failed\n"); + return; + } + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatchCoordinator: Result id=%llu round=%u node=%d status=%d wallTime=%.3f\n", + (unsigned long long)result.m_dispatchId, result.m_round, + result.m_nodeIndex, (int)result.m_status, result.m_wallTime); + + std::shared_ptr state; + { + std::lock_guard lock(m_dispatchMutex); + auto it = m_pendingDispatches.find(result.m_dispatchId); + if (it == m_pendingDispatches.end()) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "DispatchCoordinator: Result for unknown dispatch %llu (late/expired)\n", + (unsigned long long)result.m_dispatchId); + return; + } + state = it->second; + } + + if (result.m_status != DispatchResult::Status::Success) { + state->errors++; + } + + { + std::lock_guard lock(state->mutex); + state->wallTimes.push_back(result.m_wallTime); + if (result.m_nodeIndex >= 0) + state->pendingNodes.erase(result.m_nodeIndex); + } + + if (state->remaining.fetch_sub(1) == 1) { + state->done.set_value(); + } + } + + private: + struct PendingDispatch { + std::atomic remaining{0}; + std::atomic errors{0}; + std::promise done; + std::mutex mutex; + std::vector wallTimes; + std::set pendingNodes; // nodes that haven't responded yet + }; + + PeerNetwork* m_network = nullptr; + int m_localWorkerIndex = -1; // driver's worker node to skip in broadcasts + DispatchCallback m_dispatchCallback; + std::atomic m_nextDispatchId{1}; + std::mutex m_dispatchMutex; + std::unordered_map> m_pendingDispatches; + + std::mutex m_activeDispatchMutex; + std::condition_variable m_activeDispatchCV; + int m_activeDispatchCount{0}; + }; + +} // namespace SPTAG::SPANN diff --git a/AnnService/inc/Core/SPANN/Distributed/DispatcherNode.h b/AnnService/inc/Core/SPANN/Distributed/DispatcherNode.h new file mode 100644 index 000000000..00b7bbdb6 --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/DispatcherNode.h @@ -0,0 +1,293 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "inc/Core/SPANN/Distributed/NetworkNode.h" + +namespace SPTAG::SPANN { + + /// Dispatcher node: manages the consistent hash ring and coordinates + /// external dispatch commands (Insert/Search/Stop) to worker nodes. + /// + /// The dispatcher does NOT perform search or posting operations. + /// It is a lightweight coordination point that: + /// - Accepts NodeRegister requests from workers + /// - Maintains the authoritative hash ring and broadcasts updates + /// - Tracks per-worker ACK status with retry + /// - Delegates BroadcastDispatchCommand / WaitForAllResults + class DispatcherNode : public NetworkNode { + public: + using DispatchCallback = DispatchCoordinator::DispatchCallback; + + /// Initialize the dispatcher with separate addresses. + /// Builds the full hash ring at startup (workers 1..N). + bool Initialize( + const std::pair& dispatcherAddr, + const std::vector>& workerAddrs, + int vnodeCount = 150) + { + // Build combined addr list: [dispatcher, worker0, worker1, ...] + std::vector> allAddrs; + allAddrs.push_back(dispatcherAddr); + allAddrs.insert(allAddrs.end(), workerAddrs.begin(), workerAddrs.end()); + + if (!InitializeNetwork(0, allAddrs, vnodeCount)) return false; + + // [Bug 30] Dispatcher has no local data shard; mark with -1. + m_numDispatchNodes = 1; + m_numWorkerNodes = static_cast(workerAddrs.size()); + m_workerNodeIndex = -1; + + // Pre-build complete ring with all workers (internal indices 1..N) + int numWorkers = static_cast(workerAddrs.size()); + auto ring = std::make_shared(vnodeCount); + for (int i = 1; i <= numWorkers; i++) { + ring->AddNode(i); + } + std::atomic_store(&m_hashRing, + std::shared_ptr(std::move(ring))); + m_currentRingVersion.store(1); + + m_dispatch.SetNetwork(this); + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: initialized with %d workers, ring v1\n", numWorkers); + return true; + } + + bool Start() { return StartNetwork(); } + + // ---- Dispatch protocol ---- + + /// Mark the driver's local worker node so broadcasts skip it. + void SetLocalWorkerIndex(int idx) { m_dispatch.SetLocalWorkerIndex(idx); } + + std::uint64_t BroadcastDispatchCommand(DispatchCommand::Type type, std::uint32_t round) { + return m_dispatch.BroadcastDispatchCommand(type, round); + } + + std::vector WaitForAllResults(std::uint64_t dispatchId, int timeoutSec = 300) { + return m_dispatch.WaitForAllResults(dispatchId, timeoutSec); + } + + void SetDispatchCallback(DispatchCallback cb) { + m_dispatch.SetDispatchCallback(std::move(cb)); + } + + void ClearDispatchCallback() { + m_dispatch.ClearDispatchCallback(); + } + + // ---- Heartbeat pump ---- + // + // Periodically broadcasts a Heartbeat dispatch to every remote worker. + // Workers use the heartbeat to detect driver failure / network + // partition and exit cleanly rather than relying on a fixed + // wall-clock receiver timeout. + // + // Idempotent: callable from any thread; second call without StopHeartbeat + // is a no-op. StopHeartbeat joins the thread; destructor calls it. + + void StartHeartbeat(int intervalSec) { + if (intervalSec <= 0) return; + if (m_heartbeatThread.joinable()) return; + m_heartbeatStop.store(false); + m_heartbeatThread = std::thread([this, intervalSec]() { + std::uint32_t round = 0; + while (!m_heartbeatStop.load()) { + BroadcastDispatchCommand(DispatchCommand::Type::Heartbeat, round++); + for (int i = 0; i < intervalSec * 10 && !m_heartbeatStop.load(); i++) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + }); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: heartbeat pump started (interval=%ds)\n", intervalSec); + } + + void StopHeartbeat() { + if (!m_heartbeatThread.joinable()) return; + m_heartbeatStop.store(true); + m_heartbeatThread.join(); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: heartbeat pump stopped\n"); + } + + ~DispatcherNode() { + StopHeartbeat(); + } + + // ---- Ring management ---- + + bool AllWorkersAcked() const { + std::uint32_t currentVer = m_currentRingVersion.load(); + if (currentVer == 0) return false; + std::lock_guard lock(m_ackMutex); + int numNodes = static_cast(m_nodeAddrs.size()); + for (int i = 0; i < numNodes; i++) { + if (i == m_localNodeIndex) continue; + auto it = m_workerAckedVersion.find(i); + if (it == m_workerAckedVersion.end() || it->second < currentVer) return false; + } + return true; + } + + protected: + void RegisterServerHandlers(Socket::PacketHandlerMapPtr& handlers) override { + handlers->emplace(Socket::PacketType::NodeRegisterRequest, + [this](Socket::ConnectionID c, Socket::Packet p) { HandleNodeRegisterRequest(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::RingUpdateACK, + [this](Socket::ConnectionID c, Socket::Packet p) { HandleRingUpdateACK(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::DispatchCommand, + [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchCommand(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::DispatchResult, + [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchResult(c, std::move(p)); }); + } + + void RegisterClientHandlers(Socket::PacketHandlerMapPtr& handlers) override { + handlers->emplace(Socket::PacketType::DispatchResult, + [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchResult(c, std::move(p)); }); + } + + void BgProtocolStep() override { + if (m_currentRingVersion.load() > 0) { + RetryUnackedRingUpdates(); + } + } + + bool IsRingSettled() const override { + return AllWorkersAcked(); + } + + private: + void HandleNodeRegisterRequest(Socket::ConnectionID connID, Socket::Packet packet) { + NodeRegisterMsg msg; + if (!msg.Read(packet.Body())) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "DispatcherNode: Failed to parse NodeRegisterRequest\n"); + return; + } + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: NodeRegister from node %d (%s:%s, store=%s)\n", + msg.m_nodeIndex, msg.m_host.c_str(), msg.m_port.c_str(), msg.m_store.c_str()); + + // Ring is pre-built at startup, just broadcast current ring to the new connection + BroadcastRingUpdate(); + } + + void HandleRingUpdateACK(Socket::ConnectionID connID, Socket::Packet packet) { + RingUpdateACKMsg msg; + if (!msg.Read(packet.Body())) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "DispatcherNode: Failed to parse RingUpdateACK\n"); + return; + } + { + std::lock_guard lock(m_ackMutex); + auto& ver = m_workerAckedVersion[msg.m_nodeIndex]; + if (msg.m_ringVersion > ver) ver = msg.m_ringVersion; + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: RingUpdateACK from node %d (v%u)\n", + msg.m_nodeIndex, msg.m_ringVersion); + } + + void BroadcastRingUpdate() { + auto ring = std::atomic_load(&m_hashRing); + if (!ring) return; + + std::uint32_t version = m_currentRingVersion.load(); + RingUpdateMsg msg; + msg.m_ringVersion = version; + msg.m_vnodeCount = ring->GetVNodeCount(); + for (int idx : ring->GetNodes()) { + msg.m_nodeIndices.push_back(idx); + } + + std::size_t bodySize = msg.EstimateBufferSize(); + int numNodes = static_cast(m_nodeAddrs.size()); + + for (int i = 0; i < numNodes; i++) { + if (i == m_localNodeIndex) continue; + auto peerConn = GetPeerConnection(i); + if (peerConn == Socket::c_invalidConnectionID) continue; + + Socket::Packet pkt; + pkt.Header().m_packetType = Socket::PacketType::RingUpdate; + pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + pkt.Header().m_connectionID = Socket::c_invalidConnectionID; + pkt.Header().m_resourceID = 0; + pkt.Header().m_bodyLength = static_cast(bodySize); + pkt.AllocateBuffer(static_cast(bodySize)); + msg.Write(pkt.Body()); + pkt.Header().WriteBuffer(pkt.HeaderBuffer()); + + m_client->SendPacket(peerConn, std::move(pkt), nullptr); + } + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: Broadcast RingUpdate v%u (%d nodes)\n", + version, (int)msg.m_nodeIndices.size()); + } + + void RetryUnackedRingUpdates() { + auto ring = std::atomic_load(&m_hashRing); + if (!ring) return; + std::uint32_t currentVer = m_currentRingVersion.load(); + if (currentVer == 0) return; + + std::vector unacked; + { + std::lock_guard lock(m_ackMutex); + int numNodes = static_cast(m_nodeAddrs.size()); + for (int i = 0; i < numNodes; i++) { + if (i == m_localNodeIndex) continue; + auto it = m_workerAckedVersion.find(i); + if (it == m_workerAckedVersion.end() || it->second < currentVer) + unacked.push_back(i); + } + } + if (unacked.empty()) return; + + RingUpdateMsg msg; + msg.m_ringVersion = currentVer; + msg.m_vnodeCount = ring->GetVNodeCount(); + for (int idx : ring->GetNodes()) msg.m_nodeIndices.push_back(idx); + std::size_t bodySize = msg.EstimateBufferSize(); + + for (int nodeIdx : unacked) { + auto peerConn = GetPeerConnection(nodeIdx); + if (peerConn == Socket::c_invalidConnectionID) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: RetryUnackedRingUpdates skip node %d (no peer conn)\n", nodeIdx); + continue; + } + + Socket::Packet pkt; + pkt.Header().m_packetType = Socket::PacketType::RingUpdate; + pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + pkt.Header().m_connectionID = Socket::c_invalidConnectionID; + pkt.Header().m_resourceID = 0; + pkt.Header().m_bodyLength = static_cast(bodySize); + pkt.AllocateBuffer(static_cast(bodySize)); + msg.Write(pkt.Body()); + pkt.Header().WriteBuffer(pkt.HeaderBuffer()); + + m_client->SendPacket(peerConn, std::move(pkt), nullptr); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: Retried RingUpdate to node %d (connID=%u)\n", nodeIdx, peerConn); + } + } + + DispatchCoordinator m_dispatch; + std::atomic m_currentRingVersion{0}; + mutable std::mutex m_ackMutex; + std::unordered_map m_workerAckedVersion; + + std::thread m_heartbeatThread; + std::atomic m_heartbeatStop{false}; + }; + +} // namespace SPTAG::SPANN diff --git a/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h b/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h new file mode 100644 index 000000000..b4da82fcc --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h @@ -0,0 +1,651 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "inc/Core/Common.h" +#include "inc/Socket/SimpleSerialization.h" +#include +#include +#include +#include + +namespace SPTAG::SPANN { + + /// Serializable request for remote Append operations sent between compute nodes. + /// MirrorVersion 1 added m_layer to disambiguate which ExtraDynamicSearcher on + /// the receiver side handles the request. Version 0 packets default m_layer=0. + struct RemoteAppendRequest { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 1; } + + SizeType m_headID = 0; + std::string m_headVec; // raw head vector bytes + std::int32_t m_appendNum = 0; + std::string m_appendPosting; // serialized posting data + std::int32_t m_layer = 0; // originating ExtraDynamicSearcher layer + + std::size_t EstimateBufferSize() const { + std::size_t size = 0; + size += sizeof(std::uint16_t) * 2; // version fields + size += sizeof(SizeType); // headID + size += sizeof(std::uint32_t) + m_headVec.size(); // headVec (len-prefixed) + size += sizeof(std::int32_t); // appendNum + size += sizeof(std::uint32_t) + m_appendPosting.size(); // appendPosting (len-prefixed) + size += sizeof(std::int32_t); // layer (mirrorVer >= 1) + return size; + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(m_headID, p_buffer); + p_buffer = SimpleWriteBuffer(m_headVec, p_buffer); + p_buffer = SimpleWriteBuffer(m_appendNum, p_buffer); + p_buffer = SimpleWriteBuffer(m_appendPosting, p_buffer); + p_buffer = SimpleWriteBuffer(m_layer, p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer) { + return Read(p_buffer, nullptr); + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd) { + using namespace Socket::SimpleSerialization; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, majorVer); + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, mirrorVer); + if (p_buffer == nullptr || majorVer != MajorVersion()) return nullptr; + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, m_headID); + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, m_headVec); + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, m_appendNum); + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, m_appendPosting); + if (mirrorVer >= 1) { + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, m_layer); + } else { + m_layer = 0; + } + return p_buffer; + } + }; + + /// Response for remote Append operations. + struct RemoteAppendResponse { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 0; } + + enum class Status : std::uint8_t { Success = 0, Failed = 1 }; + Status m_status = Status::Success; + + std::size_t EstimateBufferSize() const { + return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t); + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(m_status, p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer) { + using namespace Socket::SimpleSerialization; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SimpleReadBuffer(p_buffer, majorVer); + p_buffer = SimpleReadBuffer(p_buffer, mirrorVer); + if (majorVer != MajorVersion()) return nullptr; + p_buffer = SimpleReadBuffer(p_buffer, m_status); + return p_buffer; + } + }; + + /// Identifies a compute node target for routing decisions. + struct RouteTarget { + int nodeIndex = -1; + bool isLocal = true; + }; + + /// Batch of remote append requests sent to a single node in one round-trip. + struct BatchRemoteAppendRequest { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 0; } + + std::uint32_t m_count = 0; + std::vector m_items; + + std::size_t EstimateBufferSize() const { + std::size_t size = sizeof(std::uint16_t) * 2; // version + size += sizeof(std::uint32_t); // count + for (auto& item : m_items) size += item.EstimateBufferSize(); + return size; + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(m_count, p_buffer); + for (auto& item : m_items) p_buffer = item.Write(p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer, std::uint32_t bodyLength = 0) { + using namespace Socket::SimpleSerialization; + const std::uint8_t* bufEnd = (bodyLength > 0) ? (p_buffer + bodyLength) : nullptr; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, majorVer); + p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, mirrorVer); + if (p_buffer == nullptr || majorVer != MajorVersion()) { + m_items.clear(); + return nullptr; + } + p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, m_count); + if (p_buffer == nullptr) { + m_items.clear(); + return nullptr; + } + // Reject obviously corrupt counts before allocating + if (bodyLength > 0 && m_count > bodyLength / 8) { + m_items.clear(); + return nullptr; + } + m_items.resize(m_count); + for (std::uint32_t i = 0; i < m_count; i++) { + if (bufEnd && p_buffer >= bufEnd) { + m_items.clear(); + return nullptr; + } + p_buffer = m_items[i].Read(p_buffer, bufEnd); + if (!p_buffer) { + m_items.clear(); + return nullptr; + } + if (bufEnd && p_buffer > bufEnd) { + m_items.clear(); + return nullptr; + } + } + return p_buffer; + } + }; + + /// Response for batch remote append. + struct BatchRemoteAppendResponse { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 0; } + + std::uint32_t m_successCount = 0; + std::uint32_t m_failCount = 0; + + std::size_t EstimateBufferSize() const { + return sizeof(std::uint16_t) * 2 + sizeof(std::uint32_t) * 2; + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(m_successCount, p_buffer); + p_buffer = SimpleWriteBuffer(m_failCount, p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer) { + using namespace Socket::SimpleSerialization; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SimpleReadBuffer(p_buffer, majorVer); + p_buffer = SimpleReadBuffer(p_buffer, mirrorVer); + if (majorVer != MajorVersion()) return nullptr; + p_buffer = SimpleReadBuffer(p_buffer, m_successCount); + p_buffer = SimpleReadBuffer(p_buffer, m_failCount); + return p_buffer; + } + }; + + /// Cross-node merge hint. Search-side trigger on node X observed that + /// posting `m_headID` (owned by the target node based on consistent-hash + /// ownership) is below the merge threshold. The receiver enqueues a + /// local MergeAsync; the local MergePostings logic decides whether the + /// posting really needs merging at execution time. Fire-and-forget: no + /// response packet, no retry queue. Multiple notifications for the same + /// head are dedup'd by m_mergeList on the receiver. + struct RemoteMergeRequest { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 0; } + + SizeType m_headID = 0; + std::int32_t m_layer = 0; + + std::size_t EstimateBufferSize() const { + return sizeof(std::uint16_t) * 2 + sizeof(SizeType) + sizeof(std::int32_t); + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(m_headID, p_buffer); + p_buffer = SimpleWriteBuffer(m_layer, p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd) { + using namespace Socket::SimpleSerialization; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, majorVer); + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, mirrorVer); + if (p_buffer == nullptr || majorVer != MajorVersion()) return nullptr; + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, m_headID); + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, m_layer); + return p_buffer; + } + }; + + /// Batch of cross-node merge hints sent to a single owner node in one + /// fire-and-forget packet. Sender-side dedups by (layer, headID) so + /// each entry appears at most once per flush window. + struct BatchRemoteMergeRequest { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 0; } + + std::uint32_t m_count = 0; + std::vector m_items; + + std::size_t EstimateBufferSize() const { + std::size_t size = sizeof(std::uint16_t) * 2; + size += sizeof(std::uint32_t); + for (auto& item : m_items) size += item.EstimateBufferSize(); + return size; + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(m_count, p_buffer); + for (auto& item : m_items) p_buffer = item.Write(p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer, std::uint32_t bodyLength = 0) { + using namespace Socket::SimpleSerialization; + const std::uint8_t* bufEnd = (bodyLength > 0) ? (p_buffer + bodyLength) : nullptr; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, majorVer); + p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, mirrorVer); + if (p_buffer == nullptr || majorVer != MajorVersion()) { + m_items.clear(); + return nullptr; + } + p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, m_count); + if (p_buffer == nullptr) { m_items.clear(); return nullptr; } + if (bodyLength > 0 && m_count > bodyLength / 8) { + m_items.clear(); + return nullptr; + } + m_items.resize(m_count); + for (std::uint32_t i = 0; i < m_count; i++) { + if (bufEnd && p_buffer >= bufEnd) { m_items.clear(); return nullptr; } + p_buffer = m_items[i].Read(p_buffer, bufEnd); + if (!p_buffer) { m_items.clear(); return nullptr; } + if (bufEnd && p_buffer > bufEnd) { m_items.clear(); return nullptr; } + } + return p_buffer; + } + }; + + /// Entry in a head sync broadcast: one add or delete of a head node. + /// `m_layer` identifies the originating ExtraDynamicSearcher so the + /// receiver applies the entry to the matching layer's head index + /// (with multi-layer SPANN, layer 0 and layer 1 both broadcast head + /// add/delete; without the layer field every entry would be misrouted + /// to a single shared callback). + struct HeadSyncEntry { + enum class Op : std::uint8_t { Add = 0, Delete = 1 }; + Op op; + SizeType headVID; + std::string headVector; // only for Add; empty for Delete + std::int32_t m_layer = 0; // originating ExtraDynamicSearcher layer + + size_t EstimateBufferSize() const { + return sizeof(std::uint8_t) // op + + sizeof(SizeType) // headVID + + sizeof(std::uint32_t) // headVector length + + headVector.size() + + sizeof(std::int32_t); // layer + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(static_cast(op), p_buffer); + p_buffer = SimpleWriteBuffer(headVID, p_buffer); + std::uint32_t vecLen = static_cast(headVector.size()); + p_buffer = SimpleWriteBuffer(vecLen, p_buffer); + if (vecLen > 0) { + memcpy(p_buffer, headVector.data(), vecLen); + p_buffer += vecLen; + } + p_buffer = SimpleWriteBuffer(m_layer, p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer) { + using namespace Socket::SimpleSerialization; + std::uint8_t rawOp = 0; + p_buffer = SimpleReadBuffer(p_buffer, rawOp); + op = static_cast(rawOp); + p_buffer = SimpleReadBuffer(p_buffer, headVID); + std::uint32_t vecLen = 0; + p_buffer = SimpleReadBuffer(p_buffer, vecLen); + if (vecLen > 0) { + headVector.assign(reinterpret_cast(p_buffer), vecLen); + p_buffer += vecLen; + } else { + headVector.clear(); + } + p_buffer = SimpleReadBuffer(p_buffer, m_layer); + return p_buffer; + } + }; + + /// Dispatch command from driver to workers (replaces file-based barriers). + struct DispatchCommand { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 0; } + + enum class Type : std::uint8_t { Search = 0, Insert = 1, Stop = 2, Heartbeat = 3 }; + Type m_type = Type::Search; + std::uint64_t m_dispatchId = 0; // unique ID from driver + std::uint32_t m_round = 0; // search round or insert batch index + + std::size_t EstimateBufferSize() const { + return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t) + + sizeof(std::uint64_t) + sizeof(std::uint32_t); + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(static_cast(m_type), p_buffer); + p_buffer = SimpleWriteBuffer(m_dispatchId, p_buffer); + p_buffer = SimpleWriteBuffer(m_round, p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer) { + using namespace Socket::SimpleSerialization; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SimpleReadBuffer(p_buffer, majorVer); + p_buffer = SimpleReadBuffer(p_buffer, mirrorVer); + if (majorVer != MajorVersion()) return nullptr; + std::uint8_t rawType = 0; + p_buffer = SimpleReadBuffer(p_buffer, rawType); + m_type = static_cast(rawType); + p_buffer = SimpleReadBuffer(p_buffer, m_dispatchId); + p_buffer = SimpleReadBuffer(p_buffer, m_round); + return p_buffer; + } + }; + + /// Result from worker back to driver after executing a dispatch command. + struct DispatchResult { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 1; } + + enum class Status : std::uint8_t { Success = 0, Failed = 1 }; + Status m_status = Status::Success; + std::uint64_t m_dispatchId = 0; + std::uint32_t m_round = 0; + double m_wallTime = 0.0; + std::int32_t m_nodeIndex = -1; // which worker sent this result + + std::size_t EstimateBufferSize() const { + return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t) + + sizeof(std::uint64_t) + sizeof(std::uint32_t) + sizeof(double) + + sizeof(std::int32_t); + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(static_cast(m_status), p_buffer); + p_buffer = SimpleWriteBuffer(m_dispatchId, p_buffer); + p_buffer = SimpleWriteBuffer(m_round, p_buffer); + p_buffer = SimpleWriteBuffer(m_wallTime, p_buffer); + p_buffer = SimpleWriteBuffer(m_nodeIndex, p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer) { + using namespace Socket::SimpleSerialization; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SimpleReadBuffer(p_buffer, majorVer); + p_buffer = SimpleReadBuffer(p_buffer, mirrorVer); + if (majorVer != MajorVersion()) return nullptr; + std::uint8_t rawStatus = 0; + p_buffer = SimpleReadBuffer(p_buffer, rawStatus); + m_status = static_cast(rawStatus); + p_buffer = SimpleReadBuffer(p_buffer, m_dispatchId); + p_buffer = SimpleReadBuffer(p_buffer, m_round); + p_buffer = SimpleReadBuffer(p_buffer, m_wallTime); + if (mirrorVer >= 1) { + p_buffer = SimpleReadBuffer(p_buffer, m_nodeIndex); + } + return p_buffer; + } + }; + + /// Request to lock/unlock a headID on its owner node (for cross-node Merge). + /// MirrorVersion 1 added m_layer so multi-layer setups dispatch to the + /// correct lock pool (each ExtraDynamicSearcher owns its own bucket flags). + struct RemoteLockRequest { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 1; } + + enum class Op : std::uint8_t { Lock = 0, Unlock = 1 }; + Op m_op = Op::Lock; + SizeType m_headID = 0; + std::int32_t m_layer = 0; + + std::size_t EstimateBufferSize() const { + return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t) + + sizeof(SizeType) + sizeof(std::int32_t); + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(static_cast(m_op), p_buffer); + p_buffer = SimpleWriteBuffer(m_headID, p_buffer); + p_buffer = SimpleWriteBuffer(m_layer, p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer) { + using namespace Socket::SimpleSerialization; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SimpleReadBuffer(p_buffer, majorVer); + p_buffer = SimpleReadBuffer(p_buffer, mirrorVer); + if (majorVer != MajorVersion()) return nullptr; + std::uint8_t rawOp = 0; + p_buffer = SimpleReadBuffer(p_buffer, rawOp); + m_op = static_cast(rawOp); + p_buffer = SimpleReadBuffer(p_buffer, m_headID); + if (mirrorVer >= 1) { + p_buffer = SimpleReadBuffer(p_buffer, m_layer); + } else { + m_layer = 0; + } + return p_buffer; + } + }; + + /// Response for remote lock operations. + struct RemoteLockResponse { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 0; } + + enum class Status : std::uint8_t { Granted = 0, Denied = 1 }; + Status m_status = Status::Granted; + + std::size_t EstimateBufferSize() const { + return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t); + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(static_cast(m_status), p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer) { + using namespace Socket::SimpleSerialization; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SimpleReadBuffer(p_buffer, majorVer); + p_buffer = SimpleReadBuffer(p_buffer, mirrorVer); + if (majorVer != MajorVersion()) return nullptr; + std::uint8_t rawOp = 0; + p_buffer = SimpleReadBuffer(p_buffer, rawOp); + m_status = static_cast(rawOp); + return p_buffer; + } + }; + + /// Worker → dispatcher registration message. + struct NodeRegisterMsg { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 0; } + + std::int32_t m_nodeIndex = 0; + std::string m_host; + std::string m_port; + std::string m_store; + + std::size_t EstimateBufferSize() const { + std::size_t size = 0; + size += sizeof(std::uint16_t) * 2; + size += sizeof(std::int32_t); + size += sizeof(std::uint32_t) + m_host.size(); + size += sizeof(std::uint32_t) + m_port.size(); + size += sizeof(std::uint32_t) + m_store.size(); + return size; + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(m_nodeIndex, p_buffer); + p_buffer = SimpleWriteBuffer(m_host, p_buffer); + p_buffer = SimpleWriteBuffer(m_port, p_buffer); + p_buffer = SimpleWriteBuffer(m_store, p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer) { + using namespace Socket::SimpleSerialization; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SimpleReadBuffer(p_buffer, majorVer); + p_buffer = SimpleReadBuffer(p_buffer, mirrorVer); + if (majorVer != MajorVersion()) return nullptr; + p_buffer = SimpleReadBuffer(p_buffer, m_nodeIndex); + p_buffer = SimpleReadBuffer(p_buffer, m_host); + p_buffer = SimpleReadBuffer(p_buffer, m_port); + p_buffer = SimpleReadBuffer(p_buffer, m_store); + return p_buffer; + } + }; + + /// Dispatcher → worker ring update (full node list, versioned). + struct RingUpdateMsg { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 0; } + + std::uint32_t m_ringVersion = 0; + std::int32_t m_vnodeCount = 150; + std::vector m_nodeIndices; + + std::size_t EstimateBufferSize() const { + std::size_t size = 0; + size += sizeof(std::uint16_t) * 2; + size += sizeof(std::uint32_t); // ringVersion + size += sizeof(std::int32_t); // vnodeCount + size += sizeof(std::uint32_t); // numNodes + size += sizeof(std::int32_t) * m_nodeIndices.size(); + return size; + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(m_ringVersion, p_buffer); + p_buffer = SimpleWriteBuffer(m_vnodeCount, p_buffer); + std::uint32_t count = static_cast(m_nodeIndices.size()); + p_buffer = SimpleWriteBuffer(count, p_buffer); + for (auto idx : m_nodeIndices) { + p_buffer = SimpleWriteBuffer(idx, p_buffer); + } + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer) { + using namespace Socket::SimpleSerialization; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SimpleReadBuffer(p_buffer, majorVer); + p_buffer = SimpleReadBuffer(p_buffer, mirrorVer); + if (majorVer != MajorVersion()) return nullptr; + p_buffer = SimpleReadBuffer(p_buffer, m_ringVersion); + p_buffer = SimpleReadBuffer(p_buffer, m_vnodeCount); + std::uint32_t count = 0; + p_buffer = SimpleReadBuffer(p_buffer, count); + m_nodeIndices.resize(count); + for (std::uint32_t i = 0; i < count; i++) { + p_buffer = SimpleReadBuffer(p_buffer, m_nodeIndices[i]); + } + return p_buffer; + } + }; + + /// Worker → dispatcher ACK for a ring update. + struct RingUpdateACKMsg { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 0; } + + std::int32_t m_nodeIndex = -1; + std::uint32_t m_ringVersion = 0; + + std::size_t EstimateBufferSize() const { + return sizeof(std::uint16_t) * 2 + sizeof(std::int32_t) + sizeof(std::uint32_t); + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(m_nodeIndex, p_buffer); + p_buffer = SimpleWriteBuffer(m_ringVersion, p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer) { + using namespace Socket::SimpleSerialization; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SimpleReadBuffer(p_buffer, majorVer); + p_buffer = SimpleReadBuffer(p_buffer, mirrorVer); + if (majorVer != MajorVersion()) return nullptr; + p_buffer = SimpleReadBuffer(p_buffer, m_nodeIndex); + p_buffer = SimpleReadBuffer(p_buffer, m_ringVersion); + return p_buffer; + } + }; + +} // namespace SPTAG::SPANN diff --git a/AnnService/inc/Core/SPANN/Distributed/NetworkNode.h b/AnnService/inc/Core/SPANN/Distributed/NetworkNode.h new file mode 100644 index 000000000..4e11a4b08 --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/NetworkNode.h @@ -0,0 +1,319 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef _SPTAG_SPANN_NETWORKNODE_H_ +#define _SPTAG_SPANN_NETWORKNODE_H_ + +#include "inc/Core/SPANN/Distributed/DistributedProtocol.h" +#include "inc/Core/SPANN/Distributed/ConsistentHashRing.h" +#include "inc/Core/SPANN/Distributed/DispatchCoordinator.h" +#include "inc/Core/SPANN/Distributed/RemotePostingOps.h" +#include "inc/Socket/Client.h" +#include "inc/Socket/Server.h" +#include "inc/Socket/Packet.h" +#include +#include +#include +#include +#include +#include +#include + +namespace SPTAG::SPANN { + + /// Base class providing shared networking infrastructure for all + /// distributed node roles. Manages server/client sockets, peer + /// connections, consistent hash ring storage, and a background + /// connection maintenance thread. + /// + /// Subclasses override RegisterHandlers() to wire up their specific + /// packet handlers, and BgProtocolStep() / IsRingSettled() for + /// role-specific background work. + class NetworkNode : public DispatchCoordinator::PeerNetwork, + public RemotePostingOps::NetworkAccess { + public: + NetworkNode() + : m_enabled(false), m_localNodeIndex(-1) {} + + virtual ~NetworkNode() { + m_bgConnectStop.store(true); + if (m_bgConnectThread.joinable()) m_bgConnectThread.join(); + } + + /// Initialize shared networking state. + bool InitializeNetwork( + int localNodeIdx, + const std::vector>& nodeAddrs, + int vnodeCount = 150) + { + if (nodeAddrs.empty() || localNodeIdx < 0 || + localNodeIdx >= static_cast(nodeAddrs.size())) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "NetworkNode::Initialize invalid config: %d nodes, localIdx=%d\n", + (int)nodeAddrs.size(), localNodeIdx); + return false; + } + + m_localNodeIndex = localNodeIdx; + m_nodeAddrs = nodeAddrs; + m_vnodeCount = vnodeCount; + + // Start with empty hash ring + std::atomic_store(&m_hashRing, + std::shared_ptr( + std::make_shared(vnodeCount))); + + m_enabled = true; + return true; + } + + /// Start server + client + background connection thread. + /// Subclasses must have called InitializeNetwork() first. + /// Each node listens on its own address from the combined address list. + bool StartNetwork() { + if (!m_enabled) return false; + + // Pre-size m_peerConnections BEFORE the server is started — the + // server's handler threads can dispatch packets immediately on + // bind, and inbound handlers (e.g. HandleRingUpdate -> + // SendRingUpdateACK) call GetPeerConnection which indexes into + // m_peerConnections. Resizing here closes a startup race that + // could segfault when an early peer (typically the dispatcher + // sending the initial RingUpdate) won the race. + m_peerConnections.resize(m_nodeAddrs.size(), Socket::c_invalidConnectionID); + + // --- Client side --- + // Construct the Socket::Client BEFORE starting the + // server. Server handlers (notably HeadSync receiver / ring + // update) can fire as soon as the listening socket accepts a + // peer, and they may call ConnectToPeer → m_client-> + // ConnectToServer. If m_client is still null at that point, + // the call dereferences a null unique_ptr and segfaults + // (Pre-build "All N connection attempts to node X failed" + // crash). Construct the client first so the handler path is + // safe before any socket can be accepted. + Socket::PacketHandlerMapPtr clientHandlers(new Socket::PacketHandlerMap); + RegisterClientHandlers(clientHandlers); + + m_client.reset(new Socket::Client(clientHandlers, 8, 30)); + + // --- Server side --- + { + Socket::PacketHandlerMapPtr serverHandlers(new Socket::PacketHandlerMap); + RegisterServerHandlers(serverHandlers); + + const auto& localAddr = m_nodeAddrs[m_localNodeIndex]; + m_server.reset(new Socket::Server( + localAddr.first, localAddr.second, serverHandlers, 8)); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "NetworkNode server listening on %s:%s\n", + localAddr.first.c_str(), localAddr.second.c_str()); + } + + // --- Background thread --- + m_bgConnectStop.store(false); + m_bgConnectThread = std::thread([this]() { + int numNodes = static_cast(m_nodeAddrs.size()); + int delayMs = 500; + while (!m_bgConnectStop.load()) { + bool allConnected = true; + for (int i = 0; i < numNodes; i++) { + if (i == m_localNodeIndex) continue; + { + std::lock_guard lock(m_connMutex); + if (m_peerConnections[i] != Socket::c_invalidConnectionID) + continue; + } + allConnected = false; + ConnectToPeer(i, 1, 0); + } + + BgProtocolStep(); + + if (allConnected && IsRingSettled()) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "NetworkNode: All peers connected and ring synchronized\n"); + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(delayMs)); + delayMs = std::min(delayMs + 500, 5000); + } + }); + + return true; + } + + // ---- PeerNetwork + NetworkAccess interface ---- + // + // GetLocalNodeIndex() / GetNumNodes() use NETWORK-SLOT semantics: + // m_nodeAddrs is the flat address table indexed by internal slot + // (slot 0 = dispatcher, slots 1..N = workers). These are the + // values used for raw socket connections and dispatch routing. + // + // For COMPUTE-WORKER semantics (VID interleaving, version-map + // sizing, hash-ring partitioning), use GetNumWorkerNodes() / + // GetWorkerNodeIndex() instead — those exclude the dispatcher + // and use 0-indexed worker shard numbering. Mixing the two + // produces off-by-one shard math + // (AllocateGlobalVID maps to the wrong globalVID range). + + int GetLocalNodeIndex() const override { return m_localNodeIndex; } + + int GetNumNodes() const override { + return static_cast(m_nodeAddrs.size()); + } + + // ---- Compute-role accessors ---- + // + // These describe the LOGICAL cluster composition independent of + // the network slot layout. Subclasses populate the m_num*Nodes / + // m_workerNodeIndex fields during Initialize(). + // + // Use these (NOT GetNumNodes / GetLocalNodeIndex) for: + // * AllocateGlobalVID interleaving math + // * Version-map cross-node bound sizing + // * AddIDCapacity growth multiplier + // * Any "how many shards are storing user data?" question + + int GetNumWorkerNodes() const { return m_numWorkerNodes; } + int GetNumDispatchNodes() const { return m_numDispatchNodes; } + + /// 0-indexed compute-shard position for this node, or -1 if this + /// node is dispatcher-only (has no local data shard). + int GetWorkerNodeIndex() const { return m_workerNodeIndex; } + + Socket::ConnectionID GetPeerConnection(int nodeIndex) override { + { + std::lock_guard lock(m_connMutex); + if (m_peerConnections[nodeIndex] != Socket::c_invalidConnectionID) + return m_peerConnections[nodeIndex]; + } + if (ConnectToPeer(nodeIndex, 5, 1000)) { + std::lock_guard lock(m_connMutex); + return m_peerConnections[nodeIndex]; + } + return Socket::c_invalidConnectionID; + } + + void SendPacket(Socket::ConnectionID connID, Socket::Packet&& pkt, + std::function callback) override { + m_client->SendPacket(connID, std::move(pkt), std::move(callback)); + } + + void InvalidatePeerConnection(int nodeIndex) override { + std::lock_guard lock(m_connMutex); + m_peerConnections[nodeIndex] = Socket::c_invalidConnectionID; + } + + Socket::Client* GetClient() override { return m_client.get(); } + Socket::Server* GetServer() override { return m_server.get(); } + + // ---- Shared accessors ---- + + bool IsEnabled() const { return m_enabled; } + + std::shared_ptr GetHashRing() const { + return std::atomic_load(&m_hashRing); + } + + void SetHashRing(std::shared_ptr ring) { + std::atomic_store(&m_hashRing, std::move(ring)); + } + + bool WaitForAllPeersConnected(int timeoutSec = 120) { + if (!m_enabled) return true; + int numNodes = static_cast(m_nodeAddrs.size()); + auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeoutSec); + while (std::chrono::steady_clock::now() < deadline) { + bool allConnected = true; + for (int i = 0; i < numNodes; i++) { + if (i == m_localNodeIndex) continue; + std::lock_guard lock(m_connMutex); + if (m_peerConnections[i] == Socket::c_invalidConnectionID) { + allConnected = false; + break; + } + } + if (allConnected) return true; + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "NetworkNode: Timed out waiting for peer connections (%ds)\n", timeoutSec); + return false; + } + + bool ConnectToPeer(int nodeIndex, int maxRetries = 10, int initialDelayMs = 500) { + if (nodeIndex == m_localNodeIndex) return true; + std::pair addr; + { + std::lock_guard lock(m_connMutex); + if (nodeIndex >= static_cast(m_nodeAddrs.size())) return false; + addr = m_nodeAddrs[nodeIndex]; + } + int delayMs = initialDelayMs; + for (int attempt = 1; attempt <= maxRetries; attempt++) { + ErrorCode ec; + auto connID = m_client->ConnectToServer(addr.first, addr.second, ec); + if (ec == ErrorCode::Success) { + std::lock_guard lock(m_connMutex); + m_peerConnections[nodeIndex] = connID; + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "NetworkNode[local=%d]: Connected to node %d (%s:%s), connID=%u (attempt %d)\n", + m_localNodeIndex, nodeIndex, addr.first.c_str(), addr.second.c_str(), connID, attempt); + return true; + } + if (attempt < maxRetries) { + std::this_thread::sleep_for(std::chrono::milliseconds(delayMs)); + delayMs = std::min(delayMs * 2, 5000); + } + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "NetworkNode: All %d connection attempts to node %d failed\n", + maxRetries, nodeIndex); + return false; + } + + protected: + /// Subclasses register their packet handlers here. + virtual void RegisterServerHandlers(Socket::PacketHandlerMapPtr& handlers) = 0; + virtual void RegisterClientHandlers(Socket::PacketHandlerMapPtr& handlers) = 0; + + /// Called each iteration of the bg thread for role-specific protocol work. + virtual void BgProtocolStep() {} + + /// Return true when ring is fully synchronized for this node's role. + virtual bool IsRingSettled() const { return true; } + + bool m_enabled; + int m_localNodeIndex; + int m_vnodeCount = 150; + + // Compute-role accounting. Set by subclass Initialize(). + // m_workerNodeIndex == -1 means this node has no local data shard + // (dispatcher-only role). See GetNumWorkerNodes() / GetWorkerNodeIndex() + // for the rationale on why these are separate from m_nodeAddrs.size(). + int m_numWorkerNodes = 0; + int m_numDispatchNodes = 0; + int m_workerNodeIndex = -1; + + // Consistent hash ring (lock-free RCU: atomic_load to read, copy-on-write to modify) + std::shared_ptr m_hashRing; + std::mutex m_ringWriteMutex; + + // Node addresses + std::vector> m_nodeAddrs; + + // Networking + std::unique_ptr m_server; + std::unique_ptr m_client; + std::mutex m_connMutex; + std::vector m_peerConnections; + + // Background thread + std::thread m_bgConnectThread; + std::atomic m_bgConnectStop{false}; + }; + +} // namespace SPTAG::SPANN + +#endif // _SPTAG_SPANN_NETWORKNODE_H_ diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h new file mode 100644 index 000000000..577b91876 --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -0,0 +1,1325 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "inc/Core/SPANN/Distributed/DistributedProtocol.h" +#include "inc/Helper/ThreadPool.h" +#include "inc/Socket/Client.h" +#include "inc/Socket/Server.h" +#include "inc/Socket/Packet.h" +#include "inc/Socket/SimpleSerialization.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace SPTAG::SPANN { + + // Per-thread hook so the SPDKThreadPool's pre-allocated ExtraWorkSpace + // (initialised once per worker thread, see SPDKThreadPool::initSPDK) can + // be reached from inside the AppendCallback lambda without changing the + // callback signature. BatchAppendItemJob::exec(workspace*, abort*) sets + // this before invoking the callback so the callback skips the per-item + // InitWorkSpace allocation / m_freeWorkSpaceIds churn that otherwise + // serialises 10k-item batches into ~130s on the receiver. + inline thread_local void* tls_preallocAppendWorkSpace = nullptr; + + /// Handles all node-to-node RPC mechanics for internal posting operations: + /// - Append / BatchAppend (forward writes to the correct owner node) + /// - HeadSync (broadcast head index changes to peers) + /// - RemoteLock (cross-node locking for merge/split) + /// + /// This class owns the request/response matching state and serialization + /// logic. It is independent of routing decisions — WorkerNode decides + /// *where* to send, RemotePostingOps handles *how*. + class RemotePostingOps { + public: + using AppendCallback = std::function headVec, + int appendNum, + std::string& appendPosting)>; + + using HeadSyncCallback = std::function; + using RemoteLockCallback = std::function; + + /// Callback for cross-node merge: search on a peer node observed + /// that posting `headID` (which we own) looks underfull. The peer + /// sent a fire-and-forget MergeRequest to us; we just schedule the + /// local MergeAsync. Returns nothing; receiver-side m_mergeList + /// already dedupes repeated triggers, so dropped notifications + /// are recoverable on the next observation. + using MergeCallback = std::function; + + /// Abstract interface for network access (implemented by NetworkNode). + class NetworkAccess { + public: + virtual ~NetworkAccess() = default; + virtual Socket::ConnectionID GetPeerConnection(int nodeIndex) = 0; + virtual void InvalidatePeerConnection(int nodeIndex) = 0; + virtual int GetLocalNodeIndex() const = 0; + virtual int GetNumNodes() const = 0; + virtual Socket::Client* GetClient() = 0; + virtual Socket::Server* GetServer() = 0; + }; + + RemotePostingOps() { + StartHeadSyncRetryThread(); + } + + ~RemotePostingOps() { + StopHeadSyncRetryThread(); + } + + RemotePostingOps(const RemotePostingOps&) = delete; + RemotePostingOps& operator=(const RemotePostingOps&) = delete; + + void SetNetwork(NetworkAccess* net) { m_net = net; } + + // Inject the searcher's shared compute pool. Receiver-side BatchAppend + // work runs as Jobs on this pool so it shares a single bounded- + // concurrency budget with local Append/Split/Merge/Reassign (instead + // of a separate bg executor + transient std::threads which over- + // subscribed TiKV). Per-layer: each layer's ExtraDynamicSearcher owns + // its own m_splitThreadPool, so BatchAppend items dispatch by the + // request's m_layer to the matching pool. A single submitter would + // pile both layers' remote appends into whichever pool wired last. + using JobSubmitter = std::function; + void SetJobSubmitter(int layer, JobSubmitter submitter) { + std::unique_lock lk(m_callbackLifetimeMutex); + EnsureLayerSlot_NoLock(layer); + if (m_jobSubmitters.size() <= static_cast(layer)) { + m_jobSubmitters.resize(static_cast(layer) + 1); + } + m_jobSubmitters[layer] = std::move(submitter); + } + + // Helper: ensure the per-layer registries are wide enough for `layer`. + // Caller must hold m_callbackLifetimeMutex in exclusive mode. + void EnsureLayerSlot_NoLock(int layer) { + if (layer < 0) return; + const size_t needed = static_cast(layer) + 1; + if (m_appendCallbacks.size() < needed) m_appendCallbacks.resize(needed); + if (m_headSyncCallbacks.size() < needed) m_headSyncCallbacks.resize(needed); + if (m_remoteLockCallbacks.size() < needed) m_remoteLockCallbacks.resize(needed); + if (m_mergeCallbacks.size() < needed) m_mergeCallbacks.resize(needed); + if (m_callbackOwners.size() < needed) { + std::vector> grown(needed); + for (size_t i = 0; i < m_callbackOwners.size(); ++i) { + grown[i].store( + m_callbackOwners[i].load(std::memory_order_acquire), + std::memory_order_release); + } + m_callbackOwners = std::move(grown); + } + } + + void SetAppendCallback(int layer, AppendCallback cb) { + std::unique_lock lk(m_callbackLifetimeMutex); + EnsureLayerSlot_NoLock(layer); + m_appendCallbacks[layer] = std::move(cb); + } + void SetHeadSyncCallback(int layer, HeadSyncCallback cb) { + std::unique_lock lk(m_callbackLifetimeMutex); + EnsureLayerSlot_NoLock(layer); + m_headSyncCallbacks[layer] = std::move(cb); + } + void SetRemoteLockCallback(int layer, RemoteLockCallback cb) { + std::unique_lock lk(m_callbackLifetimeMutex); + EnsureLayerSlot_NoLock(layer); + m_remoteLockCallbacks[layer] = std::move(cb); + } + void SetMergeCallback(int layer, MergeCallback cb) { + std::unique_lock lk(m_callbackLifetimeMutex); + EnsureLayerSlot_NoLock(layer); + m_mergeCallbacks[layer] = std::move(cb); + } + + /// Atomically clear ALL callbacks (every layer) and wait for any in-flight + /// callback invocation to finish. Required before the owner of the captured + /// `this` pointer (e.g. ExtraDynamicSearcher) is destroyed, otherwise + /// the lambdas registered via SetXxxCallback would dereference a dangling + /// pointer. + void ClearCallbacks() { + std::unique_lock lk(m_callbackLifetimeMutex); + m_appendCallbacks.clear(); + m_headSyncCallbacks.clear(); + m_remoteLockCallbacks.clear(); + m_mergeCallbacks.clear(); + m_callbackOwners = std::vector>(); + } + + /// Claim ownership of the registered callbacks for a SPECIFIC layer. + /// Each ExtraDynamicSearcher owns its own layer slot; per-layer + /// ownership prevents one layer's destructor from wiping another + /// layer's still-valid callbacks (the original 1-layer design used a + /// single ownership token; with Layers>=2 each layer needs its own). + void ClaimCallbackOwnership(int layer, const void* owner) { + std::unique_lock lk(m_callbackLifetimeMutex); + EnsureLayerSlot_NoLock(layer); + m_callbackOwners[layer].store(owner, std::memory_order_release); + } + + /// Clear callbacks for `layer` ONLY if `owner` is the current registered + /// owner of that layer. Used by ExtraDynamicSearcher destructor: each + /// layer's destructor only clears its own slot. Returns true if cleared. + bool ClearCallbacksIfOwner(int layer, const void* owner) { + std::unique_lock lk(m_callbackLifetimeMutex); + if (layer < 0 || static_cast(layer) >= m_callbackOwners.size()) { + return false; + } + if (m_callbackOwners[layer].load(std::memory_order_acquire) != owner) { + return false; + } + m_appendCallbacks[layer] = nullptr; + m_headSyncCallbacks[layer] = nullptr; + m_remoteLockCallbacks[layer] = nullptr; + if (layer >= 0 && static_cast(layer) < m_mergeCallbacks.size()) { + m_mergeCallbacks[layer] = nullptr; + } + m_callbackOwners[layer].store(nullptr, std::memory_order_release); + return true; + } + + // ----- internal callback lookup helpers (caller holds shared lock) ----- + const AppendCallback* LookupAppendCallback_Locked(int layer) const { + if (layer < 0 || static_cast(layer) >= m_appendCallbacks.size()) return nullptr; + const auto& cb = m_appendCallbacks[layer]; + return cb ? &cb : nullptr; + } + const HeadSyncCallback* LookupHeadSyncCallback_Locked(int layer) const { + if (layer < 0 || static_cast(layer) >= m_headSyncCallbacks.size()) return nullptr; + const auto& cb = m_headSyncCallbacks[layer]; + return cb ? &cb : nullptr; + } + const RemoteLockCallback* LookupRemoteLockCallback_Locked(int layer) const { + if (layer < 0 || static_cast(layer) >= m_remoteLockCallbacks.size()) return nullptr; + const auto& cb = m_remoteLockCallbacks[layer]; + return cb ? &cb : nullptr; + } + // PutPosting/FetchPosting/DeletePosting RPCs lived here historically. + // With shared TiKV every node reads and writes the posting store + // directly (PD routes the key), so the cross-node scatter-gather + // and owner-callback round-trips are unnecessary. + const MergeCallback* LookupMergeCallback_Locked(int layer) const { + if (layer < 0 || static_cast(layer) >= m_mergeCallbacks.size()) return nullptr; + const auto& cb = m_mergeCallbacks[layer]; + return cb ? &cb : nullptr; + } + + // ================================================================== + // Append — single item, synchronous (waits for response) + // ================================================================== + + ErrorCode SendRemoteAppend( + int targetNodeIndex, + int layer, + SizeType headID, + const std::shared_ptr& headVec, + int appendNum, + std::string& appendPosting) + { + Socket::ConnectionID connID = m_net->GetPeerConnection(targetNodeIndex); + if (connID == Socket::c_invalidConnectionID) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: Cannot connect to node %d for headID %lld\n", + targetNodeIndex, (std::int64_t)headID); + return ErrorCode::Fail; + } + + RemoteAppendRequest req; + req.m_layer = layer; + req.m_headID = headID; + req.m_headVec = *headVec; + req.m_appendNum = appendNum; + req.m_appendPosting = appendPosting; + + Socket::ResourceID resID = m_nextResourceId.fetch_add(1); + auto [future, _] = CreatePendingResponse(resID); + (void)_; + + Socket::Packet packet; + packet.Header().m_packetType = Socket::PacketType::AppendRequest; + packet.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + packet.Header().m_connectionID = Socket::c_invalidConnectionID; + packet.Header().m_resourceID = resID; + + auto bodySize = static_cast(req.EstimateBufferSize()); + packet.Header().m_bodyLength = bodySize; + packet.AllocateBuffer(bodySize); + req.Write(packet.Body()); + packet.Header().WriteBuffer(packet.HeaderBuffer()); + + m_net->GetClient()->SendPacket(connID, std::move(packet), + MakeSendFailHandler(resID)); + + auto status = future.wait_for(std::chrono::seconds(30)); + if (status == std::future_status::timeout) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: Timeout waiting for append response for headID %lld from node %d\n", + (std::int64_t)headID, targetNodeIndex); + ErasePending(resID); + return ErrorCode::Fail; + } + return future.get(); + } + + // ================================================================== + // Append — batch, synchronous with retry + // ================================================================== + + ErrorCode SendBatchRemoteAppend( + int targetNodeIndex, + std::vector& items) + { + if (items.empty()) return ErrorCode::Success; + + // Chunk the batch so a single RPC never exceeds kChunkSize items. + // Large batches (millions of items) cannot be processed by the + // receiver within a single timeout window, causing data loss + // when the request is dropped. Chunking keeps each RPC bounded. + // [v38] Reduced 50000 → 10000 to (a) shrink end-of-batch drain + // tail (final chunk no longer 14s wide) and (b) let multiple + // chunks pipeline on the receiver pool. + // [v43] Back to 50000 — v42 (10k) was throughput-best (906/s) + // but during-insert p50 was 222ms; v43 (50k) trades throughput + // (-22% → 704/s) for during-insert p50 (-36% → 141ms) and big + // recovery in post-insert r1 QPS (47→85). v44 (100k) blew up + // tail drain: a single 100k chunk took 116s on the receiver, + // making end-of-batch drain run 40+ min (vs 8 min at 50k). + // 50k is the sweet spot. + // [v47] With shared-pool receiver (BatchAppendItemJob on + // m_splitThreadPool), 50k chunks still occasionally exceed the + // 180s wait_for window under contention → "Timeout waiting for + // batch response" + retries. Drop to 10k so each RPC's worst-case + // receiver wall-clock is ~6× smaller and stays under the timeout. + constexpr size_t kChunkSize = 3000; + const size_t total = items.size(); + size_t offset = 0; + std::vector chunk; + chunk.reserve(std::min(kChunkSize, total)); + + while (offset < total) { + size_t end = std::min(offset + kChunkSize, total); + chunk.clear(); + chunk.reserve(end - offset); + for (size_t i = offset; i < end; ++i) { + chunk.push_back(std::move(items[i])); + } + + ErrorCode chunkRet = SendBatchRemoteAppendChunk(targetNodeIndex, chunk); + if (chunkRet != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: Chunk send failed to node %d (offset=%zu/%zu, chunk=%zu items)\n", + targetNodeIndex, offset, total, end - offset); + return chunkRet; + } + offset = end; + } + return ErrorCode::Success; + } + + private: + ErrorCode SendBatchRemoteAppendChunk( + int targetNodeIndex, + std::vector& items) + { + if (items.empty()) return ErrorCode::Success; + + for (int attempt = 0; attempt < 3; attempt++) { + Socket::ConnectionID connID = m_net->GetPeerConnection(targetNodeIndex); + if (connID == Socket::c_invalidConnectionID) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: Cannot connect to node %d for batch (%d items, attempt %d)\n", + targetNodeIndex, (int)items.size(), attempt + 1); + if (attempt < 2) continue; + return ErrorCode::Fail; + } + + BatchRemoteAppendRequest batchReq; + batchReq.m_count = static_cast(items.size()); + batchReq.m_items = std::move(items); + + Socket::ResourceID resID = m_nextResourceId.fetch_add(1); + auto [future, _] = CreatePendingResponse(resID); + (void)_; + + Socket::Packet packet; + packet.Header().m_packetType = Socket::PacketType::BatchAppendRequest; + packet.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + packet.Header().m_connectionID = Socket::c_invalidConnectionID; + packet.Header().m_resourceID = resID; + + auto bodySize = static_cast(batchReq.EstimateBufferSize()); + packet.Header().m_bodyLength = bodySize; + packet.AllocateBuffer(bodySize); + batchReq.Write(packet.Body()); + items = std::move(batchReq.m_items); // restore for retry + + packet.Header().WriteBuffer(packet.HeaderBuffer()); + + SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, + "RemotePostingOps: Sending batch of %u appends to node %d (resID=%u, attempt=%d)\n", + batchReq.m_count, targetNodeIndex, resID, attempt + 1); + + auto waitStart = std::chrono::steady_clock::now(); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "RemotePostingOps: BatchAppendChunk -> node %d (resID=%u, attempt=%d, items=%u) wait_start\n", + targetNodeIndex, resID, attempt + 1, batchReq.m_count); + + m_net->GetClient()->SendPacket(connID, std::move(packet), + MakeSendFailHandler(resID)); + + // Generous timeout: 50k items * (~10ms TiKV roundtrip / 16 worker threads) + // = ~31s typical; cap at 180s to allow for lock contention with merges/splits. + auto status = future.wait_for(std::chrono::seconds(180)); + auto waitMs = std::chrono::duration_cast( + std::chrono::steady_clock::now() - waitStart).count(); + if (status == std::future_status::timeout) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: Timeout waiting for batch response from node %d (chunk=%u items, attempt=%d, waited=%lldms)\n", + targetNodeIndex, batchReq.m_count, attempt + 1, (long long)waitMs); + ErasePending(resID); + // Do NOT invalidate the connection on timeout — a slow + // response is not a broken connection, and reconnecting + // floods the worker's accept loop. Real connection errors + // are signalled via MakeSendFailHandler (which sets the + // promise to Fail, taking the "result != Success" path + // below). + if (attempt < 2) continue; + return ErrorCode::Fail; + } + + ErrorCode result = future.get(); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "RemotePostingOps: BatchAppendChunk <- node %d (resID=%u, attempt=%d, items=%u, waited=%lldms, result=%d)\n", + targetNodeIndex, resID, attempt + 1, batchReq.m_count, (long long)waitMs, (int)result); + if (result == ErrorCode::Success) return ErrorCode::Success; + + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: Batch to node %d failed (attempt %d), reconnecting...\n", + targetNodeIndex, attempt + 1); + m_net->InvalidatePeerConnection(targetNodeIndex); + } + return ErrorCode::Fail; + } + + public: + + // ================================================================== + // HeadSync — fire-and-forget broadcast + // ================================================================== + + void BroadcastHeadSync(const std::vector& entries) { + if (entries.empty()) return; + + int numNodes = m_net->GetNumNodes(); + int localIdx = m_net->GetLocalNodeIndex(); + + // Count once per peer for sent-entry totals. + std::uint64_t targetCount = 0; + for (int i = 0; i < numNodes; i++) { + if (i != localIdx) targetCount++; + } + m_headSyncBroadcastEntries.fetch_add(entries.size() * targetCount, + std::memory_order_relaxed); + + for (int i = 0; i < numNodes; i++) { + if (i == localIdx) continue; + // Pass a copy of `entries` per peer so each can be re-enqueued + // into its own retry backlog independently on send failure. + SendOneHeadSync(i, std::vector(entries), + /*isRetry=*/false); + } + } + + // Send a HeadSync packet to a single peer. On TCP-level send failure + // (success=false reported by the network stack), the entries are + // appended to the per-peer retry backlog so the background retry + // thread can re-attempt delivery. Counter increments are done + // best-effort once the SendPacket completion lambda fires. + void SendOneHeadSync(int nodeIdx, + std::vector entries, + bool isRetry) + { + if (entries.empty()) return; + + Socket::ConnectionID connID = m_net->GetPeerConnection(nodeIdx); + if (connID == Socket::c_invalidConnectionID) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: HeadSync no connection to node %d (count=%zu, isRetry=%d)\n", + nodeIdx, entries.size(), isRetry ? 1 : 0); + EnqueueHeadSyncRetry(nodeIdx, std::move(entries)); + return; + } + + size_t bodySize = sizeof(std::uint32_t); + for (const auto& e : entries) bodySize += e.EstimateBufferSize(); + + Socket::Packet pkt; + pkt.Header().m_packetType = Socket::PacketType::HeadSyncRequest; + pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + pkt.Header().m_connectionID = Socket::c_invalidConnectionID; + pkt.Header().m_resourceID = 0; + pkt.Header().m_bodyLength = static_cast(bodySize); + pkt.AllocateBuffer(static_cast(bodySize)); + + std::uint8_t* buf = pkt.Body(); + buf = Socket::SimpleSerialization::SimpleWriteBuffer( + static_cast(entries.size()), buf); + for (const auto& e : entries) buf = e.Write(buf); + pkt.Header().WriteBuffer(pkt.HeaderBuffer()); + + const std::uint64_t sentCount = entries.size(); + std::shared_ptr> entriesShared = + std::make_shared>(std::move(entries)); + const bool wasRetry = isRetry; + + m_net->GetClient()->SendPacket(connID, std::move(pkt), + [this, nodeIdx, entriesShared, sentCount, wasRetry](bool success) { + if (success) { + m_headSyncBroadcastSendOK.fetch_add(sentCount, + std::memory_order_relaxed); + if (wasRetry) { + m_headSyncRetrySucceeded.fetch_add(sentCount, + std::memory_order_relaxed); + } + } else { + m_headSyncBroadcastSendFail.fetch_add(sentCount, + std::memory_order_relaxed); + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: HeadSync send to node %d FAILED " + "(count=%llu, isRetry=%d) — enqueueing for retry\n", + nodeIdx, + (unsigned long long)sentCount, + wasRetry ? 1 : 0); + m_net->InvalidatePeerConnection(nodeIdx); + EnqueueHeadSyncRetry(nodeIdx, std::move(*entriesShared)); + } + }); + } + + void EnqueueHeadSyncRetry(int nodeIdx, std::vector entries) { + if (entries.empty()) return; + auto backlog = GetOrCreateBacklog(nodeIdx); + std::lock_guard g(backlog->mu); + if (backlog->queue.size() + entries.size() > HeadSyncBacklog::kMaxEntries) { + std::uint64_t dropped = entries.size(); + m_headSyncRetryDropped.fetch_add(dropped, std::memory_order_relaxed); + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: HeadSync retry queue full for node %d " + "(queue=%zu, dropping=%llu) — index will diverge!\n", + nodeIdx, backlog->queue.size(), + (unsigned long long)dropped); + return; + } + for (auto& e : entries) backlog->queue.push_back(std::move(e)); + m_headSyncRetryEnqueued.fetch_add(entries.size(), + std::memory_order_relaxed); + } + + // Pull up to maxBatch entries from the per-peer backlog and re-send + // them. Called from the retry thread and on demand. Returns the + // total number of entries dispatched (including for retry-of-retry). + size_t DrainHeadSyncBacklog(size_t maxBatch = 1024) { + if (!m_net) return 0; + std::vector nodeIdxs; + { + std::shared_lock lk(m_headSyncBacklogsMu); + nodeIdxs.reserve(m_headSyncBacklogs.size()); + for (auto& kv : m_headSyncBacklogs) nodeIdxs.push_back(kv.first); + } + size_t dispatched = 0; + for (int nodeIdx : nodeIdxs) { + auto backlog = GetOrCreateBacklog(nodeIdx); + std::vector batch; + { + std::lock_guard g(backlog->mu); + if (backlog->queue.empty()) continue; + size_t bs = std::min(backlog->queue.size(), maxBatch); + batch.reserve(bs); + for (size_t i = 0; i < bs; i++) { + batch.push_back(std::move(backlog->queue.front())); + backlog->queue.pop_front(); + } + } + size_t bs = batch.size(); + SendOneHeadSync(nodeIdx, std::move(batch), /*isRetry=*/true); + dispatched += bs; + } + return dispatched; + } + + size_t GetHeadSyncBacklogSize() const { + size_t total = 0; + std::vector> snapshot; + { + std::shared_lock lk(m_headSyncBacklogsMu); + snapshot.reserve(m_headSyncBacklogs.size()); + for (auto& kv : m_headSyncBacklogs) snapshot.push_back(kv.second); + } + for (auto& b : snapshot) { + std::lock_guard g(b->mu); + total += b->queue.size(); + } + return total; + } + + // Best-effort log dump of HeadSync delivery counters. Use whenever a + // checkpoint is needed (start/end of insert phase, before query, on + // SaveIndex, etc.). + void DumpHeadSyncStats(const char* label) const { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "[HeadSync stats %s] broadcast_entries=%llu send_ok=%llu send_fail=%llu " + "recv_entries=%llu apply_add=%llu apply_del=%llu " + "retry_enqueued=%llu retry_succeeded=%llu retry_dropped=%llu " + "backlog_now=%zu\n", + label ? label : "", + (unsigned long long)m_headSyncBroadcastEntries.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncBroadcastSendOK.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncBroadcastSendFail.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncRecvEntries.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncApplyAdd.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncApplyDelete.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncRetryEnqueued.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncRetrySucceeded.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncRetryDropped.load(std::memory_order_relaxed), + GetHeadSyncBacklogSize()); + } + + // Counters incremented by the receiver-side HandleHeadSyncRequest / + // AddHeadIndex callback. Public so the ExtraDynamicSearcher + // HeadSyncCallback lambda can bump them after applying each entry. + void NoteHeadSyncApplyAdd() { + m_headSyncApplyAdd.fetch_add(1, std::memory_order_relaxed); + } + void NoteHeadSyncApplyDelete() { + m_headSyncApplyDelete.fetch_add(1, std::memory_order_relaxed); + } + + // Best-effort log dump of cross-node merge-hint channel counters. + // Mirrors DumpHeadSyncStats: sender side tracks how many hints we + // broadcast (send_ok / send_fail); receiver side tracks how many + // hints we got and how many were dropped (callback missing). + void DumpMergeRequestStats(const char* label) const { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "[MergeHint stats %s] send_ok=%llu send_fail=%llu " + "recv_hints=%llu recv_dropped=%llu\n", + label ? label : "", + (unsigned long long)m_mergeBroadcastSendOK.load(std::memory_order_relaxed), + (unsigned long long)m_mergeBroadcastSendFail.load(std::memory_order_relaxed), + (unsigned long long)m_mergeRecvHints.load(std::memory_order_relaxed), + (unsigned long long)m_mergeRecvDropped.load(std::memory_order_relaxed)); + } + + // ================================================================== + // RemoteLock — synchronous request/response + // ================================================================== + + bool SendRemoteLock(int nodeIndex, int layer, SizeType headID, bool lock) { + Socket::ConnectionID connID = m_net->GetPeerConnection(nodeIndex); + if (connID == Socket::c_invalidConnectionID) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: Cannot send remote lock to node %d\n", nodeIndex); + return false; + } + + RemoteLockRequest req; + req.m_op = lock ? RemoteLockRequest::Op::Lock : RemoteLockRequest::Op::Unlock; + req.m_headID = headID; + req.m_layer = layer; + + Socket::ResourceID rid = m_nextResourceId.fetch_add(1); + auto [future, _] = CreatePendingResponse(rid); + (void)_; + + Socket::Packet pkt; + auto bodySize = req.EstimateBufferSize(); + pkt.Header().m_packetType = Socket::PacketType::RemoteLockRequest; + pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + pkt.Header().m_connectionID = Socket::c_invalidConnectionID; + pkt.Header().m_resourceID = rid; + pkt.Header().m_bodyLength = static_cast(bodySize); + pkt.AllocateBuffer(static_cast(bodySize)); + req.Write(pkt.Body()); + pkt.Header().WriteBuffer(pkt.HeaderBuffer()); + + m_net->GetClient()->SendPacket(connID, std::move(pkt), + MakeSendFailHandler(rid)); + + auto status = future.wait_for(std::chrono::milliseconds(5000)); + if (status != std::future_status::ready) { + ErasePending(rid); + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: Lock timeout for headID %lld on node %d\n", + (std::int64_t)headID, nodeIndex); + return false; + } + return future.get() == ErrorCode::Success; + } + + // ================================================================== + // Inbound packet handlers (called by WorkerNode's server/client) + // ================================================================== + + void HandleAppendRequest(Socket::ConnectionID connID, Socket::Packet packet) { + if (packet.Header().m_bodyLength == 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: Empty AppendRequest\n"); + return; + } + + if (Socket::c_invalidConnectionID == packet.Header().m_connectionID) + packet.Header().m_connectionID = connID; + + RemoteAppendRequest req; + const std::uint8_t* body = packet.Body(); + const std::uint8_t* bodyEnd = body + packet.Header().m_bodyLength; + if (req.Read(body, bodyEnd) == nullptr) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: AppendRequest version mismatch\n"); + SendAppendResponse(packet, RemoteAppendResponse::Status::Failed); + return; + } + + ErrorCode result = ErrorCode::Fail; + { + std::shared_lock cbLock(m_callbackLifetimeMutex); + const auto* cb = LookupAppendCallback_Locked(req.m_layer); + if (cb) { + auto headVec = std::make_shared(std::move(req.m_headVec)); + result = (*cb)( + req.m_headID, headVec, req.m_appendNum, req.m_appendPosting); + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: AppendRequest layer=%d has no callback registered\n", + req.m_layer); + } + } + + auto status = (result == ErrorCode::Success) + ? RemoteAppendResponse::Status::Success + : RemoteAppendResponse::Status::Failed; + SendAppendResponse(packet, status); + } + + void HandleAppendResponse(Socket::ConnectionID connID, Socket::Packet packet) { + Socket::ResourceID resID = packet.Header().m_resourceID; + auto promise = TakePendingResponse(resID); + if (!promise) return; + + if (packet.Header().m_processStatus != Socket::PacketProcessStatus::Ok) { + promise->set_value(ErrorCode::Fail); + return; + } + + RemoteAppendResponse resp; + if (resp.Read(packet.Body()) == nullptr) { + promise->set_value(ErrorCode::Fail); + return; + } + + promise->set_value( + resp.m_status == RemoteAppendResponse::Status::Success + ? ErrorCode::Success : ErrorCode::Fail); + } + + void HandleBatchAppendRequest(Socket::ConnectionID connID, Socket::Packet packet) { + if (packet.Header().m_bodyLength == 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: Empty BatchAppendRequest\n"); + return; + } + + if (Socket::c_invalidConnectionID == packet.Header().m_connectionID) + packet.Header().m_connectionID = connID; + + auto batchReq = std::make_shared(); + if (batchReq->Read(packet.Body(), packet.Header().m_bodyLength) == nullptr) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: BatchAppendRequest parse failed\n"); + SendBatchAppendResponse(packet, 0, 1); + return; + } + + SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, + "RemotePostingOps: Received batch of %u appends\n", batchReq->m_count); + + // Submit each item as a high-priority Job to the searcher's + // shared compute pool. Pool workers run the local Append callback + // exactly like a local insert would. Last completion ACKs the + // sender. This puts remote work on the SAME concurrency budget + // as local Split/Merge/Reassign — eliminating the over-subscribed + // TiKV behaviour of the old separate bg executor + transient + // sub-worker threads. + auto packetPtr = std::make_shared(std::move(packet)); + const size_t total = batchReq->m_items.size(); + if (total == 0) { + SendBatchAppendResponse(*packetPtr, 0, 0); + return; + } + auto remaining = std::make_shared>(total); + auto successCount = std::make_shared>(0); + auto failCount = std::make_shared>(0); + + if (m_jobSubmitters.empty()) { + // Fallback: process inline on the network thread. Should not + // happen once ExtraDynamicSearcher has wired its pool. + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: no job submitter wired; running BatchAppend synchronously\n"); + std::shared_lock cbLock(m_callbackLifetimeMutex); + for (auto& req : batchReq->m_items) { + ErrorCode r = ErrorCode::Fail; + const auto* cb = LookupAppendCallback_Locked(req.m_layer); + if (cb) { + auto hv = std::make_shared(std::move(req.m_headVec)); + r = (*cb)(req.m_headID, hv, req.m_appendNum, req.m_appendPosting); + } + (r == ErrorCode::Success ? *successCount : *failCount).fetch_add(1); + } + SendBatchAppendResponse(*packetPtr, successCount->load(), failCount->load()); + return; + } + + for (size_t i = 0; i < total; i++) { + auto* job = new BatchAppendItemJob( + this, batchReq, i, remaining, successCount, failCount, packetPtr); + // Route to the per-layer searcher pool matching this item's + // m_layer so local Append/Split/Merge on layer N and remote + // appends targeting layer N share the same 16-thread budget. + // A single global submitter sent both layers' work into one + // pool, causing 35k+ queue depth on the receiver side. + int layer = batchReq->m_items[i].m_layer; + const JobSubmitter* sub = nullptr; + if (layer >= 0 && static_cast(layer) < m_jobSubmitters.size() + && m_jobSubmitters[layer]) { + sub = &m_jobSubmitters[layer]; + } else { + // Layer's pool not yet wired — fall back to whichever + // submitter we have. + for (auto& s : m_jobSubmitters) { if (s) { sub = &s; break; } } + } + // Normal priority. Per-layer routing (m_jobSubmitters[layer]) + // already isolates layer-N append items from other layers' + // pools. High priority starved split entirely (split:N + // in_flight, 0 completed) because once all 16 worker threads + // are running long-tail append items, fresh high-prio appends + // keep cutting in front of split. Append throughput per chunk + // is limited by pool concurrency × per-item RMW; widen the + // pool (AppendThreadNum) instead of using priority hacks. + if (sub) (*sub)(job, /*high=*/false); + else { delete job; failCount->fetch_add(1); remaining->fetch_sub(1); } + } + } + + void HandleBatchAppendResponse(Socket::ConnectionID connID, Socket::Packet packet) { + Socket::ResourceID resID = packet.Header().m_resourceID; + auto promise = TakePendingResponse(resID); + if (!promise) return; + + if (packet.Header().m_processStatus != Socket::PacketProcessStatus::Ok) { + promise->set_value(ErrorCode::Fail); + return; + } + + BatchRemoteAppendResponse resp; + if (resp.Read(packet.Body()) == nullptr) { + promise->set_value(ErrorCode::Fail); + return; + } + + promise->set_value(resp.m_failCount == 0 ? ErrorCode::Success : ErrorCode::Fail); + } + + void HandleHeadSyncRequest(Socket::ConnectionID connID, Socket::Packet packet) { + std::shared_lock cbLock(m_callbackLifetimeMutex); + if (m_headSyncCallbacks.empty()) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: HeadSyncRequest but no callbacks registered\n"); + return; + } + + const std::uint8_t* buf = packet.Body(); + const std::uint8_t* bufEnd = buf + packet.Header().m_bodyLength; + std::uint32_t entryCount = 0; + buf = Socket::SimpleSerialization::SimpleReadBuffer(buf, entryCount); + + std::uint32_t bodyLength = packet.Header().m_bodyLength; + if (bodyLength < sizeof(std::uint32_t) || + entryCount > (bodyLength - sizeof(std::uint32_t)) / 8) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: HeadSyncRequest entryCount=%u exceeds bodyLength=%u\n", + entryCount, bodyLength); + return; + } + + for (std::uint32_t i = 0; i < entryCount; i++) { + if (buf >= bufEnd) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: HeadSync buffer overrun at entry %u/%u\n", i, entryCount); + break; + } + HeadSyncEntry entry; + buf = entry.Read(buf); + if (!buf || buf > bufEnd) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: HeadSync parse error at entry %u/%u\n", i, entryCount); + break; + } + m_headSyncRecvEntries.fetch_add(1, std::memory_order_relaxed); + const auto* cb = LookupHeadSyncCallback_Locked(entry.m_layer); + if (cb) { + (*cb)(entry); + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: HeadSyncEntry layer=%d has no callback registered (op=%d, vid=%d)\n", + entry.m_layer, static_cast(entry.op), (int)entry.headVID); + } + } + } + + // ================================================================== + // Merge — fire-and-forget cross-node hint + // ================================================================== + + /// Send a batch of merge hints to one peer. Fire-and-forget: no + /// response is expected and no retry queue is maintained. Receiver- + /// side m_mergeList dedups, and the owner discovers underfull + /// postings through its own paths (own search, own Append) if any + /// notification is dropped. + void SendBatchRemoteMerge(int targetNodeIndex, + const std::vector& items) + { + if (items.empty()) return; + + Socket::ConnectionID connID = m_net->GetPeerConnection(targetNodeIndex); + if (connID == Socket::c_invalidConnectionID) { + m_mergeBroadcastSendFail.fetch_add(items.size(), std::memory_order_relaxed); + return; + } + + BatchRemoteMergeRequest batch; + batch.m_count = static_cast(items.size()); + batch.m_items = items; + + Socket::Packet pkt; + pkt.Header().m_packetType = Socket::PacketType::MergeRequest; + pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + pkt.Header().m_connectionID = Socket::c_invalidConnectionID; + pkt.Header().m_resourceID = 0; + + auto bodySize = static_cast(batch.EstimateBufferSize()); + pkt.Header().m_bodyLength = bodySize; + pkt.AllocateBuffer(bodySize); + batch.Write(pkt.Body()); + pkt.Header().WriteBuffer(pkt.HeaderBuffer()); + + const std::uint64_t sentCount = items.size(); + m_net->GetClient()->SendPacket(connID, std::move(pkt), + [this, targetNodeIndex, sentCount](bool success) { + if (success) { + m_mergeBroadcastSendOK.fetch_add(sentCount, std::memory_order_relaxed); + } else { + m_mergeBroadcastSendFail.fetch_add(sentCount, std::memory_order_relaxed); + m_net->InvalidatePeerConnection(targetNodeIndex); + } + }); + } + + void HandleMergeRequest(Socket::ConnectionID connID, Socket::Packet packet) { + (void)connID; + BatchRemoteMergeRequest batch; + if (batch.Read(packet.Body(), packet.Header().m_bodyLength) == nullptr) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: MergeRequest parse failed (bodyLength=%u)\n", + packet.Header().m_bodyLength); + return; + } + + std::shared_lock cbLock(m_callbackLifetimeMutex); + for (const auto& item : batch.m_items) { + const auto* cb = LookupMergeCallback_Locked(item.m_layer); + if (cb) { + (*cb)(item.m_headID); + m_mergeRecvHints.fetch_add(1, std::memory_order_relaxed); + } else { + m_mergeRecvDropped.fetch_add(1, std::memory_order_relaxed); + } + } + } + + void HandleRemoteLockRequest(Socket::ConnectionID connID, Socket::Packet packet) { + RemoteLockRequest req; + if (req.Read(packet.Body()) == nullptr) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "RemotePostingOps: Failed to parse RemoteLockRequest\n"); + return; + } + + RemoteLockResponse resp; + resp.m_status = RemoteLockResponse::Status::Denied; + + { + std::shared_lock cbLock(m_callbackLifetimeMutex); + const auto* cb = LookupRemoteLockCallback_Locked(req.m_layer); + if (cb) { + bool isLock = (req.m_op == RemoteLockRequest::Op::Lock); + bool success = (*cb)(req.m_headID, isLock); + if (success) resp.m_status = RemoteLockResponse::Status::Granted; + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: RemoteLockRequest layer=%d has no callback registered\n", + req.m_layer); + } + } + + Socket::Packet ret; + auto bodySize = resp.EstimateBufferSize(); + ret.Header().m_packetType = Socket::PacketType::RemoteLockResponse; + ret.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + ret.Header().m_connectionID = connID; + ret.Header().m_resourceID = packet.Header().m_resourceID; + ret.Header().m_bodyLength = static_cast(bodySize); + ret.AllocateBuffer(static_cast(bodySize)); + resp.Write(ret.Body()); + ret.Header().WriteBuffer(ret.HeaderBuffer()); + + m_net->GetServer()->SendPacket(connID, std::move(ret), nullptr); + } + + void HandleRemoteLockResponse(Socket::ConnectionID connID, Socket::Packet packet) { + Socket::ResourceID rid = packet.Header().m_resourceID; + auto promise = TakePendingResponse(rid); + if (!promise) return; + + RemoteLockResponse resp; + if (resp.Read(packet.Body()) == nullptr) { + promise->set_value(ErrorCode::Fail); + return; + } + + promise->set_value(resp.m_status == RemoteLockResponse::Status::Granted + ? ErrorCode::Success : ErrorCode::Fail); + } + + // ---- Response matching helpers ---- + + std::pair, bool> CreatePendingResponse(Socket::ResourceID resID) { + std::promise promise; + auto future = promise.get_future(); + std::lock_guard lock(m_pendingMutex); + m_pendingResponses.emplace(resID, std::move(promise)); + return {std::move(future), true}; + } + + void ErasePending(Socket::ResourceID resID) { + std::lock_guard lock(m_pendingMutex); + m_pendingResponses.erase(resID); + } + + /// Take a pending promise out of the map (returns nullptr if not found). + std::unique_ptr> TakePendingResponse(Socket::ResourceID resID) { + std::lock_guard lock(m_pendingMutex); + auto it = m_pendingResponses.find(resID); + if (it == m_pendingResponses.end()) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: Response for unknown resourceID %u\n", resID); + return nullptr; + } + auto p = std::make_unique>(std::move(it->second)); + m_pendingResponses.erase(it); + return p; + } + + /// Create a send-failure callback that resolves the pending promise. + std::function MakeSendFailHandler(Socket::ResourceID resID) { + return [resID, this](bool success) { + if (!success) { + std::lock_guard lock(m_pendingMutex); + auto it = m_pendingResponses.find(resID); + if (it != m_pendingResponses.end()) { + it->second.set_value(ErrorCode::Fail); + m_pendingResponses.erase(it); + } + } + }; + } + + void SendAppendResponse(Socket::Packet& srcPacket, RemoteAppendResponse::Status status) { + RemoteAppendResponse resp; + resp.m_status = status; + + Socket::Packet ret; + ret.Header().m_packetType = Socket::PacketType::AppendResponse; + ret.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + ret.Header().m_connectionID = srcPacket.Header().m_connectionID; + ret.Header().m_resourceID = srcPacket.Header().m_resourceID; + + auto bodySize = static_cast(resp.EstimateBufferSize()); + ret.Header().m_bodyLength = bodySize; + ret.AllocateBuffer(bodySize); + resp.Write(ret.Body()); + ret.Header().WriteBuffer(ret.HeaderBuffer()); + + m_net->GetServer()->SendPacket(srcPacket.Header().m_connectionID, std::move(ret), nullptr); + } + + void SendBatchAppendResponse(Socket::Packet& srcPacket, + std::uint32_t successCount, std::uint32_t failCount) { + BatchRemoteAppendResponse resp; + resp.m_successCount = successCount; + resp.m_failCount = failCount; + + Socket::Packet ret; + ret.Header().m_packetType = Socket::PacketType::BatchAppendResponse; + ret.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + ret.Header().m_connectionID = srcPacket.Header().m_connectionID; + ret.Header().m_resourceID = srcPacket.Header().m_resourceID; + + auto bodySize = static_cast(resp.EstimateBufferSize()); + ret.Header().m_bodyLength = bodySize; + ret.AllocateBuffer(bodySize); + resp.Write(ret.Body()); + ret.Header().WriteBuffer(ret.HeaderBuffer()); + + m_net->GetServer()->SendPacket(srcPacket.Header().m_connectionID, std::move(ret), nullptr); + } + + // ================================================================== + // [Bug 26] Background executor — slow-lane for batch RPC handlers + // ================================================================== + // + // Why: the network server thread pool has only 8 threads + // (NetworkNode.h). HandleBatchAppendRequest does heavy TiKV work + // (fan out to 4 sub-workers and join), each call tying up its + // network thread for tens of seconds during inserts. + // Once 4–8 such handlers run concurrently, every network thread is + // blocked and latency-sensitive RPCs (HeadSync, RemoteLock) cannot be + // serviced. + // + // Fix: parse on the network thread (fast), then enqueue the heavy + // work onto a dedicated background thread pool and return. The + // network thread immediately becomes available for other RPCs. + // The background worker eventually sends the response itself. + // + // Sizing rationale: + // - Threads default to 8: matches the network pool so we never + // under-utilize CPU even if every network thread is parsing a + // batch. Tunable via env SPTAG_BG_EXEC_THREADS. + // - Queue cap default 256: plenty of headroom for typical bursts; + // when full, falls back to synchronous execution to preserve + // correctness rather than dropping requests. + + // Background executor removed: BatchAppend now runs as sub-Jobs on + // the searcher's shared compute pool via SetJobSubmitter() so it + // shares a single concurrency budget with local Split/Merge/Reassign + // (with high-priority jumping the queue). See HandleBatchAppendRequest. + + // ================================================================== + // HeadSync retry thread — periodic best-effort drain of per-peer + // backlogs that were populated by failed BroadcastHeadSync sends. + // + // Why: BroadcastHeadSync is fire-and-forget by design (we don't + // want to block the layer-1 split path on a slow peer). When the + // TCP send completion reports failure, we previously dropped the + // entries forever and the peer's headIndex / m_pSamples diverged, + // causing the receiver's BKTree to miss heads at search time and + // recall to collapse on later batches. The retry queue + this + // thread make HeadSync delivery reliable best-effort. + // ================================================================== + + struct HeadSyncBacklog { + std::mutex mu; + std::deque queue; + // Matches m_addCountForRebuild scale per peer. If we ever hit + // this we log + drop (fall back to manual reconcile). + static constexpr size_t kMaxEntries = 1u << 18; // 262144 + }; + + void StartHeadSyncRetryThread() { + const char* envIntervalMs = std::getenv("SPTAG_HEADSYNC_RETRY_INTERVAL_MS"); + int intervalMs = 500; + if (envIntervalMs) { + try { intervalMs = std::max(50, std::stoi(envIntervalMs)); } catch (...) {} + } + m_headSyncRetryIntervalMs = intervalMs; + m_headSyncRetryStop.store(false, std::memory_order_release); + m_headSyncRetryThread = std::thread([this]() { HeadSyncRetryLoop(); }); + } + + void StopHeadSyncRetryThread() { + m_headSyncRetryStop.store(true, std::memory_order_release); + if (m_headSyncRetryThread.joinable()) m_headSyncRetryThread.join(); + } + + void HeadSyncRetryLoop() { + using namespace std::chrono; + while (!m_headSyncRetryStop.load(std::memory_order_acquire)) { + std::this_thread::sleep_for(milliseconds(m_headSyncRetryIntervalMs)); + if (m_net) DrainHeadSyncBacklog(); + } + // Final drain pass to give the network a chance to flush. + for (int i = 0; i < 5 && m_net; i++) { + size_t dispatched = DrainHeadSyncBacklog(); + if (dispatched == 0) break; + std::this_thread::sleep_for(milliseconds(200)); + } + if (m_headSyncBroadcastEntries.load(std::memory_order_relaxed) > 0 + || m_headSyncRecvEntries.load(std::memory_order_relaxed) > 0) { + DumpHeadSyncStats("shutdown"); + } + if (m_mergeBroadcastSendOK.load(std::memory_order_relaxed) > 0 + || m_mergeRecvHints.load(std::memory_order_relaxed) > 0) { + DumpMergeRequestStats("shutdown"); + } + } + + std::shared_ptr GetOrCreateBacklog(int nodeIdx) { + { + std::shared_lock lk(m_headSyncBacklogsMu); + auto it = m_headSyncBacklogs.find(nodeIdx); + if (it != m_headSyncBacklogs.end()) return it->second; + } + std::unique_lock lk(m_headSyncBacklogsMu); + auto& slot = m_headSyncBacklogs[nodeIdx]; + if (!slot) slot = std::make_shared(); + return slot; + } + + // ---- State ---- + + NetworkAccess* m_net = nullptr; + + // Per-layer callback registries. Indexed by ExtraDynamicSearcher layer + // (m_layer at the call site). Resized lazily by SetXxxCallback. The + // empty/null entry at layer 0 is preserved so a single-layer caller + // (legacy or test) without explicit Set keeps the no-op default. + // + // The shared-callback design existed because the original SPANN had + // a single ExtraDynamicSearcher (Layers=1). With Layers>=2, each + // layer's lambda captures its own `this` (hence m_layer) and dispatch + // by request.m_layer is required to avoid routing layer-0 events to + // layer-1's storage and vice versa. + std::vector m_appendCallbacks; + std::vector m_headSyncCallbacks; + std::vector m_remoteLockCallbacks; + std::vector m_mergeCallbacks; + + // Per-layer ownership tokens. Each ExtraDynamicSearcher claims its + // layer slot at SetWorker time and releases it on destruction; this + // prevents earlier-layer destructors from wiping a later-layer's + // callbacks (the original ClaimCallbackOwnership purpose, now + // applied per-layer instead of globally). + std::vector> m_callbackOwners; + + // Guards the lifetime of the captured `this` inside the callbacks. + // Held in shared mode by every callback invocation site, and in + // exclusive mode by ClearCallbacks() / SetXxxCallback() so that + // (re)assigning a callback can never race with an in-flight invocation. + mutable std::shared_timed_mutex m_callbackLifetimeMutex; + + std::atomic m_nextResourceId{1}; + std::mutex m_pendingMutex; + std::unordered_map> m_pendingResponses; + + // Per-item Job: each remote append request becomes one Job submitted + // to the searcher's shared SPDKThreadPool. The last completing Job + // ACKs the sender. Identical to how a local insert thread would call + // Append; the only difference is the request originated on a peer. + class BatchAppendItemJob : public Helper::ThreadPool::Job { + public: + BatchAppendItemJob(RemotePostingOps* ops, + std::shared_ptr batchReq, + size_t index, + std::shared_ptr> remaining, + std::shared_ptr> successCount, + std::shared_ptr> failCount, + std::shared_ptr replyPacket) + : m_ops(ops), m_batchReq(std::move(batchReq)), m_index(index), + m_remaining(std::move(remaining)), + m_success(std::move(successCount)), + m_fail(std::move(failCount)), + m_replyPacket(std::move(replyPacket)) {} + + void exec(IAbortOperation*) override { run(); } + void exec(void* workspace, IAbortOperation*) override { + void* prev = tls_preallocAppendWorkSpace; + tls_preallocAppendWorkSpace = workspace; + run(); + tls_preallocAppendWorkSpace = prev; + } + + private: + void run() { + { + std::shared_lock cbLock(m_ops->m_callbackLifetimeMutex); + auto& req = m_batchReq->m_items[m_index]; + ErrorCode r = ErrorCode::Fail; + const auto* cb = m_ops->LookupAppendCallback_Locked(req.m_layer); + if (cb) { + auto hv = std::make_shared(std::move(req.m_headVec)); + r = (*cb)(req.m_headID, hv, req.m_appendNum, req.m_appendPosting); + } + if (r == ErrorCode::Success) m_success->fetch_add(1); + else m_fail->fetch_add(1); + } + if (m_remaining->fetch_sub(1) == 1) { + m_ops->SendBatchAppendResponse( + *m_replyPacket, m_success->load(), m_fail->load()); + } + } + + RemotePostingOps* m_ops; + std::shared_ptr m_batchReq; + size_t m_index; + std::shared_ptr> m_remaining; + std::shared_ptr> m_success; + std::shared_ptr> m_fail; + std::shared_ptr m_replyPacket; + }; + + // [Bug 26 retired] bg executor removed — see HandleBatchAppendRequest. + // m_bgWorkers etc were replaced by per-layer job submission into the + // searcher's shared SPDKThreadPool via m_jobSubmitters[layer]. + std::vector m_jobSubmitters; + + // HeadSync delivery diagnostics + retry queue (v33). Counters give + // observability for sender/receiver gaps; per-peer backlogs + + // retry thread make broadcast reliable best-effort. + std::atomic m_headSyncBroadcastEntries{0}; + std::atomic m_headSyncBroadcastSendOK{0}; + std::atomic m_headSyncBroadcastSendFail{0}; + std::atomic m_headSyncRecvEntries{0}; + std::atomic m_headSyncApplyAdd{0}; + std::atomic m_headSyncApplyDelete{0}; + std::atomic m_headSyncRetryEnqueued{0}; + std::atomic m_headSyncRetrySucceeded{0}; + std::atomic m_headSyncRetryDropped{0}; + + // Cross-node merge hint counters. No retry queue: dropped + // notifications are recoverable since the owner discovers underfull + // postings via its own paths too. + std::atomic m_mergeBroadcastSendOK{0}; + std::atomic m_mergeBroadcastSendFail{0}; + std::atomic m_mergeRecvHints{0}; + std::atomic m_mergeRecvDropped{0}; + + mutable std::shared_timed_mutex m_headSyncBacklogsMu; + std::unordered_map> m_headSyncBacklogs; + std::thread m_headSyncRetryThread; + std::atomic m_headSyncRetryStop{false}; + int m_headSyncRetryIntervalMs{500}; + }; + +} // namespace SPTAG::SPANN diff --git a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h new file mode 100644 index 000000000..8af906fcc --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h @@ -0,0 +1,616 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef _SPTAG_SPANN_WORKERNODE_H_ +#define _SPTAG_SPANN_WORKERNODE_H_ + +#include "inc/Core/SPANN/Distributed/NetworkNode.h" +#include "inc/Helper/KeyValueIO.h" +#include "inc/Helper/CommonHelper.h" +#include "inc/Socket/SimpleSerialization.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace SPTAG::SPANN { + + /// Distributed compute worker node. + /// + /// Responsibilities: + /// - Route headIDs to owner nodes via consistent hash ring + /// - Queue and flush remote appends (batched RPC) + /// - HeadSync broadcast and remote locking + /// - Register with dispatcher and receive ring updates + /// - Handle incoming dispatch commands from the driver + class WorkerNode : public NetworkNode { + public: + using AppendCallback = RemotePostingOps::AppendCallback; + using DispatchCallback = DispatchCoordinator::DispatchCallback; + using HeadSyncCallback = RemotePostingOps::HeadSyncCallback; + using RemoteLockCallback = RemotePostingOps::RemoteLockCallback; + + /// Initialize with separate dispatcher/worker/store addresses. + /// workerIndex is 0-based (0 = driver/local, 1+ = remote). + /// Internal node index = workerIndex + 1 (0 is reserved for dispatcher). + bool Initialize( + std::shared_ptr p_db, + int workerIndex, + const std::pair& dispatcherAddr, + const std::vector>& workerAddrs, + const std::vector& storeAddrs, + int vnodeCount = 150) + { + if (storeAddrs.empty()) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "WorkerNode::Initialize: storeAddrs is empty\n"); + return false; + } + + // Build combined addr list: [dispatcher, worker0, worker1, ...] + std::vector> allAddrs; + allAddrs.push_back(dispatcherAddr); + allAddrs.insert(allAddrs.end(), workerAddrs.begin(), workerAddrs.end()); + + int internalIdx = workerIndex + 1; // 0 = dispatcher, 1..N = workers + if (!InitializeNetwork(internalIdx, allAddrs, vnodeCount)) return false; + + // [Bug 30] Populate compute-role fields so callers can ask + // "how many data shards?" / "which shard am I?" without + // accidentally including the dispatcher slot. + m_numDispatchNodes = 1; + m_numWorkerNodes = static_cast(workerAddrs.size()); + m_workerNodeIndex = workerIndex; + + m_db = p_db; + m_nodeStores = storeAddrs; + + // Build store → node list mapping (worker internal indices 1..N) + int numWorkers = static_cast(workerAddrs.size()); + int numStores = static_cast(storeAddrs.size()); + for (int wi = 0; wi < numWorkers; wi++) { + int storeIdx = wi % numStores; + m_storeToNodes[storeAddrs[storeIdx]].push_back(wi + 1); + } + for (auto& [store, nodes] : m_storeToNodes) { + std::string nodeList; + for (int n : nodes) { nodeList += std::to_string(n) + " "; } + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "WorkerNode: store %s → nodes [%s]\n", store.c_str(), nodeList.c_str()); + } + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "WorkerNode: initialized (workerIndex=%d, internalIdx=%d, %d stores, %d vnodes/node)\n", + workerIndex, internalIdx, numStores, vnodeCount); + + m_dispatch.SetNetwork(this); + m_remoteOps.SetNetwork(this); + + return true; + } + + public: + bool Start() { return StartNetwork(); } + + // ---- Callbacks ---- + // + // ExtraDynamicSearcher passes its m_layer when binding callbacks so + // that with multi-layer SPANN (Layers >= 2) each layer has its own + // captured `this` and request dispatch on the receiver side routes by + // request.m_layer. + + void SetAppendCallback(int layer, AppendCallback cb) { m_remoteOps.SetAppendCallback(layer, std::move(cb)); } + void SetHeadSyncCallback(int layer, HeadSyncCallback cb) { m_remoteOps.SetHeadSyncCallback(layer, std::move(cb)); } + void SetRemoteLockCallback(int layer, RemoteLockCallback cb) { m_remoteOps.SetRemoteLockCallback(layer, std::move(cb)); } + // Inject the searcher's shared compute pool so receiver-side + // BatchAppend work runs there (high-priority Jobs) instead of in a + // separate executor. Idempotent: safe to call multiple times. + void SetJobSubmitter(int layer, RemotePostingOps::JobSubmitter s) { + m_remoteOps.SetJobSubmitter(layer, std::move(s)); + } + /// Atomically clear all RPC callbacks (every layer) and wait for any + /// in-flight invocation to finish. + void ClearCallbacks() { + m_remoteOps.ClearCallbacks(); + } + /// Per-layer ownership API used by ExtraDynamicSearcher to avoid having + /// one layer's destructor wipe another layer's still-active callbacks. + /// SetWorker calls ClaimCallbackOwnership(m_layer, this) before + /// registering; the destructor calls ClearCallbacksIfOwner(m_layer, this). + void ClaimCallbackOwnership(int layer, const void* owner) { + m_remoteOps.ClaimCallbackOwnership(layer, owner); + } + bool ClearCallbacksIfOwner(int layer, const void* owner) { + return m_remoteOps.ClearCallbacksIfOwner(layer, owner); + } + void SetDispatchCallback(DispatchCallback cb) { m_dispatch.SetDispatchCallback(std::move(cb)); } + void ClearDispatchCallback() { m_dispatch.ClearDispatchCallback(); } + + // ---- Routing ---- + + RouteTarget GetOwner(SizeType headID) { + RouteTarget target; + target.isLocal = true; + target.nodeIndex = m_localNodeIndex; + + if (!m_enabled) { + m_routeStats.disabled++; + return target; + } + { + auto ring = std::atomic_load(&m_hashRing); + if (!ring || ring->NodeCount() <= 1) { + m_routeStats.local++; + return target; + } + target.nodeIndex = ring->GetOwner(headID); + } + target.isLocal = (target.nodeIndex == m_localNodeIndex); + if (target.isLocal) m_routeStats.local++; + else m_routeStats.remote++; + return target; + } + + void LogRouteStats(const char* context = "") { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "WorkerNode stats%s: local=%d remote=%d disabled=%d keyMiss=%d noMapping=%d\n", + context, (int)m_routeStats.local, (int)m_routeStats.remote, + (int)m_routeStats.disabled, (int)m_routeStats.keyMiss, + (int)m_routeStats.noMapping); + } + + void ResetRouteStats() { + m_routeStats.local.store(0); + m_routeStats.remote.store(0); + m_routeStats.disabled.store(0); + m_routeStats.keyMiss.store(0); + m_routeStats.noMapping.store(0); + } + + // ---- Remote posting ops ---- + + ErrorCode SendRemoteAppend(int targetNodeIndex, int layer, SizeType headID, + const std::shared_ptr& headVec, int appendNum, + std::string& appendPosting) + { + return m_remoteOps.SendRemoteAppend(targetNodeIndex, layer, headID, headVec, appendNum, appendPosting); + } + + ErrorCode SendBatchRemoteAppend(int targetNodeIndex, std::vector& items) { + return m_remoteOps.SendBatchRemoteAppend(targetNodeIndex, items); + } + + void BroadcastHeadSync(const std::vector& entries) { + if (!m_enabled) return; + m_remoteOps.BroadcastHeadSync(entries); + } + + // v33: expose HeadSync delivery diagnostics + retry queue. + void DumpHeadSyncStats(const char* label) const { + m_remoteOps.DumpHeadSyncStats(label); + } + // Cross-node merge-hint channel diagnostics. + void DumpMergeRequestStats(const char* label) const { + m_remoteOps.DumpMergeRequestStats(label); + } + size_t GetHeadSyncBacklogSize() const { + return m_remoteOps.GetHeadSyncBacklogSize(); + } + size_t DrainHeadSyncBacklog(size_t maxBatch = 1024) { + return m_remoteOps.DrainHeadSyncBacklog(maxBatch); + } + void NoteHeadSyncApplyAdd() { + m_remoteOps.NoteHeadSyncApplyAdd(); + } + void NoteHeadSyncApplyDelete() { + m_remoteOps.NoteHeadSyncApplyDelete(); + } + + bool SendRemoteLock(int nodeIndex, int layer, SizeType headID, bool lock) { + if (!m_enabled) return false; + return m_remoteOps.SendRemoteLock(nodeIndex, layer, headID, lock); + } + + void SetMergeCallback(int layer, RemotePostingOps::MergeCallback cb) { + m_remoteOps.SetMergeCallback(layer, std::move(cb)); + } + + // ---- Append queue ---- + + void QueueRemoteAppend(int nodeIndex, RemoteAppendRequest req) { + std::vector toFlush; + bool didReserveSlot = false; + { + std::lock_guard lock(m_appendQueueMutex); + auto& q = m_appendQueue[nodeIndex]; + q.push_back(std::move(req)); + m_remoteQueueSize.fetch_add(1, std::memory_order_relaxed); + // [PERF] Auto-flush per node once we have a full chunk worth + // (kAutoFlushThreshold items). Without this, every remote + // append accumulates until end-of-batch FlushRemoteAppends — + // which then sends hundreds of thousands of items serially + // (10k chunks * ~3s/chunk) AFTER all insert compute is done. + // Auto-flushing while inserts keep running overlaps the + // network with CPU and drops end-of-batch tail latency. + // + // [v38] Allow up to kMaxInflightPerNode concurrent in-flight + // chunks per node so a producer burst (split fan-out, reassign + // wave) can saturate the receiver's bg-executor pool instead of + // queueing up serially behind a single per-node mutex. + if (q.size() >= kAutoFlushThreshold + && m_perNodeInflight[nodeIndex] < kMaxInflightPerNode) { + toFlush.swap(q); + m_remoteQueueSize.fetch_sub(toFlush.size(), std::memory_order_relaxed); + ++m_perNodeInflight[nodeIndex]; + didReserveSlot = true; + } + } + if (!didReserveSlot) return; + + // Fire-and-forget async send. After the initial chunk completes, + // the same thread loops to pick up any further accumulation so we + // avoid thread-spawn churn while keeping per-node concurrency at + // kMaxInflightPerNode. Order across batches is best-effort: the + // receiver runs 8 worker threads on each chunk that already + // interleave items within a chunk, so cross-chunk ordering adds + // no extra correctness risk for the per-posting RMW path. + auto items = std::make_shared>(std::move(toFlush)); + m_inflightAppendFlushes.fetch_add(1, std::memory_order_relaxed); + std::thread([this, nodeIndex, items]() { + while (true) { + ErrorCode ret = SendBatchRemoteAppend(nodeIndex, *items); + if (ret != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "QueueRemoteAppend auto-flush: batch to node %d failed (%zu items)\n", + nodeIndex, items->size()); + } + items->clear(); + { + std::lock_guard lock(m_appendQueueMutex); + auto it = m_appendQueue.find(nodeIndex); + if (it == m_appendQueue.end() + || it->second.size() < kAutoFlushThreshold) { + --m_perNodeInflight[nodeIndex]; + break; + } + items->swap(it->second); + m_remoteQueueSize.fetch_sub(items->size(), + std::memory_order_relaxed); + } + } + m_inflightAppendFlushes.fetch_sub(1, std::memory_order_relaxed); + }).detach(); + } + + size_t GetRemoteQueueSize() const { + return m_remoteQueueSize.load(std::memory_order_relaxed); + } + + ErrorCode FlushRemoteAppends() { + // Drain the queue under m_flushMutex so concurrent flush callers + // serialize. Loop in case items get queued mid-send. This avoids + // the thundering-herd of 100+ concurrent FlushRemoteAppends calls + // (one per split worker) overwhelming the remote node's tiny + // (8-thread, 256-connection-pool) network server. + std::lock_guard flushGuard(m_flushMutex); + + // Wait for any in-flight async auto-flushes triggered by + // QueueRemoteAppend (>= kAutoFlushThreshold) to drain so the + // residue we send below is the actual tail. Callers invoke + // FlushRemoteAppends after all producers (AddIndex / split / + // reassign) have quiesced, so no new auto-flushes will start + // here. + while (m_inflightAppendFlushes.load(std::memory_order_relaxed) > 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + + int errors = 0; + int iterations = 0; + while (true) { + std::unordered_map> toSend; + { + std::lock_guard lock(m_appendQueueMutex); + if (m_appendQueue.empty()) break; + toSend.swap(m_appendQueue); + m_remoteQueueSize.store(0, std::memory_order_relaxed); + } + if (toSend.empty()) break; + ++iterations; + + std::atomic iterErrors{0}; + std::vector threads; + for (auto& [nodeIdx, items] : toSend) { + if (items.empty()) continue; + threads.emplace_back([this, &iterErrors, nodeIdx, &items]() { + // Per-node mutex serializes against any straggler + // auto-flush still in flight for this node. + std::mutex& nodeMtx = GetPerNodeAppendFlushMutex(nodeIdx); + std::lock_guard nlock(nodeMtx); + ErrorCode ret = SendBatchRemoteAppend(nodeIdx, items); + if (ret != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "FlushRemoteAppends: batch to node %d failed (%d items)\n", + nodeIdx, (int)items.size()); + iterErrors++; + } + }); + } + for (auto& t : threads) t.join(); + errors += iterErrors.load(); + } + return errors > 0 ? ErrorCode::Fail : ErrorCode::Success; + } + + // ---- Cross-node merge hint queue ---- + // + // Search-side fire-and-forget notifications: node X sees posting H + // underfull, where H is owned by Y. We dedup (layer, headID) within + // a flush window and batch-send to Y in one packet. The receiver's + // m_mergeList dedups on top of this, so an occasional dropped or + // duplicated notification only costs a few cycles. + void QueueRemoteMerge(int nodeIndex, int layer, SizeType headID) { + std::vector toFlush; + { + std::lock_guard lock(m_mergeQueueMutex); + std::int64_t key = (static_cast(layer) << 32) + | static_cast(headID); + auto& bucket = m_mergeQueue[nodeIndex]; + if (!bucket.insert(key).second) return; // already pending + m_mergeQueueSize.fetch_add(1, std::memory_order_relaxed); + + if (bucket.size() >= kMergeAutoFlushThreshold) { + toFlush.reserve(bucket.size()); + for (std::int64_t k : bucket) { + RemoteMergeRequest req; + req.m_layer = static_cast(k >> 32); + req.m_headID = static_cast(static_cast(k & 0xFFFFFFFF)); + toFlush.push_back(std::move(req)); + } + m_mergeQueueSize.fetch_sub(bucket.size(), std::memory_order_relaxed); + bucket.clear(); + } + } + if (!toFlush.empty()) { + m_remoteOps.SendBatchRemoteMerge(nodeIndex, toFlush); + } + } + + ErrorCode FlushRemoteMerges() { + std::unordered_map> toSend; + { + std::lock_guard lock(m_mergeQueueMutex); + if (m_mergeQueue.empty()) return ErrorCode::Success; + for (auto& [nodeIdx, bucket] : m_mergeQueue) { + auto& vec = toSend[nodeIdx]; + vec.reserve(bucket.size()); + for (std::int64_t k : bucket) { + RemoteMergeRequest req; + req.m_layer = static_cast(k >> 32); + req.m_headID = static_cast(static_cast(k & 0xFFFFFFFF)); + vec.push_back(std::move(req)); + } + } + m_mergeQueue.clear(); + m_mergeQueueSize.store(0, std::memory_order_relaxed); + } + for (auto& [nodeIdx, items] : toSend) { + if (!items.empty()) m_remoteOps.SendBatchRemoteMerge(nodeIdx, items); + } + return ErrorCode::Success; + } + + // ---- Ring protocol (worker side) ---- + + bool WaitForRing(int timeoutSec = 120) { + auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeoutSec); + while (std::chrono::steady_clock::now() < deadline) { + auto ring = std::atomic_load(&m_hashRing); + if (ring && ring->NodeCount() > 0) return true; + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "WorkerNode: Timed out waiting for ring (%ds)\n", timeoutSec); + return false; + } + + // ---- Data members (public for ExtraDynamicSearcher access) ---- + + std::shared_ptr m_db; + std::vector m_nodeStores; + std::unordered_map> m_storeToNodes; + + struct RouteStats { + std::atomic local{0}; + std::atomic remote{0}; + std::atomic disabled{0}; + std::atomic keyMiss{0}; + std::atomic noMapping{0}; + } m_routeStats; + + protected: + void RegisterServerHandlers(Socket::PacketHandlerMapPtr& handlers) override { + handlers->emplace(Socket::PacketType::AppendRequest, + [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleAppendRequest(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::BatchAppendRequest, + [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleBatchAppendRequest(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::HeadSyncRequest, + [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleHeadSyncRequest(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::RemoteLockRequest, + [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleRemoteLockRequest(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::MergeRequest, + [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleMergeRequest(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::DispatchCommand, + [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchCommand(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::DispatchResult, + [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchResult(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::RingUpdate, + [this](Socket::ConnectionID c, Socket::Packet p) { HandleRingUpdate(c, std::move(p)); }); + } + + void RegisterClientHandlers(Socket::PacketHandlerMapPtr& handlers) override { + handlers->emplace(Socket::PacketType::AppendResponse, + [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleAppendResponse(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::BatchAppendResponse, + [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleBatchAppendResponse(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::RemoteLockResponse, + [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleRemoteLockResponse(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::DispatchResult, + [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchResult(c, std::move(p)); }); + } + + void BgProtocolStep() override { + // Keep sending NodeRegister until ring is populated + auto ring = std::atomic_load(&m_hashRing); + if (!ring || ring->NodeCount() == 0) { + Socket::ConnectionID connID = Socket::c_invalidConnectionID; + { + std::lock_guard lock(m_connMutex); + if (m_dispatcherNodeIndex < (int)m_peerConnections.size()) + connID = m_peerConnections[m_dispatcherNodeIndex]; + } + if (connID != Socket::c_invalidConnectionID) { + SendNodeRegister(); + } + } + } + + bool IsRingSettled() const override { + auto ring = std::atomic_load(&m_hashRing); + return ring && ring->NodeCount() > 0; + } + + private: + void SendNodeRegister() { + NodeRegisterMsg msg; + msg.m_nodeIndex = m_localNodeIndex; + msg.m_host = m_nodeAddrs[m_localNodeIndex].first; + msg.m_port = m_nodeAddrs[m_localNodeIndex].second; + // Worker's 0-based index = m_localNodeIndex - 1 (since 0 is dispatcher) + int workerIdx = m_localNodeIndex - 1; + int numStores = static_cast(m_nodeStores.size()); + msg.m_store = (numStores > 0) ? m_nodeStores[workerIdx % numStores] : ""; + + std::size_t bodySize = msg.EstimateBufferSize(); + Socket::Packet pkt; + pkt.Header().m_packetType = Socket::PacketType::NodeRegisterRequest; + pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + pkt.Header().m_connectionID = Socket::c_invalidConnectionID; + pkt.Header().m_resourceID = 0; + pkt.Header().m_bodyLength = static_cast(bodySize); + pkt.AllocateBuffer(static_cast(bodySize)); + msg.Write(pkt.Body()); + pkt.Header().WriteBuffer(pkt.HeaderBuffer()); + + auto connID = GetPeerConnection(m_dispatcherNodeIndex); + if (connID != Socket::c_invalidConnectionID) { + m_client->SendPacket(connID, std::move(pkt), nullptr); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "WorkerNode: Sent NodeRegister (node %d) to dispatcher\n", m_localNodeIndex); + } + } + + void HandleRingUpdate(Socket::ConnectionID connID, Socket::Packet packet) { + RingUpdateMsg msg; + if (!msg.Read(packet.Body())) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "WorkerNode: Failed to parse RingUpdate\n"); + return; + } + + auto newRing = std::make_shared(msg.m_vnodeCount); + for (auto idx : msg.m_nodeIndices) { + newRing->AddNode(idx); + } + { + std::lock_guard guard(m_ringWriteMutex); + std::atomic_store(&m_hashRing, + std::shared_ptr(std::move(newRing))); + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "WorkerNode: Ring updated — %d nodes (v%u)\n", + (int)msg.m_nodeIndices.size(), msg.m_ringVersion); + + SendRingUpdateACK(msg.m_ringVersion); + } + + void SendRingUpdateACK(std::uint32_t ringVersion) { + RingUpdateACKMsg msg; + msg.m_nodeIndex = m_localNodeIndex; + msg.m_ringVersion = ringVersion; + + std::size_t bodySize = msg.EstimateBufferSize(); + Socket::Packet pkt; + pkt.Header().m_packetType = Socket::PacketType::RingUpdateACK; + pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + pkt.Header().m_connectionID = Socket::c_invalidConnectionID; + pkt.Header().m_resourceID = 0; + pkt.Header().m_bodyLength = static_cast(bodySize); + pkt.AllocateBuffer(static_cast(bodySize)); + msg.Write(pkt.Body()); + pkt.Header().WriteBuffer(pkt.HeaderBuffer()); + + auto connID = GetPeerConnection(m_dispatcherNodeIndex); + if (connID != Socket::c_invalidConnectionID) { + m_client->SendPacket(connID, std::move(pkt), nullptr); + } + } + + int m_dispatcherNodeIndex = 0; + RemotePostingOps m_remoteOps; + DispatchCoordinator m_dispatch; + + mutable std::mutex m_appendQueueMutex; + std::unordered_map> m_appendQueue; + std::atomic m_remoteQueueSize{0}; + // Serializes concurrent FlushRemoteAppends() callers so we don't open + // hundreds of simultaneous RPC streams to the remote worker (which has + // only 8 server threads / 256 connection slots). With this mutex, only + // one thread sends at a time; concurrent callers either wait for the + // current flush to finish or contribute their items to the queue. + std::mutex m_flushMutex; + + // Per-node mutex used by end-of-batch FlushRemoteAppends so concurrent + // sends to the SAME node from the final-drain path remain ordered. + // Auto-flushes (QueueRemoteAppend) instead use m_perNodeInflight to + // cap concurrency at kMaxInflightPerNode per node. + std::mutex m_perNodeAppendFlushMutexMapLock; + std::unordered_map> m_perNodeAppendFlushMutex; + std::atomic m_inflightAppendFlushes{0}; + std::unordered_map m_perNodeInflight; // guarded by m_appendQueueMutex + static constexpr size_t kAutoFlushThreshold = 50000; + static constexpr int kMaxInflightPerNode = 4; + + std::mutex& GetPerNodeAppendFlushMutex(int nodeIndex) { + std::lock_guard lk(m_perNodeAppendFlushMutexMapLock); + auto it = m_perNodeAppendFlushMutex.find(nodeIndex); + if (it == m_perNodeAppendFlushMutex.end()) { + auto ins = m_perNodeAppendFlushMutex.emplace( + nodeIndex, std::make_unique()); + return *ins.first->second; + } + return *it->second; + } + + // Cross-node merge hint queue. Per-target dedup set of packed + // (layer << 32 | headID) values; QueueRemoteMerge inserts and + // auto-flushes when the per-target bucket reaches threshold. + mutable std::mutex m_mergeQueueMutex; + std::unordered_map> m_mergeQueue; + std::atomic m_mergeQueueSize{0}; + // Merge hints are non-urgent (best-effort optimization). A larger + // bucket trades a small amount of latency for much better dedup and + // network batching. End-of-batch FlushRemoteMerges() guarantees no + // hint is permanently dropped. + static constexpr size_t kMergeAutoFlushThreshold = 8192; + }; + +} // namespace SPTAG::SPANN + +#endif // _SPTAG_SPANN_WORKERNODE_H_ diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index fe3d306a1..29129bdb4 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -19,6 +19,7 @@ #include "inc/Core/Common/LocalVersionMap.h" #include "inc/Core/Common/TiKVVersionMap.h" #include "ExtraFileController.h" +#include "Distributed/WorkerNode.h" #include #include #include @@ -207,15 +208,29 @@ namespace SPTAG::SPANN { }; private: + std::atomic m_workspaceCount = 0; + std::shared_ptr db; + WorkerNode* m_worker = nullptr; // externally owned, set via SetWorker() + + public: + // Expose the underlying KV handle so a standalone WorkerNode can be wired to the + // same DB this searcher already opened, instead of opening a second one. + std::shared_ptr GetDB() const { return db; } + private: SPANN::Index* m_headIndex; std::unique_ptr m_versionMap; Options* m_opt; int m_layer; + SizeType m_initialVectorSize = 0; // vector count at build time (before inserts) COMMON::FineGrainedRWLock m_rwLocks; + // Per-bucket flags for remote (cross-node) locking. + static constexpr int kRemoteLockPoolSize = 32767; + std::unique_ptr[]> m_remoteBucketLocked; + IndexStats m_stat; std::shared_ptr m_wal; @@ -339,9 +354,247 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Posting size limit: %d, search limit: %f, merge threshold: %d\n", m_postingSizeLimit, p_opt.m_latencyLimit, m_mergeThreshold); SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "[CONFIG] layer=%d DistributedVersionMap=%s SearchCheckVersionMapOnlyLayer0=%s UseMultiChunkPosting=%s PostingPageLimit=%d\n", layer, p_opt.m_distributedVersionMap ? "true" : "false", p_opt.m_searchCheckVersionMapOnlyLayer0 ? "true" : "false", p_opt.m_useMultiChunkPosting ? "true" : "false", p_opt.m_postingPageLimit); + + // Initialize per-bucket remote lock flags + m_remoteBucketLocked.reset(new std::atomic[kRemoteLockPoolSize + 1]{}); + } + + ~ExtraDynamicSearcher() { + if (m_worker) { + m_worker->ClearCallbacksIfOwner(m_layer, this); + m_worker = nullptr; + } + } + + int GetNumWorkerNodes() const { + if (m_worker && m_worker->IsEnabled()) { + return std::max(1, m_worker->GetNumWorkerNodes()); + } + return 1; + } + + int GetWorkerNodeIndex() const { + if (m_worker && m_worker->IsEnabled()) { + int idx = m_worker->GetWorkerNodeIndex(); + return idx >= 0 ? idx : 0; + } + return 0; + } + + // Stripe globalVID across worker nodes (only for vectors added after build). + SizeType AllocateGlobalVID(SizeType localVID) const override { + int numWorkers = GetNumWorkerNodes(); + if (numWorkers <= 1 || localVID < m_initialVectorSize) return localVID; + return m_initialVectorSize + (localVID - m_initialVectorSize) * numWorkers + GetWorkerNodeIndex(); + } + + // Idempotent: wires the receiver's BatchAppend Jobs onto our shared + // SPDKThreadPool. Called both after pool creation and from + // SetWorker(); whichever happens last actually binds the submitter. + void WireJobSubmitterIfReady() { + if (!m_worker || !m_splitThreadPool) return; + auto pool = m_splitThreadPool; + m_worker->SetJobSubmitter(m_layer, + [pool](Helper::ThreadPool::Job* j, bool high) { + if (high) pool->add_high(j); + else pool->add(j); + }); + } + + /// Set the external WorkerNode pointer and bind all callbacks + /// (append, head-sync, remote-lock, merge-hint) at THIS instance's m_layer. + void SetWorker(WorkerNode* router) override { + m_worker = router; + if (!m_worker) return; + + WireJobSubmitterIfReady(); + + // Claim ownership so the matching destructor's IfOwner check + // clears the right slot if/when we are deleted (multi-layer SPANN + // each layer has its own slot keyed by m_layer). + m_worker->ClaimCallbackOwnership(m_layer, this); + + // Append callback: routes incoming remote appends to local Append() + m_worker->SetAppendCallback(m_layer, + [this](SizeType headID, std::shared_ptr headVec, + int appendNum, std::string& appendPosting) -> ErrorCode { + // Reuse SPDKThreadPool's per-worker pre-allocated workspace + // when called from BatchAppendItemJob on m_splitThreadPool. + ExtraWorkSpace localWorkSpace; + ExtraWorkSpace* ws = static_cast(tls_preallocAppendWorkSpace); + if (!ws) { + m_headIndex->InitWorkSpace(&localWorkSpace); + ws = &localWorkSpace; + } + bool wasMissing = !m_headIndex->ContainSample(headID, m_layer + 1); + if (wasMissing && headVec && !headVec->empty()) { + DimensionType dim = static_cast( + headVec->size() / sizeof(ValueType)); + m_headIndex->AddHeadIndex(headVec->data(), headID, 0, + dim, m_layer + 1, ws); + } + + // Mirror sender's version map for the records we're about + // to persist so MergePostings + SearchIndex don't drop + // them as "stale". See HEAD git history for rationale. + { + const uint8_t* basePtr = reinterpret_cast(appendPosting.data()); + size_t totalRec = appendPosting.size() / m_vectorInfoSize; + EnsureVersionMapCoversPosting(basePtr, totalRec, "AppendCallback", headID); + + const SizeType localCount = m_versionMap->Count(); + std::vector batchVids; + std::vector batchVers; + batchVids.reserve(totalRec); + batchVers.reserve(totalRec); + for (size_t i = 0; i < totalRec; ++i) { + const uint8_t* p = basePtr + i * m_vectorInfoSize; + SizeType vid = *reinterpret_cast(p); + uint8_t recVer = *(p + sizeof(SizeType)); + if (vid < 0 || vid >= localCount) continue; + if (recVer == 0xfe) continue; + uint8_t curVer = m_versionMap->GetVersion(vid); + if (curVer == 0xfe) continue; + if (curVer == recVer) continue; + batchVids.push_back(vid); + batchVers.push_back(recVer); + } + if (!batchVids.empty()) { + m_versionMap->SetVersionBatch(batchVids, batchVers); + } + } + return Append(ws, headID, appendNum, appendPosting, 0); + }); + + // Head sync callback: apply head index updates from peers + auto* headIndex = m_headIndex; + int layer = m_layer; + auto* worker = m_worker; + m_worker->SetHeadSyncCallback(m_layer, [headIndex, layer, worker](const HeadSyncEntry& entry) { + if (entry.op == HeadSyncEntry::Op::Add) { + headIndex->AddHeadIndex(entry.headVector.data(), entry.headVID, 0, + static_cast(entry.headVector.size() / sizeof(ValueType)), + layer + 1, nullptr); + if (worker) worker->NoteHeadSyncApplyAdd(); + } else { + headIndex->DeleteIndex(entry.headVID, layer + 1); + if (worker) worker->NoteHeadSyncApplyDelete(); + } + }); + + // Remote lock callback: per-bucket atomic flags + m_worker->SetRemoteLockCallback(m_layer, [this](SizeType headID, bool lock) -> bool { + unsigned bucket = COMMON::FineGrainedRWLock::BucketIndex(static_cast(headID)); + if (lock) { + bool expected = false; + if (!m_remoteBucketLocked[bucket].compare_exchange_strong(expected, true)) { + return false; + } + if (!m_rwLocks[headID].try_lock()) { + m_remoteBucketLocked[bucket].store(false); + return false; + } + m_rwLocks[headID].unlock(); + return true; + } else { + m_remoteBucketLocked[bucket].store(false); + return true; + } + }); + + // Cross-node merge hint callback + m_worker->SetMergeCallback(m_layer, [this](SizeType headID) { + MergeAsync(headID); + }); + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "WorkerNode bound to ExtraDynamicSearcher (layer %d)\n", m_layer); } - ~ExtraDynamicSearcher() {} + // Owner-side wait for any in-flight remote lock on this bucket. + void WaitForRemoteBucketUnlocked(SizeType headID) const { + if (!m_worker || !m_worker->IsEnabled()) return; + unsigned bucket = COMMON::FineGrainedRWLock::BucketIndex(static_cast(headID)); + if (!m_remoteBucketLocked[bucket].load(std::memory_order_acquire)) return; + constexpr int kMaxRemoteBucketWaitMs = 5000; + auto deadline = std::chrono::steady_clock::now() + + std::chrono::milliseconds(kMaxRemoteBucketWaitMs); + while (m_remoteBucketLocked[bucket].load(std::memory_order_acquire)) { + if (std::chrono::steady_clock::now() > deadline) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "WaitForRemoteBucketUnlocked: headID=%lld bucket=%u stuck for %d ms, proceeding\n", + (std::int64_t)headID, bucket, kMaxRemoteBucketWaitMs); + return; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + } + + // Pack and enqueue a RemoteAppendRequest for an already-resolved + // remote owner. headVecBytes may be nullptr when the caller has no + // centroid bytes (plain Append into an existing head). + void EnqueueRemoteAppend(int nodeIndex, + SizeType headID, + int appendNum, + std::string posting, + const void* headVecBytes = nullptr) { + RemoteAppendRequest req; + req.m_headID = headID; + req.m_layer = m_layer; + if (headVecBytes != nullptr) { + req.m_headVec.assign(static_cast(headVecBytes), + m_vectorDataSize); + } + req.m_appendNum = appendNum; + req.m_appendPosting = std::move(posting); + m_worker->QueueRemoteAppend(nodeIndex, std::move(req)); + } + + // If headID is owned by a remote node, queue the append for that + // node and return true; otherwise return false (caller continues + // with local write logic). + bool TryRouteRemoteAppend(SizeType headID, + int appendNum, + std::string posting, + const void* headVecBytes = nullptr) { + if (!m_worker || !m_worker->IsEnabled()) return false; + // Only the outer (head) layer participates in the owner-ring + // route. Inner layers (m_layer > 0) hold per-node-local state + // (no shared head VID space, no cross-node TiKV key naming + // contract), so each node services its own inner layer + // independently. Without this gate inner-layer appends would + // also dispatch RPCs that the receiver can't meaningfully + // apply. + if (m_layer != 0) return false; + auto target = m_worker->GetOwner(headID); + if (target.isLocal) return false; + EnqueueRemoteAppend(target.nodeIndex, headID, appendNum, + std::move(posting), headVecBytes); + return true; + } + + // Validate (and lazily extend) the local version map so that + // every (vid, ver) tuple in a posting we are about to write is + // representable. Without this, remote-originated postings carrying + // VIDs above our current Count() get dropped silently. + void EnsureVersionMapCoversPosting(const uint8_t* p_basePtr, size_t p_totalRec, + const char* p_caller, SizeType p_headID) { + const SizeType localCount = m_versionMap->Count(); + SizeType maxVid = -1; + for (size_t i = 0; i < p_totalRec; ++i) { + const uint8_t* p = p_basePtr + i * m_vectorInfoSize; + SizeType vid = *reinterpret_cast(p); + if (vid > maxVid) maxVid = vid; + } + if (maxVid >= localCount) { + SizeType need = maxVid + 1 - localCount; + m_versionMap->AddBatch(need); + SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, + "%s: extended local versionMap by %lld (head=%lld maxVid=%lld localCount=%lld)\n", + p_caller, (std::int64_t)need, (std::int64_t)p_headID, + (std::int64_t)maxVid, (std::int64_t)localCount); + } + } virtual bool Available() override { @@ -419,7 +672,12 @@ namespace SPTAG::SPANN { virtual ErrorCode AddIDCapacity(SizeType capa, bool deleted) override { - return m_versionMap->AddBatch(capa, deleted); + // Distributed: grow the version map by the FULL batch size + // (capa * numWorkers), not just this node's slice. Stripe formula + // in AllocateGlobalVID produces globalVIDs up to + // m_initialVectorSize + insertCount * numWorkers. + int numWorkers = GetNumWorkerNodes(); + return m_versionMap->AddBatch(capa * numWorkers, deleted); } SPANN::Index* GetHeadIndex() const { return m_headIndex; } @@ -616,6 +874,23 @@ namespace SPTAG::SPANN { double elapsedMSeconds; uint64_t splitPostingVectors = 0; uint64_t splitNewHeadCount = 0; + + // Only the OWNER of headID should run Split. Remote-issued + // splits get dropped early so we don't mutate a posting that + // doesn't live on this node. + if (m_worker && m_worker->IsEnabled()) { + auto target = m_worker->GetOwner(headID); + if (!target.isLocal) { + std::unique_lock tmplock(m_splitListLock); + m_splitList.unsafe_erase(headID); + return ErrorCode::Success; + } + } + + // Owner-side: wait for any in-flight remote-initiated lock on + // this bucket to release the advisory flag before we mutate. + WaitForRemoteBucketUnlocked(headID); + { std::unique_lock lock(m_rwLocks[headID], std::defer_lock); if (requirelock) { @@ -838,6 +1113,17 @@ namespace SPTAG::SPANN { //SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Split: new head VID %lld already exists in head index. Do merging...\n", (std::int64_t)(newHeadVID)); m_stat.m_splitExistingHeadMergeCount.fetch_add(1, std::memory_order_relaxed); + // If newHeadVID's owner is a remote node, route + // the new posting via RemoteAppend; the owner + // will merge it into the existing posting list. + if (TryRouteRemoteAppend( + newHeadVID, + (int)(newPostingLists[k].size() / m_vectorInfoSize), + newPostingLists[k], + args.centers + k * args._D)) { + if (m_rwLocks.hash_func(newHeadVID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); + continue; + } std::string mergedPostingList; std::set vectorIdSet; @@ -925,20 +1211,36 @@ namespace SPTAG::SPANN { SplitAsync(newHeadVID, currentLength); } } else { - auto splitPutBegin = std::chrono::high_resolution_clock::now(); - if ((ret=db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to add new posting %lld\n", (std::int64_t)(newHeadVID)); - return ret; + // If newHeadVID's owner is a remote node, route + // the initial posting via RemoteAppend so it + // ends up in the owner's TiKV. We still add the + // head locally and rely on BroadcastHeadSync + // (after this loop) to spread the head index + // update to all nodes. The receiver's + // AppendCallback materializes the head if its + // HeadSync hasn't arrived yet. + bool remoteCreated = TryRouteRemoteAppend( + newHeadVID, + (int)(newPostingLists[k].size() / m_vectorInfoSize), + newPostingLists[k], + args.centers + k * args._D); + + if (!remoteCreated) { + auto splitPutBegin = std::chrono::high_resolution_clock::now(); + if ((ret=db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to add new posting %lld\n", (std::int64_t)(newHeadVID)); + return ret; + } + CheckCentroid(newHeadVID, newPostingLists[k], "Split-NewPosting"); + auto splitPutEnd = std::chrono::high_resolution_clock::now(); + elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); + m_stat.m_putCost += elapsedMSeconds; } - CheckCentroid(newHeadVID, newPostingLists[k], "Split-NewPosting"); - auto splitPutEnd = std::chrono::high_resolution_clock::now(); - elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); - m_stat.m_putCost += elapsedMSeconds; auto updateHeadBegin = std::chrono::high_resolution_clock::now(); if ((ret = m_headIndex->AddHeadIndex(args.centers + k * args._D, newHeadVID, version, m_opt->m_dim, m_layer + 1, p_exWorkSpace)) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to update head index %lld\n", (std::int64_t)(newHeadVID)); - if (db->Delete(DBKey(newHeadVID)) != ErrorCode::Success) { + if (!remoteCreated && db->Delete(DBKey(newHeadVID)) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete gc posting %lld\n", (std::int64_t)(newHeadVID)); } return ret; @@ -962,6 +1264,35 @@ namespace SPTAG::SPANN { } } + // Broadcast HeadSync to peer nodes when the head update lands + // in our local BKT (in-memory, per-compute). Lower-layer head + // adds that resolve to m_extraSearchers[m_layer+1]->AddIndex + // already write to shared TiKV so re-broadcasting them would + // only duplicate. + if (m_worker && m_worker->IsEnabled() + && m_headIndex->GetDiskIndex(m_layer + 1) == nullptr) { + std::vector headSyncEntries; + for (int k = 0; k < 2; k++) { + if (args.counts[k] == 0 || (int)newHeadsID.size() <= k) continue; + HeadSyncEntry entry; + entry.op = HeadSyncEntry::Op::Add; + entry.headVID = newHeadsID[k]; + entry.m_layer = m_layer; + entry.headVector.assign(args.centers + k * args._D, args.centers + k * args._D + m_vectorDataSize); + headSyncEntries.push_back(std::move(entry)); + } + if (!theSameHead) { + HeadSyncEntry entry; + entry.op = HeadSyncEntry::Op::Delete; + entry.headVID = headID; + entry.m_layer = m_layer; + headSyncEntries.push_back(std::move(entry)); + } + if (!headSyncEntries.empty()) { + m_worker->BroadcastHeadSync(headSyncEntries); + } + } + { std::unique_lock tmplock(m_splitListLock); //SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"erase: %d\n", headID); @@ -1003,6 +1334,18 @@ namespace SPTAG::SPANN { ErrorCode MergePostings(ExtraWorkSpace *p_exWorkSpace, SizeType headID) { + // The owner runs its own merge passes. Skip when this head is + // owned by another node — we'd just be racing the owner. + if (m_worker && m_worker->IsEnabled()) { + auto target = m_worker->GetOwner(headID); + if (!target.isLocal) { + std::unique_lock tmplock(m_mergeListLock); + m_mergeList.unsafe_erase(headID); + return ErrorCode::Success; + } + } + WaitForRemoteBucketUnlocked(headID); + std::unique_lock lock(m_rwLocks[headID]); if (!m_headIndex->ContainSample(headID, m_layer + 1)) { @@ -1102,23 +1445,61 @@ namespace SPTAG::SPANN { int deletedLength = 0; { std::unique_lock anotherLock(m_rwLocks[queryResult->VID], std::defer_lock); - // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Locked: %d, to be lock: %d\n", headID, queryResult->VID); - if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) { - if (!anotherLock.try_lock()) { - auto* curJob = new MergeAsyncJob(this, headID, nullptr); - // Re-queue counts as a new submission; matched by the - // m_mergeJobsInFlight-- / m_totalMergeCompleted++ in - // MergeAsyncJob::exec(). Without these increments - // m_mergeJobsInFlight underflows to a huge uint64 - // and m_totalMergeCompleted exceeds m_totalMergeSubmitted. - m_mergeJobsInFlight++; - m_totalMergeSubmitted++; - m_splitThreadPool->add(curJob); - return ErrorCode::Success; + + // RAII guard for the advisory remote bucket lock. + struct RemoteLockGuard { + WorkerNode* router = nullptr; + int nodeIndex = -1; + int layer = 0; + SizeType headID = -1; + bool active = false; + ~RemoteLockGuard() { if (active && router) router->SendRemoteLock(nodeIndex, layer, headID, false); } + void release() { active = false; } + } remoteLockGuard; + + bool isRemoteCandidate = false; + int remoteNodeIndex = -1; + if (m_worker && m_worker->IsEnabled()) { + auto target = m_worker->GetOwner(queryResult->VID); + if (!target.isLocal) { + isRemoteCandidate = true; + remoteNodeIndex = target.nodeIndex; + if (!m_worker->SendRemoteLock(remoteNodeIndex, m_layer, queryResult->VID, true)) { + // Remote owner busy; skip this candidate. + continue; + } + remoteLockGuard.router = m_worker; + remoteLockGuard.nodeIndex = remoteNodeIndex; + remoteLockGuard.layer = m_layer; + remoteLockGuard.headID = queryResult->VID; + remoteLockGuard.active = true; } } - if (!m_headIndex->ContainSample(queryResult->VID, m_layer + 1)) continue; + + if (!isRemoteCandidate) { + // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Locked: %d, to be lock: %d\n", headID, queryResult->VID); + if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) { + if (!anotherLock.try_lock()) { + auto* curJob = new MergeAsyncJob(this, headID, nullptr); + // Re-queue counts as a new submission; matched by the + // m_mergeJobsInFlight-- / m_totalMergeCompleted++ in + // MergeAsyncJob::exec(). Without these increments + // m_mergeJobsInFlight underflows to a huge uint64 + // and m_totalMergeCompleted exceeds m_totalMergeSubmitted. + m_mergeJobsInFlight++; + m_totalMergeSubmitted++; + m_splitThreadPool->add(curJob); + return ErrorCode::Success; + } + } + if (!m_headIndex->ContainSample(queryResult->VID, m_layer + 1)) continue; + } + if ((ret=db->Get(DBKey(queryResult->VID), &nextPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if (isRemoteCandidate) { + // Stale fetch on remote side; skip and let next round retry. + continue; + } SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to get to be merged posting: %lld, get size:%d\n", (std::int64_t)(queryResult->VID), (int)(nextPostingList.size())); @@ -1143,6 +1524,14 @@ namespace SPTAG::SPANN { nextLength++; } if (resultVec == nullptr) { + if (isRemoteCandidate) { + // Stale fetch / version skew on remote side. Skip + // and let the next merge round retry. + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "MergePostings: remote candidate %lld has no head record in fetched posting, skipping\n", + (std::int64_t)(queryResult->VID)); + continue; + } SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MergePostings fail: cannot find another head vector in posting! headID:%lld\n", (std::int64_t)(queryResult->VID)); return ErrorCode::Fail; } @@ -1158,11 +1547,25 @@ namespace SPTAG::SPANN { return ret; } CheckCentroid(headID, mergedPostingList, "MergePostings-currentLength >= nextLength"); - m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1); - if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success) - { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(queryResult->VID)); - return ret; + if (isRemoteCandidate) { + // Survivor is local; delete remote loser first + // (so we don't have duplicate VID across nodes), + // then drop local head-index entry. + if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success + && ret != ErrorCode::Key_NotFound) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "MergePostings: remote-loser Delete(%lld) failed; survivor %lld is durable\n", + (std::int64_t)queryResult->VID, (std::int64_t)headID); + return ret; + } + m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1); + } else { + m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1); + if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success) + { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(queryResult->VID)); + return ret; + } } nextHeadID = headID; nextHeadVec = headVec; @@ -1175,6 +1578,12 @@ namespace SPTAG::SPANN { mergedPostingList += *resultVec; } if ((ret=db->Put(DBKey(queryResult->VID), mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if (isRemoteCandidate) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "MergePostings: remote-survivor Put(%lld) failed; no state mutated, next round will retry\n", + (std::int64_t)queryResult->VID); + return ret; + } SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MergePostings fail to override posting %lld after merge\n", (std::int64_t)(queryResult->VID)); return ret; } @@ -1182,6 +1591,12 @@ namespace SPTAG::SPANN { m_headIndex->DeleteIndex(headID, m_layer + 1); if ((ret = db->Delete(DBKey(headID))) != ErrorCode::Success) { + if (isRemoteCandidate) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "MergePostings: local-loser Delete(%lld) failed; remote survivor %lld is durable\n", + (std::int64_t)headID, (std::int64_t)queryResult->VID); + return ret; + } SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(headID)); return ret; } @@ -1191,7 +1606,15 @@ namespace SPTAG::SPANN { deletedPostingList = ¤tPostingList; deletedLength = currentLength; } - if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); + if (isRemoteCandidate) { + // Release advisory remote lock before reassign below. + if (remoteLockGuard.active) { + remoteLockGuard.router->SendRemoteLock( + remoteLockGuard.nodeIndex, remoteLockGuard.layer, + remoteLockGuard.headID, false); + remoteLockGuard.release(); + } + } else if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); } // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Release: %d, Release: %d\n", headID, queryResult->VID); @@ -1553,6 +1976,38 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Error!, headID :%lld, appendNum:%d\n", (std::int64_t)headID, appendNum); } + // If this head is owned by a remote node, route the append via + // QueueRemoteAppend instead of touching local TiKV. appendNum is + // captured BEFORE std::move(appendPosting) to avoid use-after-move. + // If the batch carries the head's own self-entry (VID == headID), + // forward its vector bytes so the receiver can materialize the + // head index before the BroadcastHeadSync arrives. See the + // matching scan in BatchAppend() for rationale. + { + const uint8_t* basePtr = + reinterpret_cast(appendPosting.data()); + const void* headVecBytes = nullptr; + for (int i = 0; i < appendNum; ++i) { + const uint8_t* p = basePtr + i * m_vectorInfoSize; + SizeType vid = *reinterpret_cast(p); + if (vid == headID) { + headVecBytes = p + m_metaDataSize; + break; + } + } + if (TryRouteRemoteAppend(headID, appendNum, appendPosting, headVecBytes)) { + if (!reassignThreshold) { + m_totalAppendCount++; + m_stat.m_appendTaskNum++; + } + return ErrorCode::Success; + } + } + + // If a remote initiator is currently holding the advisory lock + // on this bucket, wait it out before we touch the posting. + WaitForRemoteBucketUnlocked(headID); + checkDeleted: if (!m_headIndex->ContainSample(headID, m_layer + 1)) { for (int i = 0; i < appendNum; i++) @@ -1684,6 +2139,41 @@ namespace SPTAG::SPANN { auto appendIt = headAppends.find(headID); if (appendIt == headAppends.end()) continue; + // Owner gate: forward heads owned by a remote node via the + // batched RemoteAppend queue. Local heads fall through to + // the standard MultiMerge path below. Without this hook, + // every node writes to every head's TiKV key and the owner + // ring is ignored (no remote RPC, no route stats). + // + // Pass headVecBytes when this batch carries the head's own + // self-entry (VID == headID). During Build-time seed the + // receiver may not yet have the head index entry; without + // headVecBytes its AppendCallback can't materialize the head + // and falls into the ReassignAsync redirect path, dropping + // the self-entry from the posting and later causing + // "MergePostings fail: cannot find head vector in posting!". + { + const std::string& posting = appendIt->second; + const uint8_t* basePtr = + reinterpret_cast(posting.data()); + size_t totalRec = posting.size() / m_vectorInfoSize; + const void* headVecBytes = nullptr; + for (size_t i = 0; i < totalRec; ++i) { + const uint8_t* p = basePtr + i * m_vectorInfoSize; + SizeType vid = *reinterpret_cast(p); + if (vid == headID) { + headVecBytes = p + m_metaDataSize; + break; + } + } + if (TryRouteRemoteAppend(headID, + (int)(posting.size() / m_vectorInfoSize), + posting, + headVecBytes)) { + continue; + } + } + std::unique_lock headLock(m_rwLocks[headID]); if (!m_headIndex->ContainSample(headID, m_layer + 1)) { @@ -1788,6 +2278,10 @@ namespace SPTAG::SPANN { //LOG(Helper::LogLevel::LL_Info, "Reassign: oldVID:%d, replicaCount:%d, candidateNum:%d, dist0:%f\n", oldVID, replicaCount, i, selections[0].distance); for (int i = 0; i < replicaCount && m_versionMap->GetVersion(VID) == version; i++) { //LOG(Helper::LogLevel::LL_Info, "Reassign: headID :%d, oldVID:%d, newVID:%d, posting length: %d, dist: %f, string size: %d\n", headID, oldVID, VID, m_postingSizes[headID].load(), selections[i].distance, newPart.size()); + if (TryRouteRemoteAppend(selections[i].VID, 1, *vectorInfo, + selections[i].Vec.Data())) { + continue; + } // [FIX H3] use reassignThreshold=0 so that an oversized // target posting triggers SplitAsync (not a synchronous // Split on this worker thread). This matches the @@ -1813,6 +2307,7 @@ namespace SPTAG::SPANN { bool LoadIndex(Options& p_opt) override { m_opt = &p_opt; + m_initialVectorSize = p_opt.m_vectorSize; // initial count for VID stripe SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "DataBlockSize: %d, Capacity: %d\n", m_opt->m_datasetRowsInBlock, m_opt->m_datasetCapacity); std::string versionmapPath = m_opt->m_indexDirectory + FolderSep + m_opt->m_deleteIDFile + "_" + std::to_string(m_layer); if (m_opt->m_recovery) { @@ -1901,13 +2396,33 @@ namespace SPTAG::SPANN { } if (m_opt->m_update) { if (m_splitThreadPool == nullptr) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum); - - m_splitThreadPool = std::make_shared(); - m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this); - //m_reassignThreadPool = std::make_shared(); - //m_reassignThreadPool->initSPDK(m_opt->m_reassignThreadNum, this); - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization\n"); + // Only layer 0 participates in the shared-pool slot: + // it both adopts (if a sibling published first) and + // publishes (so the WorkerNode receiver and any later + // layer-0 instance can reuse the same threads). + // Inner layers (m_layer > 0) always create their own + // pool, matching qianxi's per-instance pool design. + if (m_layer == 0 && m_headIndex) { + auto shared = m_headIndex->GetSharedSplitPool(); + if (shared) { + m_splitThreadPool = std::static_pointer_cast(shared); + } + } + if (m_splitThreadPool == nullptr) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum); + + m_splitThreadPool = std::make_shared(); + m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this); + //m_reassignThreadPool = std::make_shared(); + //m_reassignThreadPool->initSPDK(m_opt->m_reassignThreadNum, this); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization\n"); + if (m_layer == 0 && m_headIndex) m_headIndex->SetSharedSplitPool(m_splitThreadPool); + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: adopted shared split pool from sibling layer\n"); + } + // Pool is now ready: re-attempt wiring the worker's job + // submitter (may have been set before pool was alive). + WireJobSubmitterIfReady(); } if (m_opt->m_enableWAL && !m_opt->m_persistentBufferPath.empty()) { @@ -2345,6 +2860,7 @@ namespace SPTAG::SPANN { { auto fullVectors = p_reader->GetVectorSet(); fullCount = fullVectors->Count(); + m_initialVectorSize = fullCount; // remember bulk-build count for stripe formula m_metaDataSize = sizeof(SizeType) + sizeof(uint8_t); m_vectorDataSize = fullVectors->PerVectorDataSize(); m_vectorInfoSize = m_vectorDataSize + m_metaDataSize; @@ -2556,10 +3072,20 @@ namespace SPTAG::SPANN { if (m_opt->m_update && !m_opt->m_allowZeroReplica && zeroReplicaCount > 0) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum); - m_splitThreadPool = std::make_shared(); - m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this); - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization, zeroReplicaCount:%zu\n", zeroReplicaCount); + if (m_splitThreadPool == nullptr && m_layer == 0 && m_headIndex) { + auto shared = m_headIndex->GetSharedSplitPool(); + if (shared) { + m_splitThreadPool = std::static_pointer_cast(shared); + } + } + if (m_splitThreadPool == nullptr) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum); + m_splitThreadPool = std::make_shared(); + m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization, zeroReplicaCount:%zu\n", zeroReplicaCount); + if (m_layer == 0 && m_headIndex) m_headIndex->SetSharedSplitPool(m_splitThreadPool); + } + WireJobSubmitterIfReady(); uint32_t splitNumBeforeZeroReplica = m_stat.m_splitNum; uint32_t reassignNumBeforeZeroReplica = m_stat.m_reAssignNum; @@ -2834,6 +3360,16 @@ namespace SPTAG::SPANN { return ErrorCode::VectorNotFound; } + ErrorCode FlushRemoteAppends() { + if (m_worker && m_worker->IsEnabled()) { + ErrorCode ret = m_worker->FlushRemoteAppends(); + m_worker->LogRouteStats(" (batch flush)"); + m_worker->ResetRouteStats(); + return ret; + } + return ErrorCode::Success; + } + bool AllFinished() { if (!m_splitThreadPool) return true; diff --git a/AnnService/inc/Core/SPANN/ExtraTiKVController.h b/AnnService/inc/Core/SPANN/ExtraTiKVController.h index d7528d479..0541eaad1 100644 --- a/AnnService/inc/Core/SPANN/ExtraTiKVController.h +++ b/AnnService/inc/Core/SPANN/ExtraTiKVController.h @@ -12,6 +12,7 @@ #include "kvproto/tikvpb.grpc.pb.h" #include "kvproto/kvrpcpb.pb.h" #include "kvproto/metapb.pb.h" +#include "kvproto/pdpb.pb.h" #include "kvproto/pdpb.grpc.pb.h" #include diff --git a/AnnService/inc/Core/SPANN/IExtraSearcher.h b/AnnService/inc/Core/SPANN/IExtraSearcher.h index 554b02421..ec8d8bf95 100644 --- a/AnnService/inc/Core/SPANN/IExtraSearcher.h +++ b/AnnService/inc/Core/SPANN/IExtraSearcher.h @@ -22,6 +22,11 @@ namespace SPTAG { namespace SPANN { + // Forward declaration; the only IExtraSearcher API that touches WorkerNode + // is the SetWorker() hook below. Concrete searchers that care + // (ExtraDynamicSearcher) include the full header and override. + class WorkerNode; + struct SearchStats { SearchStats() @@ -589,6 +594,11 @@ namespace SPTAG { SizeType p_begin) { return ErrorCode::Undefined; } virtual ErrorCode DeleteIndex(SizeType p_id) { return ErrorCode::Undefined; } + // Allocate globalVID to this node's BKT counter. + // ExtraDynamicSearcher overrides this with + // the stripe formula when m_worker is enabled. + virtual SizeType AllocateGlobalVID(SizeType p_localVID) const { return p_localVID; } + virtual SizeType GetNumSamples() const = 0; virtual bool ContainSample(const SizeType idx) const @@ -624,6 +634,11 @@ namespace SPTAG { return ErrorCode::Undefined; } + // Bind a routing worker (no-op by default). ExtraDynamicSearcher + // overrides this to install the cross-node append + put + + // fetch-postings callbacks. ExtraStaticSearcher etc. ignore it. + virtual void SetWorker(WorkerNode* /*worker*/) {} + virtual bool AllFinished() { return false; } virtual void GetDBStats() { return; } virtual int64_t GetNumBlocks() { return 0; } @@ -640,6 +655,8 @@ namespace SPTAG { } virtual ErrorCode Checkpoint(std::string prefix) { return ErrorCode::Success; } + + virtual void InitWorkSpace(ExtraWorkSpace* p_exWorkSpace, bool clear = false) {} }; } // SPANN } // SPTAG diff --git a/AnnService/inc/Core/SPANN/Index.h b/AnnService/inc/Core/SPANN/Index.h index 5479d2d42..255043a58 100644 --- a/AnnService/inc/Core/SPANN/Index.h +++ b/AnnService/inc/Core/SPANN/Index.h @@ -47,6 +47,11 @@ namespace SPTAG template class SPANNResultIterator; + // Forward-declare so Index can hold/forward a WorkerNode pointer + // without dragging in the full Distributed/WorkerNode.h header (and + // thus its boost-asio + grpc transitive deps) into Index.h. + class WorkerNode; + template class Index; template @@ -63,6 +68,12 @@ namespace SPTAG std::vector> m_extraSearchers; std::unique_ptr> m_workSpaceFactory; + // Routing worker bound BEFORE BuildIndex so that + // ExtraDynamicSearcher::WriteDownAllPostingToDB and other build + // hooks see a non-null m_worker as each layer's searcher is + // emplaced. SPFreshTest sets this in BuildOnly+Distributed mode. + WorkerNode* m_pendingWorker = nullptr; + Options m_options; std::function m_fComputeDistance; @@ -85,6 +96,14 @@ namespace SPTAG std::shared_ptr> m_freeWorkSpaceIds; std::atomic m_workspaceCount = 0; + // Single split/append thread pool shared by all extraSearchers + // (one per layer). Lazily populated by the first layer that + // initializes its pool inside LoadIndex; subsequent layers + // adopt the same shared instance so the total worker count + // is AppendThreadNum (not AppendThreadNum * layers). + mutable std::mutex m_sharedSplitPoolMutex; + std::shared_ptr m_sharedSplitPool; + public: Index() { @@ -124,6 +143,27 @@ namespace SPTAG inline std::shared_ptr GetDiskIndex(int layer = 0) { if (layer < m_extraSearchers.size()) return m_extraSearchers[layer]; else return nullptr; } inline Options* GetOptions() { return &m_options; } + // Bind a routing worker. Forwards to all currently-existing + // extraSearchers and remembers the pointer so newly-emplaced + // searchers (created during BuildIndexInternalLayer) also pick + // it up. Pass nullptr to detach. + void SetWorker(WorkerNode* worker) { + m_pendingWorker = worker; + for (auto& searcher : m_extraSearchers) { + if (searcher) searcher->SetWorker(worker); + } + } + inline WorkerNode* GetPendingWorker() const { return m_pendingWorker; } + + inline std::shared_ptr GetSharedSplitPool() const { + std::lock_guard lk(m_sharedSplitPoolMutex); + return m_sharedSplitPool; + } + inline void SetSharedSplitPool(std::shared_ptr pool) { + std::lock_guard lk(m_sharedSplitPoolMutex); + m_sharedSplitPool = std::move(pool); + } + inline SizeType GetNumSamples() const { return GetNumSamples(0); } inline SizeType GetNumSamples(int layer) const { if (layer < m_extraSearchers.size()) return m_extraSearchers[layer]->GetNumSamples(); else return m_topIndex->GetNumSamples(); } inline DimensionType GetFeatureDim() const { return m_topIndex->GetFeatureDim(); } diff --git a/AnnService/inc/Core/VectorIndex.h b/AnnService/inc/Core/VectorIndex.h index a25bf1e63..62e2ca843 100644 --- a/AnnService/inc/Core/VectorIndex.h +++ b/AnnService/inc/Core/VectorIndex.h @@ -5,6 +5,7 @@ #define _SPTAG_VECTORINDEX_H_ #include +#include #include "Common.h" #include "Common/WorkSpace.h" #include "inc/Helper/DiskIO.h" @@ -160,6 +161,14 @@ class VectorIndex static ErrorCode LoadIndex(const std::string& p_loaderFilePath, std::shared_ptr& p_vectorIndex); + /// LoadIndex with config overrides applied between LoadIndexConfig and LoadIndexData, + /// so settings such as TiKVPDAddresses take effect before the underlying KV connection + /// is constructed. Override keys may be section-qualified ("Section.Param"); unqualified + /// keys default to the "BuildSSDIndex" section. + static ErrorCode LoadIndex(const std::string& p_loaderFilePath, + const std::map& p_paramOverrides, + std::shared_ptr& p_vectorIndex); + static ErrorCode LoadIndexFromFile(const std::string& p_file, std::shared_ptr& p_vectorIndex); static ErrorCode LoadIndex(const std::string& p_config, const std::vector& p_indexBlobs, std::shared_ptr& p_vectorIndex); diff --git a/AnnService/inc/Helper/KeyValueIO.h b/AnnService/inc/Helper/KeyValueIO.h index a7c3c25b8..9d7c1e2a3 100644 --- a/AnnService/inc/Helper/KeyValueIO.h +++ b/AnnService/inc/Helper/KeyValueIO.h @@ -34,6 +34,20 @@ namespace SPTAG virtual ErrorCode Put(const SizeType key, const std::string& value, const std::chrono::microseconds& timeout, std::vector* reqs) = 0; + // Batched writes/deletes. Default implementations return Undefined so that + // backends without native batching (RocksDB, FileIO) can ignore them. + // TiKVIO overrides these to issue a single batched RPC per region group, + // which dramatically reduces the number of synchronous gRPC round-trips + // when callers (e.g. SPANN AddIndex Phase 2 / PutPostingToDB) want to + // commit several keys at once. + virtual ErrorCode MultiPut(const std::vector& keys, + const std::vector& values, + const std::chrono::microseconds& timeout, + std::vector* reqs) { return ErrorCode::Undefined; } + + virtual ErrorCode MultiDelete(const std::vector& keys, + const std::chrono::microseconds& timeout) { return ErrorCode::Undefined; } + virtual ErrorCode Merge(const SizeType key, const std::string &value, const std::chrono::microseconds &timeout, std::vector *reqs, int& size) = 0; diff --git a/AnnService/inc/Helper/ThreadPool.h b/AnnService/inc/Helper/ThreadPool.h index 01c82e2a7..a351a75c8 100644 --- a/AnnService/inc/Helper/ThreadPool.h +++ b/AnnService/inc/Helper/ThreadPool.h @@ -5,7 +5,7 @@ #define _SPTAG_HELPER_THREADPOOL_H_ #include -#include +#include #include #include #include @@ -78,28 +78,42 @@ namespace SPTAG { { std::lock_guard lock(m_lock); - m_jobs.push_back(j); + m_jobs.push(j); } m_cond.notify_one(); } - void addfront(Job* j) + // High-priority push: jobs in m_highJobs always run before m_jobs. + // Used by the distributed receiver to let inbound BatchAppend RPC + // work jump ahead of local Split/Merge/Reassign so the sender + // (driver) doesn't time out waiting for the chunk ack while the + // local pool drains long-running rebalance work. + void add_high(Job* j) { { std::lock_guard lock(m_lock); - m_jobs.push_front(j); + m_highJobs.push(j); } m_cond.notify_one(); } + // Alias kept for compatibility with code that calls addfront() + // (e.g., split-async path). Same semantics as add_high. + void addfront(Job* j) { add_high(j); } + bool get(Job*& j) { std::unique_lock lock(m_lock); - while (m_jobs.empty() && !m_abort.ShouldAbort()) m_cond.wait(lock); + while (m_jobs.empty() && m_highJobs.empty() && !m_abort.ShouldAbort()) m_cond.wait(lock); if (!m_abort.ShouldAbort()) { - j = m_jobs.front(); + if (!m_highJobs.empty()) { + j = m_highJobs.front(); + m_highJobs.pop(); + } else { + j = m_jobs.front(); + m_jobs.pop(); + } currentJobs++; - m_jobs.pop_front(); return true; } return false; @@ -108,7 +122,7 @@ namespace SPTAG size_t jobsize() { std::lock_guard lock(m_lock); - return m_jobs.size(); + return m_jobs.size() + m_highJobs.size(); } inline uint32_t runningJobs() { return currentJobs; } @@ -122,7 +136,8 @@ namespace SPTAG protected: std::atomic_uint32_t currentJobs{ 0 }; - std::deque m_jobs; + std::queue m_jobs; + std::queue m_highJobs; Abort m_abort; std::mutex m_lock; std::condition_variable m_cond; diff --git a/AnnService/inc/Socket/ConnectionManager.h b/AnnService/inc/Socket/ConnectionManager.h index e487c6105..0c199ecb1 100644 --- a/AnnService/inc/Socket/ConnectionManager.h +++ b/AnnService/inc/Socket/ConnectionManager.h @@ -41,7 +41,11 @@ class ConnectionManager : public std::enable_shared_from_this inline static std::uint32_t GetPosition(ConnectionID p_connectionID); private: - static constexpr std::uint32_t c_connectionPoolSize = 1 << 8; + // Bumped from 1<<8 (256) to 1<<12 (4096) to avoid silently dropping new + // connections when reconnect storms (e.g., from concurrent FlushRemoteAppends + // timeouts) saturate the pool. Each ConnectionItem is small; 4096 slots is + // ~64KB per ConnectionManager, which is negligible. + static constexpr std::uint32_t c_connectionPoolSize = 1 << 12; static constexpr std::uint32_t c_connectionPoolMask = c_connectionPoolSize - 1; diff --git a/AnnService/inc/Socket/Packet.h b/AnnService/inc/Socket/Packet.h index 8c99b09fe..6d8c1d146 100644 --- a/AnnService/inc/Socket/Packet.h +++ b/AnnService/inc/Socket/Packet.h @@ -27,13 +27,47 @@ enum class PacketType : std::uint8_t SearchRequest = 0x03, + AppendRequest = 0x04, + + BatchAppendRequest = 0x05, + + HeadSyncRequest = 0x07, + + RemoteLockRequest = 0x08, + + DispatchCommand = 0x09, + + NodeRegisterRequest = 0x0A, + + RingUpdate = 0x0B, + + RingUpdateACK = 0x0C, + + // Cross-node merge hint. Search on node X observes posting H is + // underfull, but H is owned by node Y. X sends MergeRequest to Y so + // Y can schedule its own MergeAsync(H). Fire-and-forget (no response + // packet): the receiver's MergeAsync already dedups via m_mergeList, + // a lost notification just means Y discovers H underfull via some + // other path (own search, own Append, explicit RefineIndex). + MergeRequest = 0x11, + ResponseMask = 0x80, + NodeRegisterResponse = ResponseMask | NodeRegisterRequest, + HeartbeatResponse = ResponseMask | HeartbeatRequest, RegisterResponse = ResponseMask | RegisterRequest, - SearchResponse = ResponseMask | SearchRequest + SearchResponse = ResponseMask | SearchRequest, + + AppendResponse = ResponseMask | AppendRequest, + + BatchAppendResponse = ResponseMask | BatchAppendRequest, + + RemoteLockResponse = ResponseMask | RemoteLockRequest, + + DispatchResult = ResponseMask | DispatchCommand, }; diff --git a/AnnService/inc/Socket/SimpleSerialization.h b/AnnService/inc/Socket/SimpleSerialization.h index 6da925625..e0b8141dd 100644 --- a/AnnService/inc/Socket/SimpleSerialization.h +++ b/AnnService/inc/Socket/SimpleSerialization.h @@ -82,6 +82,58 @@ namespace SimpleSerialization } + /// Bounds-checked variants of SimpleReadBuffer. + /// All return nullptr if a read would overrun [p_buffer, p_bufEnd). + /// p_buffer is also returned as nullptr (and p_val left unchanged) if it is already nullptr. + template + inline const std::uint8_t* + SafeSimpleReadBuffer(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd, T& p_val) + { + static_assert(std::is_fundamental::value || std::is_enum::value, + "Only applied for fundanmental type."); + + if (p_buffer == nullptr) return nullptr; + if (p_bufEnd != nullptr && static_cast(p_bufEnd - p_buffer) < sizeof(T)) return nullptr; + p_val = *(reinterpret_cast(p_buffer)); + return p_buffer + sizeof(T); + } + + + inline const std::uint8_t* + SafeSimpleReadBuffer(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd, std::string& p_val) + { + p_val.clear(); + if (p_buffer == nullptr) return nullptr; + std::uint32_t len = 0; + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, len); + if (p_buffer == nullptr) return nullptr; + if (len > 0) + { + if (p_bufEnd != nullptr && static_cast(p_bufEnd - p_buffer) < len) return nullptr; + p_val.assign(reinterpret_cast(p_buffer), len); + } + return p_buffer + len; + } + + + inline const std::uint8_t* + SafeSimpleReadBuffer(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd, ByteArray& p_val) + { + p_val.Clear(); + if (p_buffer == nullptr) return nullptr; + std::uint32_t len = 0; + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, len); + if (p_buffer == nullptr) return nullptr; + if (len > 0) + { + if (p_bufEnd != nullptr && static_cast(p_bufEnd - p_buffer) < len) return nullptr; + p_val = ByteArray::Alloc(len); + std::memcpy(p_val.Data(), p_buffer, len); + } + return p_buffer + len; + } + + template<> inline std::size_t EstimateBufferSize(const std::string& p_val) diff --git a/AnnService/src/Core/SPANN/ExtraFileController.cpp b/AnnService/src/Core/SPANN/ExtraFileController.cpp index 24c839455..b5db83822 100644 --- a/AnnService/src/Core/SPANN/ExtraFileController.cpp +++ b/AnnService/src/Core/SPANN/ExtraFileController.cpp @@ -25,7 +25,7 @@ bool FileIO::BlockController::Initialize(SPANN::Options &p_opt, int p_layer) #ifndef _MSC_VER O_RDWR | O_DIRECT, numblocks, 2, 2, max(p_opt.m_ioThreads, (2 * max(p_opt.m_searchThreadNum, p_opt.m_iSSDNumberOfThreads) + - (p_opt.m_layers + 1) * (p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum))), + p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum)), ((std::uint64_t)p_opt.m_startFileSize) << 30 #else GENERIC_READ | GENERIC_WRITE, numblocks, 2, 2, diff --git a/AnnService/src/Core/SPANN/SPANNIndex.cpp b/AnnService/src/Core/SPANN/SPANNIndex.cpp index f3f83dca6..38ea1c72d 100644 --- a/AnnService/src/Core/SPANN/SPANNIndex.cpp +++ b/AnnService/src/Core/SPANN/SPANNIndex.cpp @@ -1227,6 +1227,15 @@ template ErrorCode Index::BuildIndexInternalLayer(std::shared_pt m_extraSearchers.emplace_back(std::make_shared>(m_options, m_extraSearchers.size(), this, m_db)); } + // Hand the routing worker (if any) to the freshly-created searcher + // before BuildIndex runs. Build itself no longer routes postings + // (shared TiKV cluster — the driver writes straight to TiKV and PD + // routes each key to the owning store), but other build-time hooks + // that consult m_worker still benefit from seeing a non-null value. + if (m_pendingWorker) { + m_extraSearchers.back()->SetWorker(m_pendingWorker); + } + { std::shared_ptr ptr = SPTAG::f_createIO(); if (ptr == nullptr || @@ -1862,7 +1871,74 @@ ErrorCode Index::AddIndex(const void *p_data, SizeType p_vectorNum, Dimension } workSpace->m_deduper.clear(); workSpace->m_postingIDs.clear(); - return m_extraSearchers[0]->AddIndex(workSpace.get(), vectorSet, begin); + + // Use multiple threads for RNGSelection + Append when vector count is large enough. + // Each thread fetch_add's one vector and calls ExtraDynamicSearcher::AddIndex with a + // single-vector view, so AppendBatchAsync flushes per-vector and pipelines with the + // worker side rather than queuing the whole batch behind a single huge flush. + if (p_vectorNum > 1 && m_options.m_iSSDNumberOfThreads > 1) { + int numThreads = std::min((int)p_vectorNum, m_options.m_iSSDNumberOfThreads); + std::atomic_int nextVec{0}; + std::atomic globalError{ErrorCode::Success}; + int printStep = std::max(1, p_vectorNum / 50); + + auto worker = [&](bool isFirst) { + std::unique_ptr ws; + ExtraWorkSpace* wsPtr; + if (isFirst) { + wsPtr = workSpace.get(); + } else { + ws = m_workSpaceFactory->GetWorkSpace(); + if (!ws) { + ws.reset(new ExtraWorkSpace()); + InitWorkSpace(ws.get(), false); + } else { + InitWorkSpace(ws.get(), true); + } + ws->m_deduper.clear(); + ws->m_postingIDs.clear(); + wsPtr = ws.get(); + } + + while (globalError.load(std::memory_order_relaxed) == ErrorCode::Success) { + int v = nextVec.fetch_add(1); + if (v >= p_vectorNum) break; + + if (v % printStep == 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "AddIndex bulk: %d/%d (%.1f%%)\n", + v, p_vectorNum, v * 100.0 / p_vectorNum); + GetDBStat(); + } + + std::shared_ptr singleVec = std::make_shared( + ByteArray((std::uint8_t*)vectorSet->GetVector(v), + sizeof(T) * p_dimension, false), + GetEnumValueType(), p_dimension, 1); + ErrorCode ret = m_extraSearchers[0]->AddIndex(wsPtr, singleVec, + m_extraSearchers[0]->AllocateGlobalVID(begin + v)); + if (ret != ErrorCode::Success) { + globalError.store(ret, std::memory_order_relaxed); + } + } + + if (!isFirst && ws) { + m_workSpaceFactory->ReturnWorkSpace(std::move(ws)); + } + }; + + std::vector threads; + threads.reserve(numThreads - 1); + for (int t = 1; t < numThreads; t++) { + threads.emplace_back(worker, false); + } + worker(true); + for (auto& t : threads) t.join(); + + return globalError.load(); + } + + return m_extraSearchers[0]->AddIndex(workSpace.get(), vectorSet, + m_extraSearchers[0]->AllocateGlobalVID(begin)); } template diff --git a/AnnService/src/Core/VectorIndex.cpp b/AnnService/src/Core/VectorIndex.cpp index 2f8ebfd13..35bcaf585 100644 --- a/AnnService/src/Core/VectorIndex.cpp +++ b/AnnService/src/Core/VectorIndex.cpp @@ -793,6 +793,14 @@ std::shared_ptr VectorIndex::CreateInstance(IndexAlgoType p_algo, V } ErrorCode VectorIndex::LoadIndex(const std::string &p_loaderFilePath, std::shared_ptr &p_vectorIndex) +{ + static const std::map emptyOverrides; + return LoadIndex(p_loaderFilePath, emptyOverrides, p_vectorIndex); +} + +ErrorCode VectorIndex::LoadIndex(const std::string &p_loaderFilePath, + const std::map &p_paramOverrides, + std::shared_ptr &p_vectorIndex) { std::string folderPath(p_loaderFilePath); if (!folderPath.empty() && *(folderPath.rbegin()) != FolderSep) @@ -816,6 +824,23 @@ ErrorCode VectorIndex::LoadIndex(const std::string &p_loaderFilePath, std::share if ((ret = p_vectorIndex->LoadIndexConfig(iniReader)) != ErrorCode::Success) return ret; + // Apply param overrides AFTER LoadIndexConfig but BEFORE LoadIndexData, so that + // settings like TiKVPDAddresses are reflected in m_options before the KV connection + // is constructed inside LoadIndexData -> PrepareDB. + for (const auto &kv : p_paramOverrides) + { + const std::string &key = kv.first; + const std::string &val = kv.second; + auto dotPos = key.find('.'); + if (dotPos != std::string::npos) { + std::string section = key.substr(0, dotPos); + std::string param = key.substr(dotPos + 1); + p_vectorIndex->SetParameter(param.c_str(), val.c_str(), section.c_str()); + } else { + p_vectorIndex->SetParameter(key.c_str(), val.c_str(), "BuildSSDIndex"); + } + } + std::shared_ptr> indexfiles = p_vectorIndex->GetIndexFiles(); if (iniReader.DoesSectionExist("MetaData")) { diff --git a/AnnService/src/Socket/Connection.cpp b/AnnService/src/Socket/Connection.cpp index 150889d2f..444c7afb0 100644 --- a/AnnService/src/Socket/Connection.cpp +++ b/AnnService/src/Socket/Connection.cpp @@ -26,10 +26,19 @@ Connection::Connection(ConnectionID p_connectionID, boost::asio::ip::tcp::socket void Connection::Start() { - SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Start, local: %u, remote: %s:%u\n", - static_cast(m_socket.local_endpoint().port()), - m_socket.remote_endpoint().address().to_string().c_str(), - static_cast(m_socket.remote_endpoint().port())); + boost::system::error_code epEc; + auto localEp = m_socket.local_endpoint(epEc); + auto remoteEp = m_socket.remote_endpoint(epEc); + if (!epEc) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Start, local: %u, remote: %s:%u\n", + static_cast(localEp.port()), + remoteEp.address().to_string().c_str(), + static_cast(remoteEp.port())); + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Connection Start, socket not connected: %s\n", + epEc.message().c_str()); + return; + } if (!m_stopped.exchange(false)) { @@ -42,10 +51,15 @@ void Connection::Start() void Connection::Stop() { - SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Stop, local: %u, remote: %s:%u\n", - static_cast(m_socket.local_endpoint().port()), - m_socket.remote_endpoint().address().to_string().c_str(), - static_cast(m_socket.remote_endpoint().port())); + boost::system::error_code epEc; + auto localEp = m_socket.local_endpoint(epEc); + auto remoteEp = m_socket.remote_endpoint(epEc); + if (!epEc) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Stop, local: %u, remote: %s:%u\n", + static_cast(localEp.port()), + remoteEp.address().to_string().c_str(), + static_cast(remoteEp.port())); + } if (m_stopped.exchange(true)) { diff --git a/AnnService/src/Socket/Server.cpp b/AnnService/src/Socket/Server.cpp index 9781bf1d4..8be0682c6 100644 --- a/AnnService/src/Socket/Server.cpp +++ b/AnnService/src/Socket/Server.cpp @@ -26,7 +26,7 @@ Server::Server(const std::string &p_address, const std::string &p_port, const Pa boost::asio::ip::tcp::endpoint endpoint = *(endPoints.begin()); m_acceptor.open(endpoint.protocol()); - m_acceptor.set_option(boost::asio::ip::tcp::acceptor::reuse_address(false)); + m_acceptor.set_option(boost::asio::ip::tcp::acceptor::reuse_address(true)); m_acceptor.bind(endpoint, errCode); if (errCode) diff --git a/Test/CMakeLists.txt b/Test/CMakeLists.txt index 52f4168a9..27bdeebb5 100644 --- a/Test/CMakeLists.txt +++ b/Test/CMakeLists.txt @@ -24,7 +24,7 @@ if (NOT LIBRARYONLY) file(GLOB TEST_HDR_FILES ${PROJECT_SOURCE_DIR}/Test/inc/Test.h) file(GLOB TEST_SRC_FILES ${PROJECT_SOURCE_DIR}/Test/src/*.cpp) add_executable(SPTAGTest ${TEST_SRC_FILES} ${TEST_HDR_FILES}) - target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES}) + target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES} absl_synchronization absl_cord absl_cordz_info absl_cord_internal absl_cordz_functions absl_cordz_handle) install(TARGETS SPTAGTest RUNTIME DESTINATION bin diff --git a/Test/inc/TestDataGenerator.h b/Test/inc/TestDataGenerator.h index 5820c8422..9f958f43d 100644 --- a/Test/inc/TestDataGenerator.h +++ b/Test/inc/TestDataGenerator.h @@ -29,7 +29,20 @@ namespace TestUtils { static std::shared_ptr LoadMetadataSet(const std::string pmetaset, const std::string pmetaidx, SPTAG::SizeType start = 0, SPTAG::SizeType count = -1); - static float EvaluateRecall(const std::vector &res, std::shared_ptr &truth, int recallK, int k, int batch, int totalbatches); + // Compute recall against truth file. + // + // Distributed (per-node) recall: when each node only owns a SUBSET of + // the global query set, pass the global query count and this node's + // query offset so the truth row indexing is computed in global terms. + // The truth file is laid out as: + // [iter=0 VIDs for queries 0..Q-1] [iter=1 VIDs ...] ... + // [iter=0 dists for queries 0..Q-1] [iter=1 dists ...] ... + // where Q is the GLOBAL query count, NOT res.size(). With the legacy + // res.size()-based formula, distributed batches > 0 read the wrong + // rows (off by Q-myCount), giving near-random recall that's noise. + // totalQueries=-1 (default) preserves the legacy single-node formula. + static float EvaluateRecall(const std::vector &res, std::shared_ptr &truth, int recallK, int k, int batch, int totalbatches, + int totalQueries = -1, int queryOffset = 0); void RunBatches(std::shared_ptr &vecset, std::shared_ptr &metaset, std::shared_ptr &addvecset, std::shared_ptr &addmetaset, diff --git a/Test/src/SPFreshTest.cpp b/Test/src/SPFreshTest.cpp index 95c1fc4d5..9ab420db9 100644 --- a/Test/src/SPFreshTest.cpp +++ b/Test/src/SPFreshTest.cpp @@ -5,6 +5,10 @@ #include "inc/Core/Common/DistanceUtils.h" #include "inc/Core/Common/QueryResultSet.h" #include "inc/Core/SPANN/Index.h" +#include "inc/Core/SPANN/Distributed/WorkerNode.h" +#include "inc/Core/SPANN/Distributed/DispatcherNode.h" +#include "inc/Core/SPANN/ExtraDynamicSearcher.h" +#include "inc/Core/SPANN/ExtraTiKVController.h" #include "inc/Core/SPANN/SPANNResultIterator.h" #include "inc/Core/VectorIndex.h" #include "inc/Core/Common/IQuantizer.h" @@ -17,10 +21,13 @@ #include "inc/Test.h" #include "inc/TestDataGenerator.h" +#include #include #include +#include #include #include +#include #include #include #include @@ -55,6 +62,181 @@ static __attribute__((constructor)) void install_segfault_handler() { using namespace SPTAG; +// --------------------------------------------------------------------------- +// Stride sharding (a.k.a. odd/even sharding) experiment +// --------------------------------------------------------------------------- +// When the env var SPFRESH_SHARD_STRIDE is set to "1"/"true", each node, instead +// of inserting a contiguous slice [n*B/N, (n+1)*B/N) of the per-iteration batch, +// inserts the strided rows {n, n+N, n+2*N, ...} where n=nodeIndex, N=numNodes. +// This breaks any spatial structure in the input dataset (e.g. SIFT files that +// are roughly sorted by visual feature), letting us check whether the layer-0 +// split skew (driver 71 vs worker 2 in v18) is caused by contiguous slicing +// landing similar vectors on the same node and overflowing a small set of heads. +// +// The total number of vectors inserted across all nodes per iteration is the +// same; only the assignment changes. Recall measurement still works because +// the dataset and ground truth are unchanged — only insert routing differs. +static bool IsStrideShardEnabled() { + const char* e = std::getenv("SPFRESH_SHARD_STRIDE"); + if (!e) return false; + std::string v(e); + return v == "1" || v == "true" || v == "TRUE" || v == "yes"; +} + +// Compute count of indices i in [0, total) with (i % stride) == offset. +static SizeType StrideCount(SizeType total, int stride, int offset) { + if (stride <= 1) return total; + if (offset < 0 || offset >= stride) return 0; + if (total <= offset) return 0; + return (total - 1 - offset) / stride + 1; +} + +// Build a strided sub-VectorSet by copying every `stride`-th vector starting +// at `offset` into a contiguous packed ByteArray. Returns a BasicVectorSet. +static std::shared_ptr ExtractStridedVectors( + const std::shared_ptr& full, int stride, int offset) +{ + if (!full) return nullptr; + SizeType totalCount = full->Count(); + SizeType outCount = StrideCount(totalCount, stride, offset); + auto vt = full->GetValueType(); + auto dim = full->Dimension(); + size_t perVecSize = full->PerVectorDataSize(); + if (outCount <= 0) { + return std::make_shared(ByteArray::Alloc(0), vt, dim, 0); + } + ByteArray buf = ByteArray::Alloc(static_cast(outCount) * perVecSize); + for (SizeType i = 0; i < outCount; ++i) { + SizeType srcIdx = static_cast(offset) + i * static_cast(stride); + std::memcpy(buf.Data() + static_cast(i) * perVecSize, + full->GetVector(srcIdx), + perVecSize); + } + return std::make_shared(buf, vt, dim, outCount); +} + +// Build a strided sub-MetadataSet. Two-pass: first compute offsets, then copy. +static std::shared_ptr ExtractStridedMetadata( + const std::shared_ptr& full, int stride, int offset) +{ + if (!full) return nullptr; + SizeType totalCount = full->Count(); + SizeType outCount = StrideCount(totalCount, stride, offset); + if (outCount <= 0) { + ByteArray emptyMeta = ByteArray::Alloc(0); + ByteArray offBuf = ByteArray::Alloc(sizeof(std::uint64_t)); + *reinterpret_cast(offBuf.Data()) = 0ULL; + return std::make_shared(emptyMeta, offBuf, 0); + } + std::vector offsets(static_cast(outCount) + 1, 0ULL); + std::uint64_t total = 0; + for (SizeType i = 0; i < outCount; ++i) { + SizeType srcIdx = static_cast(offset) + i * static_cast(stride); + ByteArray meta = full->GetMetadata(srcIdx); + offsets[i] = total; + total += meta.Length(); + } + offsets[outCount] = total; + ByteArray metaBuf = ByteArray::Alloc(total > 0 ? total : 1); + for (SizeType i = 0; i < outCount; ++i) { + SizeType srcIdx = static_cast(offset) + i * static_cast(stride); + ByteArray meta = full->GetMetadata(srcIdx); + if (meta.Length() > 0) { + std::memcpy(metaBuf.Data() + offsets[i], meta.Data(), meta.Length()); + } + } + ByteArray offBuf = ByteArray::Alloc((static_cast(outCount) + 1) * sizeof(std::uint64_t)); + std::memcpy(offBuf.Data(), offsets.data(), offsets.size() * sizeof(std::uint64_t)); + return std::make_shared(metaBuf, offBuf, outCount); +} + +// Helper: parse "host:port,host:port,..." into vector of pairs. +static std::vector> ParseNodeAddrs(const std::string& addrStr) { + std::vector> result; + auto parts = Helper::StrUtils::SplitString(addrStr, ","); + for (auto& part : parts) { + auto hp = Helper::StrUtils::SplitString(part, ":"); + if (hp.size() == 2) result.emplace_back(hp[0], hp[1]); + } + return result; +} + +// Helper: bind a WorkerNode to ALL ExtraDynamicSearcher layers inside a VectorIndex. +// Calls SetWorker() which wires up append, head-sync, and remote-lock callbacks. +// All layers must have the worker bound so that AddIDCapacity (called per-layer) sees +// the correct numNodes and grows each layer's TiKVVersionMap to cover the full global +// VID space (capa * numNodes), not just this node's slice. +template +static void BindWorkerToIndex(SPANN::WorkerNode* worker, std::shared_ptr& index) { + auto* spannIndex = dynamic_cast*>(index.get()); + if (!spannIndex) return; + for (int layer = 0; ; ++layer) { + auto diskIndex = spannIndex->GetDiskIndex(layer); + if (!diskIndex) break; + auto* searcher = dynamic_cast*>(diskIndex.get()); + if (searcher) searcher->SetWorker(worker); + } +} + +// Helper: same as BindWorkerToIndex but takes a raw SPANN::Index* directly +// (for sites that have already extracted the spannIndex pointer). +template +static void BindWorkerToAllLayers(SPANN::WorkerNode* worker, SPANN::Index* spannIndex) { + if (!spannIndex) return; + for (int layer = 0; ; ++layer) { + auto diskIndex = spannIndex->GetDiskIndex(layer); + if (!diskIndex) break; + auto* searcher = dynamic_cast*>(diskIndex.get()); + if (searcher) searcher->SetWorker(worker); + } +} + +// Configuration for distributed mode, read from [Distributed] ini section. +struct DistributedConfig { + bool enabled = false; + int workerIndex = 0; // 0-based: 0 = driver (dispatcher + worker 0), 1+ = remote worker + std::string dispatcherAddr; // "host:port" + std::string workerAddrs; // "host:port,host:port,..." + std::string storeAddrs; // "addr,addr,..." + std::string pdAddrs; // "host:port,host:port,..." (per-worker PD) + + // Number of workers (for query/insert partitioning) + int GetNumWorkers() const { + if (!enabled || workerAddrs.empty()) return 1; + return (int)std::count(workerAddrs.begin(), workerAddrs.end(), ',') + 1; + } + + // Parse dispatcher address into host:port pair + std::pair GetDispatcherAddr() const { + auto hp = Helper::StrUtils::SplitString(dispatcherAddr, ":"); + if (hp.size() == 2) return {hp[0], hp[1]}; + return {"", ""}; + } + + // Get PD address for this worker (falls back to global TiKVPDAddresses) + std::string GetLocalPDAddr() const { + if (pdAddrs.empty()) return ""; + auto addrs = Helper::StrUtils::SplitString(pdAddrs, ","); + if (workerIndex < (int)addrs.size()) return addrs[workerIndex]; + return addrs[0]; + } + + static DistributedConfig FromIni(Helper::IniReader& ini) { + DistributedConfig cfg; + cfg.enabled = ini.GetParameter("Distributed", "Enabled", false); + cfg.dispatcherAddr = ini.GetParameter("Distributed", "DispatcherAddr", std::string("")); + cfg.workerAddrs = ini.GetParameter("Distributed", "WorkerAddrs", std::string("")); + cfg.storeAddrs = ini.GetParameter("Distributed", "StoreAddrs", std::string("")); + cfg.pdAddrs = ini.GetParameter("Distributed", "PDAddrs", std::string("")); + + // Worker index from env var (0 = driver, 1+ = remote worker) + const char* wiEnv = std::getenv("WORKER_INDEX"); + cfg.workerIndex = wiEnv ? std::atoi(wiEnv) : 0; + + return cfg; + } +}; + namespace SPFreshTest { SizeType N = 10000; @@ -306,13 +488,17 @@ std::shared_ptr BuildIndex(const std::string &outDirectory, std::sh template std::shared_ptr BuildLargeIndex(const std::string &outDirectory, std::string &pvecset, - std::string& pmetaset, std::string& pmetaidx, Helper::IniReader& iniReader, const std::string &distMethod = "L2", + std::string& pmetaset, std::string& pmetaidx, const std::string &distMethod = "L2", int searchthread = 2, int insertthread = 2, int layers = 1, - std::shared_ptr quantizer = nullptr, std::string quantizerFilePath = "quantizer.bin") + std::shared_ptr quantizer = nullptr, std::string quantizerFilePath = "quantizer.bin", + const std::map& ssdOverrides = {}, + bool ssdOnly = false, + SPANN::WorkerNode* p_worker = nullptr) { auto vecIndex = VectorIndex::CreateInstance(IndexAlgoType::SPANN, GetEnumValueType()); int maxthreads = std::thread::hardware_concurrency(); int postingLimit = 4 * sizeof(T); + remove((outDirectory + FolderSep + "ssdmapping_0_postings").c_str()); std::string configuration = R"( [Base] DistCalcMethod=)" + distMethod + R"( @@ -399,15 +585,29 @@ std::shared_ptr BuildLargeIndex(const std::string &outDirectory, st } } - for (const auto &sec : sections) + // Apply overrides (e.g., Storage, TiKV settings, SelectHead/BuildHead params) + for (const auto &[key, val] : ssdOverrides) { - auto params = iniReader.GetParameters(sec.c_str()); - for (const auto &[key, val] : params) - { - vecIndex->SetParameter(key.c_str(), val.c_str(), sec.c_str()); + // Keys prefixed with "SectionName." are routed to the corresponding section + auto dotPos = key.find('.'); + if (dotPos != std::string::npos) { + std::string section = key.substr(0, dotPos); + std::string param = key.substr(dotPos + 1); + vecIndex->SetParameter(param.c_str(), val.c_str(), section.c_str()); + } else { + vecIndex->SetParameter(key.c_str(), val.c_str(), "BuildSSDIndex"); } } + // SSD-only mode: skip SelectHead and BuildHead, resume from specified layer + if (ssdOnly) + { + // Allow explicit ResumeLayer from config/overrides; otherwise default to layer 0 + // (rebuild SSD for all layers, reusing existing head indexes) + int resumeLayer = 0; + vecIndex->SetParameter("ResumeLayer", std::to_string(resumeLayer).c_str(), "BuildSSDIndex"); + } + if (quantizer) { vecIndex->SetParameter("QuantizerFilePath", quantizerFilePath.c_str(), "Base"); @@ -415,6 +615,20 @@ std::shared_ptr BuildLargeIndex(const std::string &outDirectory, st vecIndex->SetQuantizerADC(false); vecIndex->SetParameter("Dim", std::to_string(quantizer->GetNumSubvectors()).c_str(), "Base"); } + + // Bind a routing worker (if any) to the freshly-created SSD searcher + // before BuildIndex runs. Build itself does not route postings any more + // (shared TiKV cluster — driver writes directly), so in buildOnly mode + // the workerPtr will simply be nullptr and this block is a no-op. + if (p_worker) { + if (auto* spannIdx = dynamic_cast*>(vecIndex.get())) { + spannIdx->SetWorker(p_worker); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "BuildLargeIndex: bound routing worker (numNodes=%d)\n", + p_worker->GetNumNodes()); + } + } + auto buildStatus = vecIndex->BuildIndex(); if (buildStatus != ErrorCode::Success) return nullptr; @@ -452,9 +666,19 @@ float Search(std::shared_ptr &vecIndex, std::shared_ptr return TestUtils::TestDataGenerator::EvaluateRecall(results, truth, k, k, batch, totalbatches); } +template +double ExecutePartitionedSearch(VectorIndex* index, + std::shared_ptr& queryset, + int myStart, int myCount, + int searchK, int numThreads, + std::vector& results, + std::vector* latenciesOut, + std::vector* statsOut); + template void InsertVectors(SPANN::Index *p_index, int insertThreads, int step, - std::shared_ptr addset, std::shared_ptr &metaset, int searchThreads = 0, std::shared_ptr queryset = nullptr, int numQueries = 0, int k = 5, std::ostream* benchmarkData = nullptr, int start = 0) + std::shared_ptr addset, std::shared_ptr &metaset, int searchThreads = 0, std::shared_ptr queryset = nullptr, int numQueries = 0, int k = 5, std::ostream* benchmarkData = nullptr, int start = 0, + SPANN::WorkerNode* router = nullptr) { p_index->ForceCompaction(); p_index->GetDBStat(); @@ -462,8 +686,15 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step std::vector threads; int printstep = step / 50; + + // Bulk path: single AddIndex call amortizes remote-append RPCs into one AppendBatchAsync. + // Per-vector RNGSelection is parallelized inside ExtraDynamicSearcher::AddIndex so we + // keep insertThreads-way parallelism while saving N-1 RPCs. + bool useBulk = (router && router->GetNumNodes() > 1); + + // Per-vector insert (original path): each thread grabs one vector at a time std::atomic_size_t vectorsSent(start); - auto func = [&]() { + auto perVecFunc = [&]() { size_t index = start; while (true) { @@ -500,43 +731,48 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step } }; - if (searchThreads > 0 && queryset != nullptr && numQueries != 0 && benchmarkData != nullptr) { - std::vector latencies(numQueries); - std::vector results(numQueries); - std::vector duration(searchThreads); - - for (int i = 0; i < numQueries; i++) + // Bulk insert (router path): single call, parallelism inside SPANNIndex::AddIndex + auto bulkFunc = [&]() { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "InsertVectors: bulk AddIndex for %d vectors (router enabled)\n", step); + ErrorCode ret = p_index->AddIndex(addset->GetVector((SizeType)start), step, addset->Dimension(), metaset, true); + if (ret != ErrorCode::Success) { - results[i] = QueryResult((const ValueType *)queryset->GetVector(i), k, false); + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "AddIndex bulk failed. start:%d count:%d Dim:%d Error:%d\n", + start, step, addset->Dimension(), static_cast(ret)); } + BOOST_REQUIRE(ret == ErrorCode::Success); + }; - std::atomic_size_t queriesSent(0); - auto search = [&](int tid) { - auto s1 = std::chrono::high_resolution_clock::now(); - size_t qid; - while ((qid = queriesSent.fetch_add(1)) < numQueries) - { - auto t1 = std::chrono::high_resolution_clock::now(); - p_index->SearchIndex(results[qid]); - auto t2 = std::chrono::high_resolution_clock::now(); - latencies[qid] = std::chrono::duration_cast(t2 - t1).count() / 1000.0f; - } - auto s2 = std::chrono::high_resolution_clock::now(); - duration[tid] = std::chrono::duration_cast(s2 - s1).count() / 1000.0f; - }; + std::function func; + int insertThreadCount; + if (useBulk) { + func = bulkFunc; + insertThreadCount = 1; + } else { + func = perVecFunc; + insertThreadCount = insertThreads; + } + + if (searchThreads > 0 && queryset != nullptr && numQueries != 0 && benchmarkData != nullptr) { + std::vector latencies; + std::vector results; + double searchWallSeconds = 0.0; - for (int j = 0; j < insertThreads; j++) + for (int j = 0; j < insertThreadCount; j++) { threads.emplace_back(func); } - for (int j = 0; j < searchThreads; j++) - { - threads.emplace_back(search, j); - } + std::thread searchThread([&]() { + searchWallSeconds = ExecutePartitionedSearch( + p_index, queryset, /*myStart=*/0, numQueries, k, searchThreads, + results, &latencies, /*statsOut=*/nullptr); + }); for (auto &thread : threads) { thread.join(); } + searchThread.join(); // Calculate statistics float mean = 0, minLat = (std::numeric_limits::max)(), maxLat = 0; @@ -553,10 +789,7 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step float p90 = latencies[static_cast(numQueries * 0.90)]; float p95 = latencies[static_cast(numQueries * 0.95)]; float p99 = latencies[static_cast(numQueries * 0.99)]; - float maxBatchLatency = 1e-6; - for (int i = 0; i < searchThreads; i++) - if (maxBatchLatency < duration[i]) maxBatchLatency = duration[i]; - float qps = numQueries / maxBatchLatency; + float qps = numQueries / std::max(static_cast(searchWallSeconds), 1e-6f); *benchmarkData << " \"numQueries\": " << numQueries << ",\n"; *benchmarkData << " \"meanLatency\": " << mean << ",\n"; @@ -567,6 +800,17 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step *benchmarkData << " \"minLatency\": " << minLat << ",\n"; *benchmarkData << " \"maxLatency\": " << maxLat << ",\n"; *benchmarkData << " \"qps\": " << qps << ",\n"; + } else { + // No search-during-insert path: just run the insert threads. + // (Used by worker dispatch and any caller that doesn't need stats.) + for (int j = 0; j < insertThreadCount; j++) + { + threads.emplace_back(func); + } + for (auto &thread : threads) + { + thread.join(); + } } auto barrierStart = std::chrono::high_resolution_clock::now(); size_t barrierPolls = 0; @@ -587,72 +831,82 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step } + + + template void BenchmarkQueryPerformance(std::shared_ptr &index, std::shared_ptr &queryset, std::shared_ptr &truth, const std::string &truthPath, SizeType baseVectorCount, int topK, int searchK, int numThreads, int numQueries, int batches, int totalbatches, - std::ostream &benchmarkData, std::string prefix = "") + std::ostream &benchmarkData, std::string prefix = "", + int nodeIndex = 0, SPANN::WorkerNode* router = nullptr, + SPANN::DispatcherNode* dispatcher = nullptr) { - // Benchmark: Query performance with detailed latency stats - std::vector latencies(numQueries); - std::atomic_size_t queriesSent(0); - std::vector results(numQueries); - std::vector searchStats(numQueries); - auto* spannIndex = dynamic_cast*>(index.get()); - - for (int i = 0; i < numQueries; i++) - { - results[i] = QueryResult((const T *)queryset->GetVector(i), searchK, false); + // Use hash ring node count (workers only) for partitioning, not GetNumNodes() (includes dispatcher) + auto ring = (router && router->IsEnabled()) ? router->GetHashRing() : nullptr; + int nodeCount = ring ? static_cast(ring->NodeCount()) : 1; + bool distributed = (dispatcher != nullptr && router != nullptr && router->IsEnabled() && nodeCount > 1); + + // Determine this node's query range (balanced contiguous partition) + int myStart = 0, myCount = numQueries; + if (distributed) { + myStart = (int)((long long)nodeIndex * numQueries / nodeCount); + int myEnd = (int)((long long)(nodeIndex + 1) * numQueries / nodeCount); + myCount = myEnd - myStart; } - std::vector threads; - threads.reserve(numThreads); - - auto batchStart = std::chrono::high_resolution_clock::now(); - - for (int i = 0; i < numThreads; i++) - { - threads.emplace_back([&]() { - size_t qid; - while ((qid = queriesSent.fetch_add(1)) < numQueries) - { - auto t1 = std::chrono::high_resolution_clock::now(); - if (spannIndex != nullptr) - { - spannIndex->SearchIndex(results[qid], &searchStats[qid]); - } - else - { - index->SearchIndex(results[qid]); - } - auto t2 = std::chrono::high_resolution_clock::now(); - latencies[qid] = std::chrono::duration_cast(t2 - t1).count() / 1000.0f; - } - }); + // Dispatch search command to all workers via TCP (distributed only) + std::int64_t dispatchId = -1; + int round = 0; + if (distributed) { + static std::atomic s_searchRound{0}; + round = s_searchRound.fetch_add(1); + dispatchId = dispatcher->BroadcastDispatchCommand( + SPANN::DispatchCommand::Type::Search, static_cast(round)); } - for (auto &thread : threads) - thread.join(); + // Run this node's share of queries. + std::vector results; + std::vector latencies; + std::vector searchStats; + double localWallTime = ExecutePartitionedSearch( + index.get(), queryset, myStart, myCount, searchK, numThreads, + results, &latencies, &searchStats); + float batchLatency = static_cast(localWallTime); + auto* spannIndex = dynamic_cast*>(index.get()); - auto batchEnd = std::chrono::high_resolution_clock::now(); - float batchLatency = - std::chrono::duration_cast(batchEnd - batchStart).count() / 1000000.0f; + if (distributed) { + // Driver also runs searches against its local node, so it can have + // outgoing merge hints queued. Drain before we move on. + if (router) { + router->FlushRemoteMerges(); + } + // Collect worker timings via TCP; QPS is governed by the slowest node. + auto workerTimes = dispatcher->WaitForAllResults(dispatchId, 300); + for (double wt : workerTimes) { + batchLatency = std::max(batchLatency, static_cast(wt)); + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "BenchmarkQueryPerformance round %d: local=%.1fms (%d queries), max=%.1fms, QPS=%.1f\n", + round, localWallTime * 1000, myCount, batchLatency * 1000, numQueries / batchLatency); + } - // Calculate statistics + // Calculate statistics (from this node's queries) + int statsCount = myCount; float mean = 0, minLat = (std::numeric_limits::max)(), maxLat = 0; - for (int i = 0; i < numQueries; i++) + for (int i = 0; i < statsCount; i++) { mean += latencies[i]; minLat = (std::min)(minLat, latencies[i]); maxLat = (std::max)(maxLat, latencies[i]); } - mean /= numQueries; + mean /= statsCount; std::sort(latencies.begin(), latencies.end()); - float p50 = latencies[static_cast(numQueries * 0.50)]; - float p90 = latencies[static_cast(numQueries * 0.90)]; - float p95 = latencies[static_cast(numQueries * 0.95)]; - float p99 = latencies[static_cast(numQueries * 0.99)]; + float p50 = latencies[static_cast(statsCount * 0.50)]; + float p90 = latencies[static_cast(statsCount * 0.90)]; + float p95 = latencies[static_cast(statsCount * 0.95)]; + float p99 = latencies[static_cast(statsCount * 0.99)]; float qps = numQueries / batchLatency; BOOST_TEST_MESSAGE(" Queries: " << numQueries); @@ -749,7 +1003,7 @@ void BenchmarkQueryPerformance(std::shared_ptr &index, std::shared_ benchmarkData << prefix << " },\n"; } - // Recall evaluation (if truth file provided) + // Recall evaluation if (!truth || truthPath.empty() || truthPath == "none") { BOOST_TEST_MESSAGE(" Recall evaluation skipped (no truth data)"); @@ -760,7 +1014,13 @@ void BenchmarkQueryPerformance(std::shared_ptr &index, std::shared_ BOOST_TEST_MESSAGE("Checking for truth file: " << truthPath); std::shared_ptr pvecset, paddvecset; - float avgRecall = TestUtils::TestDataGenerator::EvaluateRecall(results, truth, topK, searchK, batches, totalbatches); + // In distributed mode, this node only searched queries [myStart, myStart+myCount). + // Pass the global query count and this node's offset so EvaluateRecall indexes + // the truth file in global terms (BATCH > 0 reads the wrong truth rows otherwise). + int recallTotalQueries = distributed ? numQueries : -1; + int recallQueryOffset = distributed ? myStart : 0; + float avgRecall = TestUtils::TestDataGenerator::EvaluateRecall(results, truth, topK, searchK, batches, totalbatches, + recallTotalQueries, recallQueryOffset); BOOST_TEST_MESSAGE(" Recall" << topK << "@" << searchK << " = " << (avgRecall * 100.0f) << "%"); BOOST_TEST_MESSAGE(" (Evaluated on " << numQueries << " queries against base vectors)"); benchmarkData << std::fixed << std::setprecision(4); @@ -772,6 +1032,115 @@ void BenchmarkQueryPerformance(std::shared_ptr &index, std::shared_ benchmarkData << prefix << " }"; } +// Run [myStart, myStart+myCount) queries against `index` using `numThreads` workers. +// Returns wall time in seconds. Fills `results` and (when non-null) per-query +// `latenciesOut` (ms) and `statsOut` (SPANN SearchStats). When `statsOut` is +// non-null and the index is a SPANN index, the stats overload of SearchIndex +// is used; otherwise the plain SearchIndex path runs. +template +double ExecutePartitionedSearch(VectorIndex* index, + std::shared_ptr& queryset, + int myStart, int myCount, + int searchK, int numThreads, + std::vector& results, + std::vector* latenciesOut, + std::vector* statsOut) +{ + auto* spannIndex = dynamic_cast*>(index); + bool useStats = (statsOut != nullptr && spannIndex != nullptr); + + results.resize(myCount); + for (int i = 0; i < myCount; i++) { + results[i] = QueryResult((const T*)queryset->GetVector(myStart + i), searchK, false); + } + if (useStats) statsOut->assign(myCount, SPANN::SearchStats()); + if (latenciesOut) latenciesOut->assign(myCount, 0.0f); + + std::atomic_size_t queriesSent(0); + int nThreads = std::min(numThreads, std::max(myCount, 1)); + std::vector threads; + threads.reserve(nThreads); + + auto t0 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < nThreads; i++) { + threads.emplace_back([&]() { + size_t qid; + while ((qid = queriesSent.fetch_add(1)) < static_cast(myCount)) { + auto t1 = std::chrono::high_resolution_clock::now(); + if (useStats) { + spannIndex->SearchIndex(results[qid], &(*statsOut)[qid]); + } else if (spannIndex != nullptr) { + spannIndex->SearchIndex(results[qid]); + } else { + index->SearchIndex(results[qid]); + } + auto t2 = std::chrono::high_resolution_clock::now(); + if (latenciesOut) { + (*latenciesOut)[qid] = + std::chrono::duration_cast(t2 - t1).count() / 1000.0f; + } + } + }); + } + for (auto& t : threads) t.join(); + auto t3 = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(t3 - t0).count() / 1000000.0; +} + +ErrorCode QuantizeVectors(const std::shared_ptr& quantizer, + const std::shared_ptr& source, + ByteArray& dest); + +template +void LoadAndInsertBatch(SPANN::Index* spannIndex, + const std::string& paddset, + const std::string& paddmeta, + const std::string& paddmetaidx, + int dimension, + int insertStart, int loadCount, int perNodeBatch, + bool strideShard, int numNodes, int nodeIndex, + int numInsertThreads, + SPANN::WorkerNode* router, + std::shared_ptr quantizer, + int searchDuringInsertThreads, + std::shared_ptr queryset, + int numQueries, int searchK, + std::ostream* benchmarkData, + const char* logPrefix) +{ + auto addset = TestUtils::TestDataGenerator::LoadVectorSet(paddset, dimension, insertStart, loadCount); + if (quantizer) { + auto addFloat = ConvertToFloatVectorSet(addset); + BOOST_REQUIRE(addFloat != nullptr); + ByteArray quantizedAddBytes = + ByteArray::Alloc((size_t)addFloat->Count() * (size_t)(quantizer->GetNumSubvectors())); + BOOST_REQUIRE(QuantizeVectors(quantizer, addFloat, quantizedAddBytes) == ErrorCode::Success); + addset = std::make_shared(quantizedAddBytes, + VectorValueType::UInt8, + quantizer->GetNumSubvectors(), + addFloat->Count()); + } + auto addmetaset = TestUtils::TestDataGenerator::LoadMetadataSet(paddmeta, paddmetaidx, insertStart, loadCount); + if (strideShard) { + addset = ExtractStridedVectors(addset, numNodes, nodeIndex); + addmetaset = ExtractStridedMetadata(addmetaset, numNodes, nodeIndex); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "%s: stride-shard batchStart=%d loadCount=%d -> kept=%d (every %d-th, offset=%d)\n", + logPrefix, insertStart, loadCount, + (int)(addset ? addset->Count() : 0), numNodes, nodeIndex); + } + InsertVectors(spannIndex, numInsertThreads, perNodeBatch, + addset, addmetaset, + searchDuringInsertThreads, queryset, numQueries, searchK, + benchmarkData, 0, router); + if (router) { + router->FlushRemoteAppends(); + router->FlushRemoteMerges(); + router->LogRouteStats(" (batch flush)"); + router->ResetRouteStats(); + } +} + template void LogCheckpointLayerStats(const std::shared_ptr& index, int layers, int currentBatch, int totalBatches) { @@ -836,9 +1205,13 @@ ErrorCode QuantizeVectors(const std::shared_ptr& quantizer, template void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, const std::string &truthPath, DistCalcMethod distMethod, const std::string &indexPath, int dimension, int baseVectorCount, - int insertVectorCount, int deleteVectorCount, int batches, int topK, int numSearchThreads, int numInsertThreads, int numSearchDuringInsertThreads, int numQueries, Helper::IniReader& iniReader, + int insertVectorCount, int deleteVectorCount, int batches, int topK, int numSearchThreads, int numInsertThreads, int numSearchDuringInsertThreads, int numQueries, const std::string &outputFile = "output.json", const bool rebuild = true, const int resume = -1, - const std::string &quantizerFilePath = std::string(""), int quantizedDim = 0, int layers = 1) + const std::string &quantizerFilePath = std::string(""), int quantizedDim = 0, int layers = 1, + const std::map& ssdOverrides = {}, + bool rebuildSsdOnly = false, + bool buildOnly = false, + const DistributedConfig& distCfg = {}) { int oldM = M, oldK = K, oldN = N, oldQueries = queries; N = baseVectorCount; @@ -849,6 +1222,27 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c int insertBatchSize = insertVectorCount / max(batches, 1); int deleteBatchSize = deleteVectorCount / max(batches, 1); + // Use distributed config for multi-node partitioning + int nodeIndex = distCfg.workerIndex; + int numNodes = distCfg.GetNumWorkers(); + bool strideShard = IsStrideShardEnabled() && numNodes > 1; + int myInsertStart, myInsertEnd, perNodeBatch; + if (strideShard) { + // Stride mode: each node loads the FULL per-iter batch then keeps rows + // where (rowIdx % numNodes) == nodeIndex. myInsertStart/End span the + // full batch; perNodeBatch is the count of strided rows. + myInsertStart = 0; + myInsertEnd = insertBatchSize; + perNodeBatch = static_cast(StrideCount(insertBatchSize, numNodes, nodeIndex)); + } else { + myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0; + myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize; + perNodeBatch = myInsertEnd - myInsertStart; + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "RunBenchmark: nodeIndex=%d numNodes=%d insertBatchSize=%d myInsertStart=%d myInsertEnd=%d perNodeBatch=%d strideShard=%d\n", + nodeIndex, numNodes, insertBatchSize, myInsertStart, myInsertEnd, perNodeBatch, strideShard ? 1 : 0); + // Variables to collect JSON output data std::ostringstream tmpbenchmark; @@ -902,12 +1296,78 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c jsonFile << " \"results\": {\n"; int SearchK = enableQuantization? topK * 4 : topK; + // Distributed routing: dispatcher + local worker (driver node is both) + std::unique_ptr dispatcher; + std::unique_ptr worker; + SPANN::WorkerNode* workerPtr = nullptr; // convenience alias std::shared_ptr index; std::shared_ptr quantizer; - + + // Distributed setup: when running a non-buildOnly distributed benchmark + // (i.e. the search/insert run phase), create the dispatcher + worker0 + // so the driver can broadcast the hash ring and accept remote callbacks. + // BuildOnly mode skips this entirely — build runs single-node and writes + // straight to the shared TiKV cluster (PD routes each key to the owning + // store), so no dispatcher / worker plumbing is needed for the build + // path. + if (distCfg.enabled && !buildOnly) { + auto dispAddr = distCfg.GetDispatcherAddr(); + auto workerAddrs = ParseNodeAddrs(distCfg.workerAddrs); + auto storeAddrs = Helper::StrUtils::SplitString(distCfg.storeAddrs, ","); + + dispatcher.reset(new SPANN::DispatcherNode()); + BOOST_REQUIRE_MESSAGE(dispatcher->Initialize(dispAddr, workerAddrs), + "DispatcherNode initialization failed (build-phase setup)"); + BOOST_REQUIRE(dispatcher->Start()); + + worker.reset(new SPANN::WorkerNode()); + // Pre-build: pass nullptr DB. After BuildIndex, swap in the real DB + // via SetDB() (or rebuild the worker on top of it for run mode). + BOOST_REQUIRE_MESSAGE( + worker->Initialize(nullptr, 0, dispAddr, workerAddrs, storeAddrs), + "WorkerNode initialization failed (build-phase setup)"); + BOOST_REQUIRE(worker->Start()); + workerPtr = worker.get(); + + dispatcher->SetLocalWorkerIndex(worker->GetLocalNodeIndex()); + worker->SetHashRing(dispatcher->GetHashRing()); + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "Pre-build: waiting for all peer connections...\n"); + BOOST_REQUIRE_MESSAGE(dispatcher->WaitForAllPeersConnected(180), + "Timed out waiting for peer connections (build-phase)"); + + auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(180); + while (std::chrono::steady_clock::now() < deadline) { + if (dispatcher->AllWorkersAcked()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + } + BOOST_REQUIRE_MESSAGE(dispatcher->AllWorkersAcked(), + "Timed out waiting for workers to ACK ring (build-phase)"); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "Pre-build: all %d workers connected and ring synchronized\n", numNodes); + + // Start heartbeat pump so remote workers can detect driver failure + // and exit cleanly instead of relying on a fixed wall-clock receiver + // timeout. Worker side enforces HeartbeatTimeoutSec (default 180s). + // Interval is fixed at 30s; six missed pings before worker bails. + dispatcher->StartHeartbeat(30); + } + // Build initial index BOOST_TEST_MESSAGE("\n=== Building Index ==="); - if (rebuild || !direxists(indexPath.c_str())) { + if (rebuild || rebuildSsdOnly || !direxists(indexPath.c_str())) { + if (!rebuildSsdOnly) { + // Allow empty or non-existent directories; block only if index files already exist + if (direxists(indexPath.c_str()) && fileexists((indexPath + FolderSep + "indexloader.ini").c_str())) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "Index directory '%s' already exists with index files. Refusing to delete. " + "Remove it manually or use RebuildSSDOnly=true to resume.\n", + indexPath.c_str()); + BOOST_FAIL("Index directory already exists: " + indexPath); + return; + } + } auto buildstart = std::chrono::high_resolution_clock::now(); if (enableQuantization) @@ -932,13 +1392,13 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c quantizedBase->Save(pquanvecset); } - index = BuildLargeIndex(indexPath, pquanvecset, pmeta, pmetaidx, iniReader, dist, numSearchThreads, numInsertThreads, layers, quantizer, "quantizer.bin"); + index = BuildLargeIndex(indexPath, pquanvecset, pmeta, pmetaidx, dist, numSearchThreads, numInsertThreads, layers, quantizer, "quantizer.bin", ssdOverrides, rebuildSsdOnly, workerPtr); BOOST_REQUIRE(index != nullptr); index->SetQuantizerADC(true); } else { - index = BuildLargeIndex(indexPath, pvecset, pmeta, pmetaidx, iniReader, dist, numSearchThreads, numInsertThreads, layers); + index = BuildLargeIndex(indexPath, pvecset, pmeta, pmetaidx, dist, numSearchThreads, numInsertThreads, layers, nullptr, "quantizer.bin", ssdOverrides, rebuildSsdOnly, workerPtr); BOOST_REQUIRE(index != nullptr); } @@ -954,6 +1414,23 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c BOOST_REQUIRE(index != nullptr); } + // Set up distributed routing for RUN mode if configured. + // (Build-phase needs no dispatcher/worker; the run-phase dispatcher+worker + // were created in the pre-build block above.) The driver node is both + // dispatcher (ring management) and worker 0 (compute). + if (distCfg.enabled && !buildOnly) { + // Bind worker to ALL searcher layers (wires append + headsync + lock + fetch callbacks). + // Every layer must see the worker so AddIDCapacity grows each layer's + // version map by capa * numNodes (not just capa). + auto* spannIndex = dynamic_cast*>(index.get()); + BOOST_REQUIRE(spannIndex != nullptr); + BindWorkerToAllLayers(workerPtr, spannIndex); + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "Run mode: worker bound to all %d layers\n", + (int)spannIndex->GetOptions()->m_layers); + } + auto queryset = TestUtils::TestDataGenerator::LoadVectorSet(pqueryset, M); BOOST_REQUIRE(queryset != nullptr); @@ -973,32 +1450,50 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c truth = TestUtils::TestDataGenerator::LoadVectorSet(ptruth, K); } - // Benchmark 0: Query performance before insertions (round 1 — cold cache) - BOOST_TEST_MESSAGE("\n=== Benchmark 0: Query Before Insertions (Round 1) ==="); - BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, - numSearchThreads, numQueries, 0, batches, tmpbenchmark); - jsonFile << " \"benchmark0_query_before_insert\": "; - BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, - numSearchThreads, numQueries, 0, batches, jsonFile); - jsonFile << ",\n"; - jsonFile.flush(); - - // Benchmark 0b: Query performance before insertions (round 2 — warm cache) - BOOST_TEST_MESSAGE("\n=== Benchmark 0b: Query Before Insertions (Round 2) ==="); - BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, - numSearchThreads, numQueries, 0, batches, tmpbenchmark); - jsonFile << " \"benchmark0b_query_before_insert_round2\": "; - BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, - numSearchThreads, numQueries, 0, batches, jsonFile); - jsonFile << ",\n"; - jsonFile.flush(); + // Benchmark 0/0b: query performance before insertions. Skip in BuildOnly + // mode (no point measuring queries when we're about to exit; queries also + // require workers to be running for distributed scatter-gather). + if (!buildOnly) { + // Benchmark 0: Query performance before insertions (round 1 — cold cache) + BOOST_TEST_MESSAGE("\n=== Benchmark 0: Query Before Insertions (Round 1) ==="); + BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, + numSearchThreads, numQueries, 0, batches, tmpbenchmark, "", + nodeIndex, workerPtr, dispatcher.get()); + jsonFile << " \"benchmark0_query_before_insert\": "; + BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, + numSearchThreads, numQueries, 0, batches, jsonFile, "", + nodeIndex, workerPtr, dispatcher.get()); + jsonFile << ",\n"; + jsonFile.flush(); + + // Benchmark 0b: Query performance before insertions (round 2 — warm cache) + BOOST_TEST_MESSAGE("\n=== Benchmark 0b: Query Before Insertions (Round 2) ==="); + BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, + numSearchThreads, numQueries, 0, batches, tmpbenchmark, "", + nodeIndex, workerPtr, dispatcher.get()); + jsonFile << " \"benchmark0b_query_before_insert_round2\": "; + BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, + numSearchThreads, numQueries, 0, batches, jsonFile, "", + nodeIndex, workerPtr, dispatcher.get()); + jsonFile << ",\n"; + jsonFile.flush(); + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "BuildOnly=true: skipping Benchmark 0/0b query rounds\n"); + jsonFile << " \"benchmark0_query_before_insert\": {},\n"; + jsonFile << " \"benchmark0b_query_before_insert_round2\": {},\n"; + jsonFile.flush(); + } BOOST_REQUIRE(index->SaveIndex(indexPath) == ErrorCode::Success); index = nullptr; // Benchmark 1: Insert performance - if (insertBatchSize > 0) + if (buildOnly) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "BuildOnly=true: skipping insert batches, index saved to %s\n", indexPath.c_str()); + jsonFile << " \"benchmark1_insert\": {}\n"; + } + else if (insertBatchSize > 0) { BOOST_TEST_MESSAGE("\n=== Benchmark 1: Insert Performance ==="); { @@ -1076,31 +1571,53 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Cloned index from %s to %s, check:%d, time: %f seconds\n", prevPath.c_str(), clonePath.c_str(), (int)(cloneret == ErrorCode::Success), seconds); - int insertStart = iter * insertBatchSize; + // Re-bind the worker to ALL layers of the new cloned index's searchers + // (every layer must see the worker so AddIDCapacity grows each layer's + // version map by capa * numNodes). + if (workerPtr) { + BindWorkerToIndex(workerPtr, cloneIndex); + } + + // Dispatch insert command to workers via TCP + std::uint64_t insertDispatchId = 0; + if (dispatcher && numNodes > 1) { + insertDispatchId = dispatcher->BroadcastDispatchCommand( + SPANN::DispatchCommand::Type::Insert, static_cast(iter)); + } + + // Each node inserts its partition. Default mode: contiguous slice + // [iter*batchSize + myInsertStart, +perNodeBatch). Stride mode: + // every numNodes-th row of the full batch starting at nodeIndex + // (loads full batch then filters down to perNodeBatch rows). + int insertStart = iter * insertBatchSize + myInsertStart; + int loadCount = strideShard ? insertBatchSize : perNodeBatch; { - std::shared_ptr addset = TestUtils::TestDataGenerator::LoadVectorSet(paddset, M, insertStart, insertBatchSize); - ByteArray quantizedAddBytes; - if (enableQuantization) { - auto addFloat = ConvertToFloatVectorSet(addset); - BOOST_REQUIRE(addFloat != nullptr); - quantizedAddBytes = ByteArray::Alloc((size_t)addFloat->Count() * (size_t)(quantizer->GetNumSubvectors())); - BOOST_REQUIRE(QuantizeVectors(quantizer, addFloat, quantizedAddBytes) == ErrorCode::Success); - addset = std::make_shared(quantizedAddBytes, - VectorValueType::UInt8, - quantizer->GetNumSubvectors(), - addFloat->Count()); - } - std::shared_ptr addmetaset = TestUtils::TestDataGenerator::LoadMetadataSet(paddmeta, paddmetaidx, insertStart, insertBatchSize); + std::string driverTag = "RunBenchmark iter=" + std::to_string(iter); start = std::chrono::high_resolution_clock::now(); - InsertVectors(static_cast *>(cloneIndex.get()), numInsertThreads, insertBatchSize, - addset, addmetaset, numSearchDuringInsertThreads, queryset, numQueries, SearchK, &jsonFile, 0); - end = std::chrono::high_resolution_clock::now(); + LoadAndInsertBatch(static_cast*>(cloneIndex.get()), + paddset, paddmeta, paddmetaidx, M, + insertStart, loadCount, perNodeBatch, + strideShard, numNodes, nodeIndex, + numInsertThreads, workerPtr, + enableQuantization ? quantizer : nullptr, + numSearchDuringInsertThreads, queryset, + numQueries, SearchK, &jsonFile, + driverTag.c_str()); } + + // Wait for all worker nodes to finish this batch via TCP. + if (insertDispatchId > 0) { + auto workerTimes = dispatcher->WaitForAllResults(insertDispatchId, 7200); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Driver: all %d workers finished batch %d\n", + (int)workerTimes.size(), iter + 1); + } + + end = std::chrono::high_resolution_clock::now(); seconds = std::chrono::duration_cast(end - start).count() / 1000000.0f; double throughput = insertBatchSize / seconds; - BOOST_TEST_MESSAGE(" Inserted: " << insertBatchSize << " vectors"); + BOOST_TEST_MESSAGE(" Inserted: " << insertBatchSize << " vectors (" << perNodeBatch << " local)"); BOOST_TEST_MESSAGE(" Time: " << seconds << " seconds"); BOOST_TEST_MESSAGE(" Throughput: " << throughput << " vectors/sec"); @@ -1164,17 +1681,21 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c BOOST_TEST_MESSAGE("\n=== Benchmark 2: Query After Insertions and Deletions ==="); jsonFile << " \"search\":"; BenchmarkQueryPerformance(cloneIndex, queryset, truth, truthPath, baseVectorCount, topK, SearchK, numSearchThreads, - numQueries, iter + 1, batches, tmpbenchmark, " "); + numQueries, iter + 1, batches, tmpbenchmark, " ", + nodeIndex, workerPtr, dispatcher.get()); BenchmarkQueryPerformance(cloneIndex, queryset, truth, truthPath, baseVectorCount, - topK, SearchK, numSearchThreads, numQueries, iter + 1, batches, jsonFile, " "); + topK, SearchK, numSearchThreads, numQueries, iter + 1, batches, jsonFile, " ", + nodeIndex, workerPtr, dispatcher.get()); jsonFile << ",\n"; BOOST_TEST_MESSAGE("\n=== Benchmark 2b: Query After Insertions and Deletions (Round 2) ==="); jsonFile << " \"search_round2\":"; BenchmarkQueryPerformance(cloneIndex, queryset, truth, truthPath, baseVectorCount, topK, SearchK, numSearchThreads, - numQueries, iter + 1, batches, tmpbenchmark, " "); + numQueries, iter + 1, batches, tmpbenchmark, " ", + nodeIndex, workerPtr, dispatcher.get()); BenchmarkQueryPerformance(cloneIndex, queryset, truth, truthPath, baseVectorCount, - topK, SearchK, numSearchThreads, numQueries, iter + 1, batches, jsonFile, " "); + topK, SearchK, numSearchThreads, numQueries, iter + 1, batches, jsonFile, " ", + nodeIndex, workerPtr, dispatcher.get()); jsonFile << ",\n"; start = std::chrono::high_resolution_clock::now(); @@ -1223,6 +1744,18 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c jsonFile << "}\n"; jsonFile.close(); + // Stop workers in distributed mode + if (dispatcher && numNodes > 1) { + // Stop the heartbeat pump first so we don't race a stray Heartbeat + // packet against the Stop dispatch on the same connection. + dispatcher->StopHeartbeat(); + auto dispatchId = dispatcher->BroadcastDispatchCommand(SPANN::DispatchCommand::Type::Stop, 0); + // Wait briefly for ACKs so workers exit cleanly before the driver + // tears down the network (which would force-kill in-flight RPCs). + dispatcher->WaitForAllResults(dispatchId, 60); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Driver: sent Stop command to all workers\n"); + } + M = oldM; K = oldK; N = oldN; @@ -2198,6 +2731,14 @@ BOOST_AUTO_TEST_CASE(IterativeSearchPerf) std::filesystem::remove_all("original_index"); } +// Forward declaration +template +void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount, + int insertVectorCount, int batches, int topK, int numSearchThreads, + int numInsertThreads, int numQueries, VectorValueType valueType, + const std::map& ssdOverrides, + const DistributedConfig& distCfg, int workerTimeout); + BOOST_AUTO_TEST_CASE(BenchmarkFromConfig) { using namespace SPFreshTest; @@ -2245,14 +2786,59 @@ BOOST_AUTO_TEST_CASE(BenchmarkFromConfig) int topK = iniReader.GetParameter("Benchmark", "TopK", 10); int numSearchThreads = iniReader.GetParameter("Benchmark", "NumSearchThreads", 8); int numInsertThreads = iniReader.GetParameter("Benchmark", "NumInsertThreads", 8); - int appendThreadNum = iniReader.GetParameter("Benchmark", "AppendThreadNum", 0); int numSearchDuringInsertThreads = iniReader.GetParameter("Benchmark", "NumSearchDuringInsertThreads", 1); + int appendThreadNum = iniReader.GetParameter("Benchmark", "AppendThreadNum", 0); int numQueries = iniReader.GetParameter("Benchmark", "NumQueries", 1000); int layers = iniReader.GetParameter("Benchmark", "Layers", 1); DistCalcMethod distMethod = iniReader.GetParameter("Benchmark", "DistMethod", DistCalcMethod::L2); - bool rebuild = (iniReader.GetParameter("Benchmark", "Rebuild", true) || iniReader.GetParameter("Benchmark", "RebuildSSDOnly", false)); + bool rebuild = iniReader.GetParameter("Benchmark", "Rebuild", true); + bool rebuildSsdOnly = iniReader.GetParameter("Benchmark", "RebuildSSDOnly", false); + bool buildOnly = iniReader.GetParameter("Benchmark", "BuildOnly", false); int resume = iniReader.GetParameter("Benchmark", "Resume", -1); + // Read storage backend overrides for BuildSSDIndex + std::map ssdOverrides; + std::string storage = iniReader.GetParameter("Benchmark", "Storage", std::string("")); + if (!storage.empty()) { + ssdOverrides["Storage"] = storage; + } + std::string tikvKeyPrefix = iniReader.GetParameter("Benchmark", "TiKVKeyPrefix", std::string("")); + if (!tikvKeyPrefix.empty()) { + ssdOverrides["TiKVKeyPrefix"] = tikvKeyPrefix; + } + if (appendThreadNum > 0) { + ssdOverrides["AppendThreadNum"] = std::to_string(appendThreadNum); + } + + // Pass through any [BuildSSDIndex] section params from the ini as overrides + auto buildSSDParams = iniReader.GetParameters("BuildSSDIndex"); + for (const auto &[key, val] : buildSSDParams) { + ssdOverrides[key] = val; + } + + // Read distributed config from [Distributed] section + auto distCfg = DistributedConfig::FromIni(iniReader); + + // Shared TiKV raft cluster: every compute node connects to the FULL PD + // endpoint list. The TiKV client uses PD-raft to route reads/writes to + // whichever store owns the region, so any compute can access any posting. + if (!distCfg.pdAddrs.empty()) { + ssdOverrides["TiKVPDAddresses"] = distCfg.pdAddrs; + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "Using PD address: %s (workerIndex=%d)\n", + distCfg.pdAddrs.c_str(), distCfg.workerIndex); + } + + // Pass through [SelectHead] and [BuildHead] params as overrides too + auto selectHeadParams = iniReader.GetParameters("SelectHead"); + for (const auto &[key, val] : selectHeadParams) { + ssdOverrides["SelectHead." + key] = val; + } + auto buildHeadParams = iniReader.GetParameters("BuildHead"); + for (const auto &[key, val] : buildHeadParams) { + ssdOverrides["BuildHead." + key] = val; + } + BOOST_TEST_MESSAGE("=== Benchmark Configuration ==="); BOOST_TEST_MESSAGE("Vector Path: " << vectorPath); BOOST_TEST_MESSAGE("Query Path: " << queryPath); @@ -2273,31 +2859,224 @@ BOOST_AUTO_TEST_CASE(BenchmarkFromConfig) BOOST_TEST_MESSAGE("QuantizedDim: " << quantizedDim); } + // Worker node path: if distributed and workerIndex > 0, run as remote worker and return + if (distCfg.enabled && distCfg.workerIndex > 0) { + int workerTimeout = iniReader.GetParameter("Benchmark", "WorkerTimeout", 3600); + BOOST_TEST_MESSAGE("Running as worker node " << distCfg.workerIndex); + if (valueType == VectorValueType::Float) + RunWorker(indexPath, dimension, baseVectorCount, insertVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numQueries, valueType, ssdOverrides, distCfg, workerTimeout); + else if (valueType == VectorValueType::Int8) + RunWorker(indexPath, dimension, baseVectorCount, insertVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numQueries, valueType, ssdOverrides, distCfg, workerTimeout); + else if (valueType == VectorValueType::UInt8) + RunWorker(indexPath, dimension, baseVectorCount, insertVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numQueries, valueType, ssdOverrides, distCfg, workerTimeout); + return; + } + // Get output file path from environment variable or use default const char *outputPath = std::getenv("BENCHMARK_OUTPUT"); std::string outputFile = outputPath ? std::string(outputPath) : "output.json"; BOOST_TEST_MESSAGE("Output File: " << outputFile); - // Dispatch to appropriate type + // Driver path (nodeIndex == 0 or single-node mode) if (valueType == VectorValueType::Float) { RunBenchmark(vectorPath, queryPath, truthPath, distMethod, indexPath, dimension, baseVectorCount, - insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, iniReader, - outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers); + insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, outputFile, + rebuild, resume, quantizerFilePath, quantizedDim, layers, ssdOverrides, rebuildSsdOnly, buildOnly, distCfg); } else if (valueType == VectorValueType::Int8) { RunBenchmark(vectorPath, queryPath, truthPath, distMethod, indexPath, dimension, baseVectorCount, - insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, iniReader, - outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers); + insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, + outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers, ssdOverrides, rebuildSsdOnly, buildOnly, distCfg); } else if (valueType == VectorValueType::UInt8) { RunBenchmark(vectorPath, queryPath, truthPath, distMethod, indexPath, dimension, baseVectorCount, - insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, iniReader, - outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers); + insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, + outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers, ssdOverrides, rebuildSsdOnly, buildOnly, distCfg); + } +} + +/// Worker node path for distributed benchmark (nodeIndex > 0). +/// Loads a pre-built head index, connects to TiKV, starts WorkerNode, +/// and waits for TCP dispatch commands from the driver node. +template +void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount, + int insertVectorCount, int batches, int topK, int numSearchThreads, + int numInsertThreads, int numQueries, VectorValueType valueType, + const std::map& ssdOverrides, + const DistributedConfig& distCfg, int workerTimeout) +{ + int oldN = N, oldM = M, oldK = K, oldQ = queries; + N = baseVectorCount; M = dimension; K = topK; queries = numQueries; + + int nodeIndex = distCfg.workerIndex; + int numNodes = distCfg.GetNumWorkers(); + int insertBatchSize = insertVectorCount / std::max(batches, 1); + bool strideShard = IsStrideShardEnabled() && numNodes > 1; + int myInsertStart, myInsertEnd, perNodeBatch; + if (strideShard) { + myInsertStart = 0; + myInsertEnd = insertBatchSize; + perNodeBatch = static_cast(StrideCount(insertBatchSize, numNodes, nodeIndex)); + } else { + myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0; + myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize; + perNodeBatch = myInsertEnd - myInsertStart; + } + + BOOST_TEST_MESSAGE("Worker node " << nodeIndex << ": Loading index from " << indexPath); + std::shared_ptr index; + // IMPORTANT: Pass ssdOverrides through LoadIndex so that worker-specific settings + // (especially TiKVPDAddresses pointing at this worker's local PD) are applied + // BEFORE the underlying TiKV connection is constructed in PrepareDB. Without this, + // the worker would inherit the driver's PD address from the saved indexloader.ini + // and route every KV write back to the driver's TiKV instead of its own. + BOOST_REQUIRE(VectorIndex::LoadIndex(indexPath, ssdOverrides, index) == ErrorCode::Success); + BOOST_REQUIRE(index != nullptr); + + // Create WorkerNode + auto dispAddr = distCfg.GetDispatcherAddr(); + auto workerAddrs = ParseNodeAddrs(distCfg.workerAddrs); + auto storeAddrs = Helper::StrUtils::SplitString(distCfg.storeAddrs, ","); + + auto* spannIndex = dynamic_cast*>(index.get()); + BOOST_REQUIRE_MESSAGE(spannIndex != nullptr, "Failed to cast to SPANN::Index"); + auto diskIndex = spannIndex->GetDiskIndex(0); + BOOST_REQUIRE(diskIndex != nullptr); + auto* searcher = dynamic_cast*>(diskIndex.get()); + BOOST_REQUIRE(searcher != nullptr); + auto workerDb = searcher->GetDB(); + BOOST_REQUIRE_MESSAGE(workerDb != nullptr, "Worker: could not extract db from index"); + + SPANN::WorkerNode workerNode; + BOOST_REQUIRE_MESSAGE(workerNode.Initialize(workerDb, nodeIndex, dispAddr, workerAddrs, storeAddrs), + "WorkerNode initialization failed"); + BOOST_REQUIRE(workerNode.Start()); + auto* router = &workerNode; + + // Bind worker to ALL searcher layers (every layer must see the worker so + // AddIDCapacity grows each layer's version map by capa * numNodes). + BindWorkerToAllLayers(router, spannIndex); + + // Wait for ring from dispatcher + BOOST_REQUIRE_MESSAGE(router->WaitForRing(120), + "Worker: Timed out waiting for ring from dispatcher"); + + BOOST_TEST_MESSAGE("Worker " << nodeIndex << ": Ready, numNodes=" << numNodes + << " perNodeBatch=" << perNodeBatch); + + // Build data file names + std::string typeStr = Helper::Convert::ConvertToString(valueType); + std::string paddset = "perftest_addvector.bin." + typeStr + "_" + std::to_string(insertVectorCount) + "_" + std::to_string(dimension); + std::string paddmeta = "perftest_addmeta.bin." + std::to_string(baseVectorCount) + "_" + std::to_string(insertVectorCount); + std::string paddmetaidx = "perftest_addmetaidx.bin." + std::to_string(baseVectorCount) + "_" + std::to_string(insertVectorCount); + + // Load query set + int searchK = topK; + std::string pqueryset = "perftest_query.bin." + typeStr + "_" + std::to_string(numQueries) + "_" + std::to_string(dimension); + auto queryset = TestUtils::TestDataGenerator::LoadVectorSet(pqueryset, dimension); + BOOST_REQUIRE_MESSAGE(queryset != nullptr, "Worker: Failed to load query set from " << pqueryset); + + // Register dispatch callback + std::promise stopPromise; + auto stopFuture = stopPromise.get_future(); + std::once_flag stopOnce; + + router->SetDispatchCallback([&](const SPANN::DispatchCommand& cmd) -> SPANN::DispatchResult { + SPANN::DispatchResult result; + result.m_dispatchId = cmd.m_dispatchId; + result.m_round = cmd.m_round; + + if (cmd.m_type == SPANN::DispatchCommand::Type::Stop) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Stop command received\n", nodeIndex); + std::call_once(stopOnce, [&]() { stopPromise.set_value(); }); + result.m_status = SPANN::DispatchResult::Status::Success; + return result; + } + + if (cmd.m_type == SPANN::DispatchCommand::Type::Heartbeat) { + // Driver sends a Heartbeat every HeartbeatIntervalSec; the result + // is dropped by DispatchCoordinator. Acknowledge silently so we + // don't log noise every 30s during the insert phase. + result.m_status = SPANN::DispatchResult::Status::Success; + return result; + } + + if (cmd.m_type == SPANN::DispatchCommand::Type::Search) { + int myStart = (int)((long long)nodeIndex * numQueries / numNodes); + int myEnd = (int)((long long)(nodeIndex + 1) * numQueries / numNodes); + int myCount = myEnd - myStart; + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Search round %u - %d queries [%d, %d)\n", + nodeIndex, cmd.m_round, myCount, myStart, myEnd); + + std::vector results; + double wallTime = ExecutePartitionedSearch( + index.get(), queryset, myStart, myCount, searchK, + std::min(numSearchThreads, myCount), + results, /*latenciesOut=*/nullptr, /*statsOut=*/nullptr); + + // Drain merge hints accumulated during this search round. + // Search-side AsyncMergeInSearch on remote-owned heads enqueues + // notifications via QueueRemoteMerge; auto-flush only fires when + // a per-target bucket reaches kMergeAutoFlushThreshold, so the + // tail of every round (and any sparse rounds) needs an explicit + // drain to guarantee no hint is dropped. + router->FlushRemoteMerges(); + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Search round %u done - %.1fms\n", + nodeIndex, cmd.m_round, wallTime * 1000); + result.m_status = SPANN::DispatchResult::Status::Success; + result.m_wallTime = wallTime; + return result; + } + + if (cmd.m_type == SPANN::DispatchCommand::Type::Insert) { + int insertStart = cmd.m_round * insertBatchSize + myInsertStart; + int loadCount = strideShard ? insertBatchSize : perNodeBatch; + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Batch %u - inserting %d vectors (offset %d, strideShard=%d)\n", + nodeIndex, cmd.m_round + 1, perNodeBatch, insertStart, strideShard ? 1 : 0); + + auto t1 = std::chrono::high_resolution_clock::now(); + std::string workerTag = + "Worker " + std::to_string(nodeIndex) + " batch=" + std::to_string(cmd.m_round + 1); + LoadAndInsertBatch(spannIndex, paddset, paddmeta, paddmetaidx, dimension, + insertStart, loadCount, perNodeBatch, + strideShard, numNodes, nodeIndex, + numInsertThreads, router, + /*quantizer=*/nullptr, + /*searchDuringInsertThreads=*/0, + /*queryset=*/nullptr, + /*numQueries=*/0, /*searchK=*/5, + /*benchmarkData=*/nullptr, + workerTag.c_str()); + auto t2 = std::chrono::high_resolution_clock::now(); + double secs = std::chrono::duration_cast(t2 - t1).count() / 1000000.0; + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Batch %u done - %d vectors in %.2f s (%.1f vec/s)\n", + nodeIndex, cmd.m_round + 1, perNodeBatch, secs, perNodeBatch / secs); + + result.m_status = SPANN::DispatchResult::Status::Success; + result.m_wallTime = secs; + return result; + } + + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Worker %d: Unknown command type %d\n", + nodeIndex, (int)cmd.m_type); + result.m_status = SPANN::DispatchResult::Status::Failed; + return result; + }); + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Waiting for dispatch commands\n", nodeIndex); + + auto status = stopFuture.wait_for(std::chrono::seconds(workerTimeout)); + if (status == std::future_status::timeout) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Worker %d: Timeout after %ds\n", nodeIndex, workerTimeout); } - //std::filesystem::remove_all(indexPath); + router->ClearDispatchCallback(); + N = oldN; M = oldM; K = oldK; queries = oldQ; + BOOST_TEST_MESSAGE("Worker " << nodeIndex << ": Shutting down"); } BOOST_AUTO_TEST_SUITE_END() diff --git a/Test/src/TestDataGenerator.cpp b/Test/src/TestDataGenerator.cpp index cb3318548..c32f19e0a 100644 --- a/Test/src/TestDataGenerator.cpp +++ b/Test/src/TestDataGenerator.cpp @@ -274,7 +274,8 @@ void TestDataGenerator::GenerateBatchTruth(const std::string &filename, std:: } template -float TestDataGenerator::EvaluateRecall(const std::vector &res, std::shared_ptr &truth, int recallK, int k, int batch, int totalbatches) +float TestDataGenerator::EvaluateRecall(const std::vector &res, std::shared_ptr &truth, int recallK, int k, int batch, int totalbatches, + int totalQueries, int queryOffset) { if (!truth) { @@ -285,14 +286,17 @@ float TestDataGenerator::EvaluateRecall(const std::vector recallK = min(recallK, static_cast(truth->Dimension())); float totalRecall = 0.0f; float eps = 1e-4f; - SizeType distbase = truth->Count() - (totalbatches + 1) * res.size(); + // Use global queryCount when caller provides it (distributed path); otherwise + // assume single-node where res.size() IS the global query count. + SizeType queryCount = (totalQueries > 0) ? static_cast(totalQueries) : static_cast(res.size()); + SizeType distbase = truth->Count() - (totalbatches + 1) * queryCount; for (SizeType i = 0; i < res.size(); ++i) { - const SizeType *truthNN = reinterpret_cast(truth->GetData()) + batch * res.size() + i; + const SizeType *truthNN = reinterpret_cast(truth->GetVector(batch * queryCount + queryOffset + i)); float *truthD = nullptr; if (truth->Count() > distbase) { - truthD = reinterpret_cast(truth->GetVector(distbase + batch * res.size() + i)); + truthD = reinterpret_cast(truth->GetVector(distbase + batch * queryCount + queryOffset + i)); } for (int j = 0; j < recallK; ++j) { diff --git a/Test/src/main.cpp b/Test/src/main.cpp index c1a5cde60..ab8d1342c 100644 --- a/Test/src/main.cpp +++ b/Test/src/main.cpp @@ -7,9 +7,7 @@ #include #include -#ifdef TIKV #include -#endif using namespace boost::unit_test; @@ -38,9 +36,8 @@ struct GlobalFixture // adds GraphCycles bookkeeping under a global spinlock on every Lock(); // observed to consume ~12% CPU under high worker-thread parallelism in // gRPC client paths (perf-recorded 2026-05-06). -#ifdef TIKV - absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); -#endif + absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); + SPTAGVisitor visitor; traverse_test_tree(framework::master_test_suite(), visitor, false); } diff --git a/benchmark.ini b/benchmark.ini new file mode 100644 index 000000000..e2b400767 --- /dev/null +++ b/benchmark.ini @@ -0,0 +1,19 @@ +[Benchmark] +VectorPath=sift1b/base.100M.u8bin +QueryPath=sift1b/query.public.10K.u8bin +TruthPath=none +IndexPath=proidx/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=10000 +InsertVectorCount=10000 +DeleteVectorCount=0 +BatchNum=10 +TopK=5 +NumThreads=8 +NumQueries=100 +DistMethod=L2 +Rebuild=true +Resume=-1 +QuantizerFilePath=quantizer.bin +QuantizedDim=64 diff --git a/evaluation/distributed/README.md b/evaluation/distributed/README.md new file mode 100644 index 000000000..1f24bc865 --- /dev/null +++ b/evaluation/distributed/README.md @@ -0,0 +1,294 @@ +# Distributed Benchmark Evaluation — Insert Dominant + +Multi-machine SPTAG SPANN distributed benchmark for an **insert-dominant** workload +(1M base + 10M inserts in 10 batches, with concurrent search-during-insert) on +SIFT1B. Each physical node runs its own independent PD + TiKV (no shared Raft +replication — see "TiKV deployment model" below). + +## Files in this folder + +| File | Purpose | +| --- | --- | +| `configs/benchmark_insert_dominant_template.ini` | Benchmark template; `run_distributed.sh` fills `IndexPath`, `TiKVPDAddresses`, `TiKVKeyPrefix`, and `[Distributed]` from `cluster.conf`. | +| `run_distributed.sh` | Orchestrator: `deploy` / `start-tikv` / `run` / `stop-tikv` / `cleanup`. | +| `README.md` | This file. | + +## Architecture + +``` + ┌──────────────┐ + │ Driver │ (node 0) + │ RunBenchmark│ + │ + Router │ + └──┬───┬───┬──┘ + TCP Dispatch│ │ │ + ┌────────┘ │ └────────┐ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────┐ ┌──────────┐ + │ Worker 1 │ │ Worker 2 │ │ Worker N │ + │ + Router│ │ + Router│ │ + Router│ + └────┬─────┘ └────┬─────┘ └────┬─────┘ + │ │ │ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────┐ ┌──────────┐ + │ TiKV 1 │ │ TiKV 2 │ │ TiKV N │ (one PD + one TiKV per node) + └──────────┘ └──────────┘ └──────────┘ +``` + +- **Driver** (node 0): Builds the index, sends Search/Insert/Stop commands via TCP dispatch. +- **Workers** (nodes 1..N): Receive commands, execute their shard locally, report results back. +- **TiKV (per node)**: Each node runs its own independent PD + TiKV pair. Postings + for a head live on the node that owns that head's hash partition. +- **PostingRouter**: Hash-based head routing, remote append, head sync, dispatch protocol. + +## TiKV deployment model + +Unlike a single-machine multi-docker TiKV (3 PD + 3 TiKV behind 127.0.0.1 ports +22791-3 / 20161-3 sharing one Raft cluster), in this multi-machine setup **each +node runs its own isolated PD + TiKV pair** under host networking. Heads are +routed to nodes by hash, and each node's TiKV stores only its own shard. There +is no Raft replication between nodes (no cross-node region quorum), which is +intentional for insert-dominated benchmarks where Raft log overhead would dominate. + +Per-node ports (defaults from `cluster.conf`): + +| Service | Port | Notes | +| --- | --- | --- | +| PD client | `2379` | Local app uses `:2379`. | +| PD peer | `2380` | Inter-PD; isolated cluster of 1 PD per node. | +| TiKV client | `20161` | The node-local SPTAG worker connects here. | +| Router | `30001+` | TCP dispatch / posting routing between nodes. | + +## Prerequisites + +- `Release/SPTAGTest` built with TiKV support on the driver node: + ```bash + cd + cd ThirdParty/kvproto && ./generate_cpp.sh && cd ../.. + mkdir -p Release && cd Release + cmake .. -DTIKV=ON -DTBB=ON -DCMAKE_BUILD_TYPE=Release -DGPU=OFF + cmake --build . --target SPTAGTest -j$(nproc) + ``` + *Note: building the full project may fail on the Java wrapper (`JAVASPTAGFileIO`) + due to a pre-existing `FileIOInterface.h` signature mismatch — the `SPTAGTest` + target alone is sufficient.* +- Passwordless SSH from driver to every other node (configure `ssh_key` in `cluster.conf`). +- Docker installed on every node (TiKV/PD run as containers in host network mode). +- Same dataset path on every node (default `/mnt/nvme/sift1b/`): + - `/mnt/nvme/sift1b/bigann_base.u8bin` (1B × 128 × u8) + - `/mnt/nvme/sift1b/query.10K.u8bin` +- Same fast-storage path for index + TiKV data on every node (`data_dir` in `cluster.conf`, + default `/mnt/nvme`). + +## Step 1 — Cluster config + +```bash +cp evaluation/distributed/cluster.conf.example cluster.conf +vim cluster.conf +``` + +Example: + +```ini +[cluster] +ssh_user=superbench +sptag_dir=/home/superbench/zhangt/SPTAG +data_dir=/mnt/nvme +tikv_version=v7.5.1 +pd_version=v7.5.1 + +[nodes] +# host router_port +10.0.1.1 30001 # driver (always first) +10.0.1.2 30002 # worker 1 +10.0.1.3 30003 # worker 2 + +[tikv] +# host pd_client pd_peer tikv_port +10.0.1.1 2379 2380 20161 +10.0.1.2 2379 2380 20161 +10.0.1.3 2379 2380 20161 +``` + +`run_distributed.sh` reads this file to fill the template's `[Distributed]`, +`TiKVPDAddresses`, `IndexPath`, and `TiKVKeyPrefix` automatically. + +## Step 2 — Deploy + +```bash +./evaluation/distributed/run_distributed.sh deploy cluster.conf +``` + +This rsyncs `Release/SPTAGTest` (and required shared libs) to every node and +ensures the per-node TiKV / PD data directories exist under `data_dir`. + +## Step 3 — Start TiKV (per-node, independent) + +```bash +./evaluation/distributed/run_distributed.sh start-tikv cluster.conf +``` + +This starts one PD + one TiKV per node in host-network containers. Single-replica +placement (`max-replicas=1`) is set so we measure benchmark performance without +3-way Raft replication. + +Health check (run on driver, repeat per node): + +```bash +for ip in 10.0.1.1 10.0.1.2 10.0.1.3; do + curl -s "http://$ip:2379/pd/api/v1/stores" \ + | python3 -c 'import json,sys; print([s["store"]["state_name"] for s in json.load(sys.stdin)["stores"]])' +done +# Each node should report ['Up']. +``` + +### Pre-split & scatter (optional but recommended) + +For the insert-dominant workload to spread region writes evenly across regions +within a node's TiKV, pre-split the keyspace at boundaries derived from +`DBKey(headID) = MaxID*layer + headID` little-endian byte 0. The TiKV raw key is +`TiKVKeyPrefix + "_" + uint32_le(DBKey)`; for multi-chunk it appends `\x00` / +`\x02` for chunk / count keys, but we split *only* on the head-key prefix so all +chunk and count variants for a head share a region. Boundaries used: `0x02, 0x04, +…, 0xfe` (127 split points → 128 regions). + +Driver-side helper (each PD is independent, so run per node): + +```bash +PREFIX="bench_insert_dominant_3node" # keep in sync with KEY_PREFIX in run_distributed.sh +for ip in 10.0.1.1 10.0.1.2 10.0.1.3; do + PD="http://$ip:2379" + PDCTL=(docker run --rm --network host --entrypoint /pd-ctl pingcap/pd:v7.5.1 -u "$PD") + python3 - "$PREFIX" "${PDCTL[@]}" <<'PY' +import json, subprocess, sys +prefix = sys.argv[1].encode() + b'_' +pdctl = sys.argv[2:] +def run(args): return subprocess.check_output(pdctl + args, text=True) +def region_for(hex_key): return json.loads(run(['region', 'key', '--format=hex', hex_key]))['id'] +for b in range(2, 256, 2): + key = (prefix + bytes([b, 0, 0, 0])).hex() + rid = region_for(key) + run(['operator', 'add', 'split-region', str(rid), '--policy=usekey', '--keys', key]) +for r in json.loads(run(['region', 'scan']))['regions']: + run(['operator', 'add', 'scatter-region', str(r['id'])]) +PY +done +``` + +Skip this on the very first run if you don't have load skew — `start-tikv` works +without it. For 1B-scale insert-dominant runs on a single node it materially +reduces head-region hot-spotting. + +## Step 4 — Run the benchmark + +```bash +# Single scale, explicit node count (driver + (N-1) workers): +./evaluation/distributed/run_distributed.sh run cluster.conf insert_dominant 3 + +# Or sweep 1-node baseline + N-node distributed for one or more scales: +./evaluation/distributed/run_distributed.sh bench cluster.conf insert_dominant +``` + +What `run` does: + +1. **Build** (driver only): driver builds the index locally with router + *disabled* (`Rebuild=true`, no `[Router]`). Output goes to `…_n0/spann_index`. +2. **Distribute**: rsync head index + perftest files from driver to each worker. +3. **Workers**: SSH-launches `SPTAGTest` on each worker with `WORKER_INDEX=i` and + the per-node ini (router enabled, `Rebuild=false`). +4. **Driver**: relaunches `SPTAGTest` with router enabled, `Rebuild=false`. The + driver dispatches Insert / Search commands across batches via TCP. +5. **Collect**: driver sends Stop, joins worker logs into `benchmark_logs/`. + +Useful environment overrides (see header of `run_distributed.sh`): + +- `NOCACHE=1` — disable TiKV block cache, OS pagecache, and `VersionCacheMaxChunks`. +- `BUILD_WITH_CACHE=1` — build with caches, then drop caches before search/insert (NOCACHE only). +- `SKIP_TIKV_SWAP=1` — when using `BUILD_WITH_CACHE`, skip the destructive TiKV + container restart that has corrupted recall at 100M scale. +- `SKIP_SAVE_LOAD=1` — skip post-build SaveIndex / per-batch Load+Clone+Save (NOCACHE only). +- `SKIP_HEAD_BUILD=1` — reuse existing HeadIndex if present (RebuildSSDOnly). + +## Step 5 — Stop / cleanup + +```bash +./evaluation/distributed/run_distributed.sh stop-tikv cluster.conf +./evaluation/distributed/run_distributed.sh cleanup cluster.conf # remove deployed files +``` + +## Key knobs in `benchmark_insert_dominant_template.ini` + +| Key | Value | Meaning | +| --- | --- | --- | +| `BaseVectorCount` | 1_000_000 | Initial index build size. | +| `InsertVectorCount` / `BatchNum` | 10_000_000 / 10 | 10 batches × 1M inserts. | +| `NumSearchThreads` | 4 | Threads for the standalone post-batch query benchmark. | +| `NumInsertThreads` | 16 | Threads driving `AddIndex` calls on the driver. | +| `AppendThreadNum` | 144 | Async append worker pool size — overprovisioned (≈3× cores) because each thread is I/O-bound on TiKV RPCs, so high concurrency increases in-flight RPCs. | +| `NumSearchDuringInsertThreads` | 1 | Concurrent search threads while inserting (continuous loop, ~1s sleep per query). | +| `NumQueries` | 200 | Size of the rotating query pool (in-insert search loops over it). | +| `WorkerTimeout` | 14400 | Seconds a worker waits for the driver before exiting. | +| `Storage` / `TiKVKeyPrefix` / `TiKVPDAddresses` | `TIKVIO` / filled / filled | Filled by `run_distributed.sh` from `cluster.conf`. | +| `Layers` | 2 | SPANN multi-layer head. | +| `BuildSSDIndex.UseMultiChunkPosting` | false | Single-key posting layout (one TiKV value per head). | +| `BuildSSDIndex.PostingPageLimit` | 8 | Posting page limit; runtime cap is logged as ~246 vectors. | +| `BuildSSDIndex.PostingCountCacheCapacity` | 1_000_000 | Posting-count cache capacity. | +| `BuildSSDIndex.DistributedVersionMap` | true | Use TiKV-backed distributed version map. | +| `BuildSSDIndex.ReassignK` | 64 | Split/reassign target fanout knob. | +| `BuildSSDIndex.AsyncMergeInSearch` | true | Async merge during search. | +| `BuildSSDIndex.VersionCacheMaxChunks` | 100_000 | Local version-chunk cache (set ≤0 to disable). | +| `BuildSSDIndex.LatencyLimit` | 100 | ms latency cap fed to SPANN. | +| `BuildSSDIndex.MaxCheck` | 8192 | Max posting checks per query. | +| `BuildSSDIndex.SearchInternalResultNum` | 64 | Internal candidate count during search. | + +## Output JSON structure (per batch) + +For each insert batch, `output.json/results.benchmark1_insert.batch_N` contains: + +- `Load timeSeconds` / `Load vectorCount` — reload of previous batch. +- `Clone timeSeconds`. +- In-insert concurrent search stats (continuous-loop variant): + `numQueries` (actual count issued), `meanLatency`, `p50/p90/p95/p99`, `qps`, + `batch barrier waitSeconds`. +- `inserted`, `insert timeSeconds`, `insert throughput`. +- `search` and `search_round2` — standalone `BenchmarkQueryPerformance` results + against the post-batch index (cold + warm), independent of the in-insert numbers. +- `save timeSeconds`. + +Pre-insert baseline lives at `results.benchmark0_query_before_insert` and +`results.benchmark0b_query_before_insert_round2`. + +## Dispatch Protocol + +The TCP dispatch protocol replaces file-based barriers. Communication flows through +PostingRouter's existing TCP transport: + +| Packet | Direction | Purpose | +|--------|-----------|---------| +| `DispatchCommand (0x09)` | Driver → Worker | Search/Insert/Stop with `dispatchId` + round. | +| `DispatchResult (0x89)` | Worker → Driver | Status + wallTime for aggregation. | + +- **Search**: Driver broadcasts to workers, runs local queries in parallel, collects + wall times for percentile stats. +- **Insert**: Driver broadcasts batch index, workers insert their shard, driver + waits for all to finish. +- **Stop**: Driver sends at end of benchmark; workers exit gracefully. + +Each command has a unique `dispatchId` (monotonic uint64) to avoid round collisions +between search and insert operations. + +## Troubleshooting + +- **Workers don't connect**: confirm `RouterNodeAddrs` ports (default 30001+) are + reachable between every pair of nodes — the router uses TCP with 2 io_context + threads. +- **TiKV timeout**: ensure each node's PD `advertise-client-urls` use a reachable + IP (not 127.0.0.1) — `start-tikv` sets this from `cluster.conf`. Check + `docker logs sptag-pd-0` on the affected node. +- **Worker exits prematurely**: check the worker logs in `benchmark_logs/`. + Common causes: TiKV not ready, index path mismatch, router connection failure. +- **Build fails on Java wrapper**: pre-existing issue unrelated to the benchmark. + Build only what's needed: + ```bash + cmake --build . --target SPTAGTest -j$(nproc) + ``` diff --git a/evaluation/distributed/configs/benchmark_100m_1node.ini b/evaluation/distributed/configs/benchmark_100m_1node.ini new file mode 100644 index 000000000..42ec07f49 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_100m_1node.ini @@ -0,0 +1,71 @@ +; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload). +; 100× larger base index than insert_dominant. Tests how the system behaves when +; the head index is large (~tens of millions of heads on layer 0) and the insert +; rate is moderate. Layers=2, L2 distance, SIFT1B dataset. +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +; +; Notes for 100M-scale operation: +; - First run MUST build the index (Rebuild=true). Build of 99M base takes hours; +; reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the +; HeadIndex on disk is intact. +; - Truth (top-5 over 99M) is recomputed at start each run; expect ~minutes. +; - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts; +; use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle. +; - TiKV data will grow to ~50-100GB per store at this scale; both nodes need +; plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=/mnt/nvme/proidx_100m_1node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=99000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=true +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=bench100m_1node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=10000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=1000000 +AsyncRpcMaxInflight=512 + +[Distributed] +Enabled=true +DispatcherAddr=10.11.0.7:30001 +WorkerAddrs=10.11.0.7:30011 +StoreAddrs=10.11.0.7:20171 +PDAddrs=10.11.0.7:23791 diff --git a/evaluation/distributed/configs/benchmark_100m_2node.ini b/evaluation/distributed/configs/benchmark_100m_2node.ini new file mode 100644 index 000000000..01b9c3e81 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_100m_2node.ini @@ -0,0 +1,71 @@ +; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload). +; 100× larger base index than insert_dominant. Tests how the system behaves when +; the head index is large (~tens of millions of heads on layer 0) and the insert +; rate is moderate. Layers=2, L2 distance, SIFT1B dataset. +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +; +; Notes for 100M-scale operation: +; - First run MUST build the index (Rebuild=true). Build of 99M base takes hours; +; reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the +; HeadIndex on disk is intact. +; - Truth (top-5 over 99M) is recomputed at start each run; expect ~minutes. +; - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts; +; use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle. +; - TiKV data will grow to ~50-100GB per store at this scale; both nodes need +; plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=/mnt/nvme/proidx_100m_2node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=99000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=false +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=bench100m_2node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=10000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=1000000 +AsyncRpcMaxInflight=512 + +[Distributed] +Enabled=true +DispatcherAddr=10.11.0.7:30001 +WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002 +StoreAddrs=10.11.0.7:20171,10.11.0.10:20171 +PDAddrs=10.11.0.7:23791,10.11.0.10:23791 diff --git a/evaluation/distributed/configs/benchmark_100m_template.ini b/evaluation/distributed/configs/benchmark_100m_template.ini new file mode 100644 index 000000000..4a69f39a4 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_100m_template.ini @@ -0,0 +1,71 @@ +; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload). +; 100× larger base index than insert_dominant. Tests how the system behaves when +; the head index is large (~tens of millions of heads on layer 0) and the insert +; rate is moderate. Layers=2, L2 distance, SIFT1B dataset. +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +; +; Notes for 100M-scale operation: +; - First run MUST build the index (Rebuild=true). Build of 99M base takes hours; +; reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the +; HeadIndex on disk is intact. +; - Truth (top-5 over 99M) is recomputed at start each run; expect ~minutes. +; - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts; +; use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle. +; - TiKV data will grow to ~50-100GB per store at this scale; both nodes need +; plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=PLACEHOLDER +ValueType=UInt8 +Dimension=128 +BaseVectorCount=99000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=true +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=PLACEHOLDER + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=10000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=1000000 +AsyncRpcMaxInflight=512 + +[Distributed] +Enabled=true +DispatcherAddr=PLACEHOLDER +WorkerAddrs=PLACEHOLDER +StoreAddrs=PLACEHOLDER +PDAddrs=PLACEHOLDER diff --git a/evaluation/distributed/configs/benchmark_10m_1node.ini b/evaluation/distributed/configs/benchmark_10m_1node.ini new file mode 100644 index 000000000..56dbd9088 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_10m_1node.ini @@ -0,0 +1,62 @@ +; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload). +; 10× larger base index than insert_dominant, 10× smaller than 100m. +; Useful for validating scaling between 1M and 100M without paying the +; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset +; (truncated to 10M of the 1B available). +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=/mnt/nvme/proidx_10m_1node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=9000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=true +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=bench10m_1node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=1000000 +AsyncRpcMaxInflight=512 + +[Distributed] +Enabled=true +DispatcherAddr=10.11.0.7:30001 +WorkerAddrs=10.11.0.7:30011 +StoreAddrs=10.11.0.7:20171 +PDAddrs=10.11.0.7:23791 diff --git a/evaluation/distributed/configs/benchmark_10m_2node.ini b/evaluation/distributed/configs/benchmark_10m_2node.ini new file mode 100644 index 000000000..4ed317ac3 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_10m_2node.ini @@ -0,0 +1,62 @@ +; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload). +; 10× larger base index than insert_dominant, 10× smaller than 100m. +; Useful for validating scaling between 1M and 100M without paying the +; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset +; (truncated to 10M of the 1B available). +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=/mnt/nvme/proidx_10m_2node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=9000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=false +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=bench10m_2node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=1000000 +AsyncRpcMaxInflight=512 + +[Distributed] +Enabled=true +DispatcherAddr=10.11.0.7:30001 +WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002 +StoreAddrs=10.11.0.7:20171,10.11.0.10:20171 +PDAddrs=10.11.0.7:23791,10.11.0.10:23791 diff --git a/evaluation/distributed/configs/benchmark_10m_template.ini b/evaluation/distributed/configs/benchmark_10m_template.ini new file mode 100644 index 000000000..f40203559 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_10m_template.ini @@ -0,0 +1,62 @@ +; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload). +; 10× larger base index than insert_dominant, 10× smaller than 100m. +; Useful for validating scaling between 1M and 100M without paying the +; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset +; (truncated to 10M of the 1B available). +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=PLACEHOLDER +ValueType=UInt8 +Dimension=128 +BaseVectorCount=9000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=true +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=PLACEHOLDER + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=1000000 +AsyncRpcMaxInflight=512 + +[Distributed] +Enabled=true +DispatcherAddr=PLACEHOLDER +WorkerAddrs=PLACEHOLDER +StoreAddrs=PLACEHOLDER +PDAddrs=PLACEHOLDER diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini new file mode 100644 index 000000000..30fe77bbe --- /dev/null +++ b/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini @@ -0,0 +1,58 @@ +; insert_dominant: 1M base + 1M insert with concurrent search-during-insert. +; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M). +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=/mnt/nvme/proidx_insert_dominant_1node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=1000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=true +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=benchinsert_dominant_1node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=100000 + +[Distributed] +Enabled=true +DispatcherAddr=10.11.0.7:30001 +WorkerAddrs=10.11.0.7:30011 +StoreAddrs=10.11.0.7:20171 +PDAddrs=10.11.0.7:23791 diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini new file mode 100644 index 000000000..d45870b50 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini @@ -0,0 +1,58 @@ +; insert_dominant: 1M base + 1M insert with concurrent search-during-insert. +; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M). +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=/mnt/nvme/proidx_insert_dominant_2node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=1000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=false +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=benchinsert_dominant_2node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=100000 + +[Distributed] +Enabled=true +DispatcherAddr=10.11.0.7:30001 +WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002 +StoreAddrs=10.11.0.7:20171,10.11.0.10:20171 +PDAddrs=10.11.0.7:23791,10.11.0.10:23791 diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini new file mode 100644 index 000000000..a8050732d --- /dev/null +++ b/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini @@ -0,0 +1,59 @@ +; insert_dominant: 1M base + 10M insert (10× scale-up) with concurrent search-during-insert. +; Tests how the index handles insertion-dominated workloads where insertion volume +; is much larger than the initial baseline. Layers=2, L2 distance, SIFT1B dataset. +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/data/sift1b/base.1B.u8bin +QueryPath=/mnt/data/sift1b/query.public.10K.u8bin +TruthPath=truth +IndexPath=/mnt/md0/proidx_insert_dominant_3node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=1000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=false +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=benchinsert_dominant_3node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=100000 + +[Distributed] +Enabled=true +DispatcherAddr=172.27.0.4:30001 +WorkerAddrs=172.27.0.4:30011,172.27.0.5:30002,172.27.0.6:30003 +StoreAddrs=172.27.0.4:20171,172.27.0.5:20171,172.27.0.6:20171 +PDAddrs=172.27.0.4:23791,172.27.0.5:23791,172.27.0.6:23791 diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_template.ini b/evaluation/distributed/configs/benchmark_insert_dominant_template.ini new file mode 100644 index 000000000..f8085c03b --- /dev/null +++ b/evaluation/distributed/configs/benchmark_insert_dominant_template.ini @@ -0,0 +1,58 @@ +; insert_dominant: 1M base + 1M insert with concurrent search-during-insert. +; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M). +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=PLACEHOLDER +ValueType=UInt8 +Dimension=128 +BaseVectorCount=1000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=true +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=PLACEHOLDER + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=100000 + +[Distributed] +Enabled=true +DispatcherAddr=PLACEHOLDER +WorkerAddrs=PLACEHOLDER +StoreAddrs=PLACEHOLDER +PDAddrs=PLACEHOLDER diff --git a/evaluation/distributed/configs/cluster_2node.conf b/evaluation/distributed/configs/cluster_2node.conf new file mode 100644 index 000000000..f94500487 --- /dev/null +++ b/evaluation/distributed/configs/cluster_2node.conf @@ -0,0 +1,31 @@ +# 2-node cluster: driver/worker0 on dev-000003 (10.11.0.7), +# worker1 on dev-000006 (10.11.0.10). +# On 000006, /mnt/nvme is symlinked to /mnt_ssd/data7/sptag-bench (data lives on data7 NVMe). +# +# Cluster mode: SHARED TiKV raft cluster. Both PDs form one raft group; both +# TiKVs share the same cluster (max-replicas=1, so each region lives on +# exactly one store and PD routes reads to it). Compute nodes are stateless +# TiKV clients — no cross-compute fetch RPCs during RNGSelection. +[cluster] +ssh_user=superbench +ssh_key=/home/superbench/.ssh/id_rsa +sptag_dir=/home/superbench/zhangt/SPTAG +data_dir=/mnt/nvme +tikv_version=v8.5.1 +pd_version=v8.5.1 +# Image refs (optional). Defaults: +# tikv_image=sptag-tikv (with tag :${tikv_version}) +# pd_image=sptag-pd (with tag :${pd_version}) +# helper_image=mcr.microsoft.com/mirror/docker/library/ubuntu:22.04 +# Override here to use different registries / replace with pingcap/* etc. + +[nodes] +# host router_port +# node 0 (driver) router_port must differ from dispatcher port (hardcoded 30001). +10.11.0.7 30011 +10.11.0.10 30002 + +[tikv] +# host pd_client_port pd_peer_port tikv_port +10.11.0.7 23791 23801 20171 +10.11.0.10 23791 23801 20171 diff --git a/evaluation/distributed/configs/cluster_3node.conf b/evaluation/distributed/configs/cluster_3node.conf new file mode 100644 index 000000000..ff2ba8af4 --- /dev/null +++ b/evaluation/distributed/configs/cluster_3node.conf @@ -0,0 +1,34 @@ +# 3-node cluster: driver/worker0 on 172.27.0.4, +# worker1 on 172.27.0.5 (20.92.202.166), +# worker2 on 172.27.0.6 (20.5.138.158). +# Data lives on /mnt/md0 (NVMe RAID0, ~11T per node). +# +# Cluster mode: SHARED TiKV raft cluster. All PDs form one raft group; all +# TiKVs share the same cluster (max-replicas=1, so each region lives on +# exactly one store and PD routes reads to it). Compute nodes are stateless +# TiKV clients — no cross-compute fetch RPCs during RNGSelection. +[cluster] +ssh_user=azureuser +ssh_key=/home/azureuser/.ssh/id_rsa +sptag_dir=/home/azureuser/zhangt/SPTAG +data_dir=/mnt/md0 +tikv_version=v8.5.1 +pd_version=v8.5.1 +# Image refs (optional). Defaults: +# tikv_image=sptag-tikv (with tag :${tikv_version}) +# pd_image=sptag-pd (with tag :${pd_version}) +# helper_image=mcr.microsoft.com/mirror/docker/library/ubuntu:22.04 +# Override here to use different registries / replace with pingcap/* etc. + +[nodes] +# host router_port +# node 0 (driver) router_port must differ from dispatcher port (hardcoded 30001). +172.27.0.4 30011 +172.27.0.5 30002 +172.27.0.6 30003 + +[tikv] +# host pd_client_port pd_peer_port tikv_port +172.27.0.4 23791 23801 20171 +172.27.0.5 23791 23801 20171 +172.27.0.6 23791 23801 20171 diff --git a/evaluation/distributed/configs/tikv.toml b/evaluation/distributed/configs/tikv.toml new file mode 100755 index 000000000..4ba5282c0 --- /dev/null +++ b/evaluation/distributed/configs/tikv.toml @@ -0,0 +1,74 @@ +memory-usage-limit = "80GB" + +[server] +# v41: 16 → 32 to handle higher concurrent gRPC streams. 96-core host has +# plenty of headroom; previous setting was a default-y stab in the dark. +grpc-concurrency = 32 +grpc-memory-pool-quota = "16GB" + +[raftstore] +region-max-size = "512MB" +region-split-size = "384MB" +region-max-keys = 5120000 +region-split-keys = 3840000 +# v41: 4 → 32. apply-pool is the path raft-log → RocksDB writes go through. +# At 32 concurrent RMW ops per store (4 local insert + 16 receiver sub-workers +# + 4 search + 4 search-during-insert + misc), a 4-thread apply pool meant +# ~8× queue depth, which is the primary write-amp source we observed +# (TiKV at 7/96 cores while ops are still queueing). +apply-pool-size = 32 +# v41: 4 → 16. store-pool routes raft messages between peers and to apply. +store-pool-size = 16 +# v41: batch up raft entries per fsync. If we're disk-fsync bound (likely), +# this directly amortizes the sync cost. +raft-write-batch-size = "1MB" + +[storage] +reserve-space = "1GB" +# v41: 4 (default) → 16. KV scheduler is the front-end before raftstore. +scheduler-worker-pool-size = 16 + +[storage.block-cache] +capacity = "60GB" + +# v41: new section. Read pool default = 0.8×CPU = 76 on 96-core host, which +# would let reads steal CPU from writes. Cap at 32 to leave room for write +# path. Min 8 ensures reads stay responsive under light load. +[readpool.unified] +max-thread-count = 32 +min-thread-count = 8 + +[rocksdb] +max-background-jobs = 32 +max-sub-compactions = 8 +# v41: 8 dedicated flush threads (subset of max-background-jobs). Reduces +# the chance that compaction monopolizes background-jobs and starves flushes. +max-background-flushes = 8 +rate-bytes-per-sec = "0" + +[rocksdb.defaultcf] +# v41: 512MB → 1GB. Bigger memtable means fewer flushes (and thus fewer L0 +# files), reducing the chance of slowdown/stop write triggers under burst. +write-buffer-size = "1GB" +# v41: 5 → 8. More memtables = more headroom before flush back-pressure. +max-write-buffer-number = 8 +min-write-buffer-number-to-merge = 2 +level0-file-num-compaction-trigger = 12 +# v41: 28 → 40, 40 → 60. Loosen the L0 stall thresholds so bursts have more +# slack. With 10K-item chunks (v39+) we generate more small writes than v38 +# did, so we hit slowdown more often. +level0-slowdown-writes-trigger = 40 +level0-stop-writes-trigger = 60 +max-bytes-for-level-base = "2GB" +compression-per-level = ["no", "no", "no", "lz4", "lz4", "zstd", "zstd"] +target-file-size-base = "128MB" + +[rocksdb.writecf] +write-buffer-size = "128MB" +max-write-buffer-number = 5 + +[coprocessor] +region-max-size = "512MB" +region-split-size = "384MB" +region-max-keys = 5120000 +region-split-keys = 3840000 diff --git a/evaluation/distributed/run_distributed.sh b/evaluation/distributed/run_distributed.sh new file mode 100755 index 000000000..c383a7eed --- /dev/null +++ b/evaluation/distributed/run_distributed.sh @@ -0,0 +1,1364 @@ +#!/bin/bash +# Multi-machine distributed benchmark orchestrator for SPTAG. +# +# Usage: +# ./run_distributed.sh deploy Deploy binary + data to all nodes +# ./run_distributed.sh setup-bins Download tikv-server / pd-server to every node +# ./run_distributed.sh start-tikv [node_count] Start independent TiKV/PD instances +# ./run_distributed.sh stop-tikv [node_count] Stop TiKV/PD instances +# ./run_distributed.sh run Run benchmark +# ./run_distributed.sh bench [scale...] Run 1-node + N-node for each scale +# ./run_distributed.sh cleanup Remove deployed files from remote nodes +# +# Environment variables: +# NOCACHE=1 Disable all caches (TiKV block cache, OS page cache, VersionCache) +# BUILD_WITH_CACHE=1 (only with NOCACHE=1) Use cached TiKV+VersionCache during the +# build phase, then restart TiKV with nocache config and drop all +# OS caches before the search/insert phase. Useful for large scales +# (e.g. 100M) where building under nocache is impractical. +# SKIP_TIKV_SWAP=1 (only with BUILD_WITH_CACHE=1) Skip the TiKV container restart. +# Drop OS caches and rely on VersionCache=0 INI overrides for "nocache" +# semantics. Avoids docker rm -f corruption that has destroyed recall +# at 100M scale; TiKV block cache stays warm but contains mostly recent +# build writes (random search reads largely miss it anyway). +# SKIP_SAVE_LOAD=1 (only with NOCACHE=1) Bypass the post-build SaveIndex / per-batch +# LoadIndex / Clone / SaveIndex cycles. For 1-node, build+search+insert +# run in a single SPTAGTest process, dropping OS pagecache after build. +# For 2-node, the build phase skips the broken final SaveIndex (relies +# on the index files written during BuildLargeIndex). Required at 100M +# scale where SaveIndex's "wait for all background jobs to finish" loop +# never terminates and risks a gRPC SEGFAULT after several hours. +# VersionCache cannot be reset mid-process so it stays warm from build. +# SKIP_HEAD_BUILD=1 Reuse existing HeadIndex if present (RebuildSSDOnly). Falls back to +# full build if HeadIndex is missing. +# +# Prerequisites: +# - Passwordless SSH from driver to all nodes (configure ssh_key in cluster.conf) +# - Docker installed on all nodes (for TiKV) +# - cluster.conf configured (see cluster.conf.example) +# +# The driver (first node in [nodes]) orchestrates everything. +# Compute nodes share a single TiKV raft cluster: all PDs join one raft group, +# all TiKVs point to all PDs, max-replicas=1 (no replication, each region on +# exactly one store). With 2 nodes this gives 2 PDs + 2 TiKV stores in one +# cluster; any compute can read any posting via PD-routed TiKV calls, so the +# distributed routing layer no longer needs to forward reads between computes. + +set -o pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +LOGDIR="$(cd "$SCRIPT_DIR/../.." && pwd)/benchmark_logs" +mkdir -p "$LOGDIR" + +# ─── Config Parsing ─── + +declare -a NODE_HOSTS NODE_ROUTER_PORTS +declare -a TIKV_HOSTS TIKV_PD_CLIENT_PORTS TIKV_PD_PEER_PORTS TIKV_PORTS +declare SSH_USER SPTAG_DIR DATA_DIR TIKV_VERSION PD_VERSION SSH_KEY +declare TIKV_IMAGE PD_IMAGE HELPER_IMAGE BIN_DIR MIRROR +TOTAL_NODES=0 + +parse_config() { + local CONF="$1" + if [ ! -f "$CONF" ]; then + echo "ERROR: Config file not found: $CONF" + exit 1 + fi + + local SECTION="" + + while IFS= read -r line || [ -n "$line" ]; do + # Strip comments and whitespace + line="${line%%#*}" + line="$(echo "$line" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')" + [ -z "$line" ] && continue + + # Section header + if [[ "$line" =~ ^\[(.+)\]$ ]]; then + SECTION="${BASH_REMATCH[1]}" + continue + fi + + case "$SECTION" in + cluster) + local key="${line%%=*}" + local val="${line#*=}" + case "$key" in + ssh_user) SSH_USER="$val" ;; + sptag_dir) SPTAG_DIR="$val" ;; + data_dir) DATA_DIR="$val" ;; + tikv_version) TIKV_VERSION="$val" ;; + pd_version) PD_VERSION="$val" ;; + tikv_image) TIKV_IMAGE="$val" ;; + pd_image) PD_IMAGE="$val" ;; + helper_image) HELPER_IMAGE="$val" ;; + bin_dir) BIN_DIR="$val" ;; + mirror) MIRROR="$val" ;; + ssh_key) SSH_KEY="$val" ;; + esac + ;; + nodes) + read -r host rport <<< "$line" + NODE_HOSTS+=("$host") + NODE_ROUTER_PORTS+=("$rport") + ;; + tikv) + read -r host pd_client pd_peer tikv_port <<< "$line" + TIKV_HOSTS+=("$host") + TIKV_PD_CLIENT_PORTS+=("$pd_client") + TIKV_PD_PEER_PORTS+=("$pd_peer") + TIKV_PORTS+=("$tikv_port") + ;; + esac + done < "$CONF" + + # Defaults + SSH_USER="${SSH_USER:-$(whoami)}" + TIKV_VERSION="${TIKV_VERSION:-v8.5.1}" + PD_VERSION="${PD_VERSION:-v8.5.1}" + # Single image used for ALL containers (PD, TiKV, helper). Stock MCR + # ubuntu:22.04 — never modified, never layered, so security scanners see + # only the MCR base image. TiKV / PD binaries are downloaded to the host + # at $BIN_DIR by `setup-bins` and bind-mounted into the container. + HELPER_IMAGE="${HELPER_IMAGE:-mcr.microsoft.com/mirror/docker/library/ubuntu:22.04}" + TIKV_IMAGE="${TIKV_IMAGE:-${HELPER_IMAGE}}" + PD_IMAGE="${PD_IMAGE:-${HELPER_IMAGE}}" + # Host path on every node where tikv-server / pd-server live. Populated + # by `setup-bins`. Mounted read-only into containers as /sptag-bin. + BIN_DIR="${BIN_DIR:-${SPTAG_DIR}/evaluation/distributed/bin}" + MIRROR="${MIRROR:-https://tiup-mirrors.pingcap.com}" + + # Expand ~ in ssh_key path + if [ -n "$SSH_KEY" ]; then + SSH_KEY="${SSH_KEY/#\~/$HOME}" + fi + + TOTAL_NODES=${#NODE_HOSTS[@]} + + if [ "$TOTAL_NODES" -lt 1 ]; then + echo "ERROR: No compute nodes defined in [nodes]" + exit 1 + fi + if [ ${#TIKV_HOSTS[@]} -lt 1 ]; then + echo "ERROR: No TiKV instances defined in [tikv]" + exit 1 + fi + + echo "Cluster config loaded:" + echo " Compute nodes: $TOTAL_NODES (driver: ${NODE_HOSTS[0]})" + echo " TiKV instances: ${#TIKV_HOSTS[@]}" + echo " SSH user: $SSH_USER" + echo " SSH key: ${SSH_KEY:-(none)}" + echo " SPTAG dir: $SPTAG_DIR" + echo " Data dir: $DATA_DIR" +} + +# ─── SSH Helpers ─── + +# Build SSH options string (key + host checking) +_ssh_opts() { + local opts="-o StrictHostKeyChecking=no -o ConnectTimeout=10" + if [ -n "$SSH_KEY" ]; then + opts+=" -i $SSH_KEY" + fi + echo "$opts" +} + +# Run command on remote host (or locally if it's the driver) +remote_exec() { + local host="$1"; shift + if [ "$host" = "${NODE_HOSTS[0]}" ] || [ "$host" = "localhost" ] || [ "$host" = "127.0.0.1" ]; then + eval "$@" + else + ssh $(_ssh_opts) "$SSH_USER@$host" "$@" + fi +} + +# rsync files to remote host +remote_sync() { + local host="$1" + local src="$2" + local dst="$3" + if [ "$host" = "${NODE_HOSTS[0]}" ] || [ "$host" = "localhost" ]; then + # Local copy — skip if same path + if [ "$(realpath "$src")" != "$(realpath "$dst")" ]; then + rsync -az --progress "$src" "$dst" + fi + else + rsync -az --progress -e "ssh $(_ssh_opts)" "$src" "$SSH_USER@$host:$dst" + fi +} + +# ─── Deploy ─── + +cmd_deploy() { + echo "" + echo "=== Deploying SPTAG to ${#NODE_HOSTS[@]} nodes ===" + echo "" + + # Validate SSH connectivity + for host in "${NODE_HOSTS[@]}"; do + if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi + echo -n " Checking SSH to $host... " + if remote_exec "$host" "echo ok" >/dev/null 2>&1; then + echo "OK" + else + echo "FAILED" + echo "ERROR: Cannot SSH to $SSH_USER@$host" + exit 1 + fi + done + + # Deploy binary to all remote nodes + echo "" + echo "Deploying binary..." + local BINARY="$SPTAG_DIR/Release/SPTAGTest" + if [ ! -f "$BINARY" ]; then + echo "ERROR: Binary not found: $BINARY (run cmake build first)" + exit 1 + fi + + for host in "${NODE_HOSTS[@]}"; do + if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi + echo " → $host:$SPTAG_DIR/Release/" + remote_exec "$host" "mkdir -p $SPTAG_DIR/Release" + remote_sync "$host" "$BINARY" "$SPTAG_DIR/Release/SPTAGTest" + # Also deploy any shared libraries + if ls "$SPTAG_DIR/Release/"*.so 2>/dev/null; then + remote_sync "$host" "$SPTAG_DIR/Release/*.so" "$SPTAG_DIR/Release/" + fi + # Deploy bundled runtime libs (boost 1.73 / abseil / tbb / libstdc++) + # used by SPTAGTest. Not committed; produced locally on the driver. + if [ -d "$SPTAG_DIR/Release/runtime_libs" ]; then + remote_exec "$host" "mkdir -p $SPTAG_DIR/Release/runtime_libs" + rsync -az -e "ssh $(_ssh_opts)" \ + "$SPTAG_DIR/Release/runtime_libs/" \ + "$SSH_USER@$host:$SPTAG_DIR/Release/runtime_libs/" + fi + done + + # Deploy data files (perftest_* vectors, queries) + echo "" + echo "Deploying data files..." + for host in "${NODE_HOSTS[@]}"; do + if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi + echo " → $host:$SPTAG_DIR/ (perftest_* files)" + remote_exec "$host" "mkdir -p $SPTAG_DIR" + rsync -az --progress \ + --include='perftest_*' --exclude='*' \ + -e "ssh $(_ssh_opts)" \ + "$SPTAG_DIR/" "$SSH_USER@$host:$SPTAG_DIR/" + done + + echo "" + echo "Deploy complete." +} + +# ─── TiKV/PD Binary Setup ─── + +setup_bins_one_host() { + # Ensure tikv-server / pd-server are present at $BIN_DIR on $1. + # Downloads from $MIRROR if missing or version mismatch. Idempotent. + local host="$1" + local cmd + # shellcheck disable=SC2016 + cmd='set -e + mkdir -p "'"$BIN_DIR"'" + cd "'"$BIN_DIR"'" + need_tikv=1 + if [ -x tikv-server ] && ./tikv-server --version 2>/dev/null | grep -q "Release Version:[[:space:]]*'"${TIKV_VERSION#v}"'"; then + need_tikv=0 + fi + if [ "$need_tikv" = "1" ]; then + echo " Downloading tikv-'"${TIKV_VERSION}"'..." + curl -fsSL "'"${MIRROR}"'/tikv-'"${TIKV_VERSION}"'-linux-amd64.tar.gz" | tar -xz + chmod +x tikv-server + else + echo " tikv-'"${TIKV_VERSION}"' already present" + fi + need_pd=1 + if [ -x pd-server ] && ./pd-server --version 2>/dev/null | grep -q "Release Version:[[:space:]]*'"${PD_VERSION}"'"; then + need_pd=0 + fi + if [ "$need_pd" = "1" ]; then + echo " Downloading pd-'"${PD_VERSION}"'..." + curl -fsSL "'"${MIRROR}"'/pd-'"${PD_VERSION}"'-linux-amd64.tar.gz" | tar -xz + chmod +x pd-server pd-ctl pd-recover 2>/dev/null || true + else + echo " pd-'"${PD_VERSION}"' already present" + fi' + + if [ "$host" = "${NODE_HOSTS[0]}" ] || [ "$host" = "localhost" ] || [ "$host" = "127.0.0.1" ]; then + bash -c "$cmd" + else + remote_exec "$host" "$cmd" + fi +} + +cmd_setup_bins() { + # Download tikv-server + pd-server to ${BIN_DIR} on every distinct host + # used by the cluster (compute nodes ∪ tikv nodes). Idempotent. + echo "" + echo "=== Setting up TiKV/PD binaries ===" + echo " BIN_DIR : $BIN_DIR" + echo " TIKV : $TIKV_VERSION" + echo " PD : $PD_VERSION" + echo " MIRROR : $MIRROR" + + declare -A seen + local -a hosts=() + local h + for h in "${NODE_HOSTS[@]}" "${TIKV_HOSTS[@]}"; do + if [ -z "${seen[$h]:-}" ]; then + seen[$h]=1 + hosts+=("$h") + fi + done + + for h in "${hosts[@]}"; do + echo "" + echo "→ $h" + setup_bins_one_host "$h" + done + + echo "" + echo "Binary setup complete." +} + +# ─── TiKV Management (Independent Mode) ─── + + +tikv_start() { + # Start the first PD+TiKV pairs. + # + # node_count == 1: standalone PD + TiKV (1-node benchmarks). + # node_count >= 2: SHARED raft cluster — all PDs join one raft group, + # all TiKVs point to all PDs. max-replicas=1 so each + # region lives on exactly one store; PD routes reads + # to whichever store has the region. + local node_count="${1:-${#TIKV_HOSTS[@]}}" + echo "" + if [ "$node_count" -le 1 ]; then + echo "=== Starting 1 standalone TiKV instance ===" + else + echo "=== Starting $node_count-node SHARED TiKV raft cluster ===" + fi + + # Ensure binaries are present on every host that will run a container. + # Cheap if already there (version-grep, no download). + local i_host + for (( i_host=0; i_host/dev/null | tr -d '[:space:]') + fi + if [ "$present" != "yes" ]; then + echo " → $h: binaries missing, running setup-bins" + setup_bins_one_host "$h" + fi + done + + # Build the initial-cluster string used by every PD. + # For 1-node it's a single-member raft; for N>=2 every PD lists all members. + local initial_cluster="" + for (( i=0; i= 2 they form a raft group. + echo "Starting PD instances (initial-cluster=${initial_cluster})..." + for (( i=0; i/dev/null; \ + docker run -d --name sptag-pd-$i --net host \ + -v $DATA_DIR/tikv-data/pd-$i:/data \ + -v ${BIN_DIR}:/sptag-bin:ro \ + --entrypoint /sptag-bin/pd-server \ + ${PD_IMAGE} \ + --name=${pd_name} \ + --data-dir=/data \ + --client-urls=http://0.0.0.0:${client_port} \ + --advertise-client-urls=http://${host}:${client_port} \ + --peer-urls=http://0.0.0.0:${peer_port} \ + --advertise-peer-urls=http://${host}:${peer_port} \ + --initial-cluster=${initial_cluster}" + done + + echo "Waiting for PD raft to form..." + sleep 5 + + # Wait until every PD reports the expected member count (raft quorum up). + for (( i=0; i/dev/null \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(len(d.get('members',[])))" 2>/dev/null || echo 0) + if [ "$members" -ge "$node_count" ]; then + echo " PD $i ($host:$pd_port) healthy (members=${members})" + break + fi + if [ "$attempt" -eq 60 ]; then + echo " ERROR: PD $i ($host:$pd_port) only sees ${members}/${node_count} members after 60s" + return 1 + fi + sleep 1 + done + done + + # NOTE: max-replicas is configured AFTER TiKV starts (see below). Setting + # placement rules requires cluster bootstrap, which only happens once a + # TiKV store joins. Before bootstrap, /pd/api/v1/config/rule returns 500 + # ErrNotBootstrapped. We rely on the fact that no data is written until + # SPTAGTest connects (which happens after this function returns), so the + # brief window where bootstrap uses default max-replicas=3 is harmless. + + # Start TiKV instances pointing at the shared PD endpoints. + echo "Starting TiKV instances (pd-endpoints=${pd_endpoints})..." + for (( i=0; i/dev/null; \ + docker run -d --name sptag-tikv-$i --net host \ + --ulimit nofile=1048576:1048576 \ + -v $DATA_DIR/tikv-data/tikv-$i:/data \ + -v $DATA_DIR/tikv-data/conf:/conf \ + -v ${BIN_DIR}:/sptag-bin:ro \ + --entrypoint /sptag-bin/tikv-server \ + ${TIKV_IMAGE} \ + --config=/conf/tikv.toml \ + --addr=0.0.0.0:${tikv_port} \ + --advertise-addr=${host}:${tikv_port} \ + --data-dir=/data \ + --pd-endpoints=${pd_endpoints}" + done + + echo "Waiting for TiKV stores to register..." + sleep 5 + + # All stores show up in PD's store list (any PD works — they share state). + local pd_host="${TIKV_HOSTS[0]}" + local pd_port_first="${TIKV_PD_CLIENT_PORTS[0]}" + for attempt in $(seq 1 60); do + local store_count + store_count=$(curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/stores" 2>/dev/null \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('count',0))" 2>/dev/null || echo 0) + if [ "$store_count" -ge "$node_count" ]; then + echo " All ${store_count} TiKV stores registered" + break + fi + if [ "$attempt" -eq 60 ]; then + echo " WARNING: only ${store_count}/${node_count} TiKV stores registered after 60s" + fi + sleep 1 + done + + # Set max-replicas=1 on the shared cluster, NOW that cluster is bootstrapped. + # + # PD v6+ defaults to enable-placement-rules=true. The authoritative source + # for replica count is then the default placement rule, NOT the legacy + # max-replicas config. /config POST auto-syncs to the rule but is racy; + # we explicitly POST the rule too. Both endpoints require bootstrap. + # Bug seen v45: skipping this caused 30%+ of a 1-node run to execute with + # max-replicas=3 → PD endlessly tried to schedule replicas onto 1 store + # → constant region state changes → gRPC Deadline / region_error storm. + echo "Setting max-replicas=1 (default placement rule)..." + local target_replicas=1 + local mr_ok=0 + for attempt in $(seq 1 30); do + curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/config" \ + -X POST -d "{\"max-replicas\": ${target_replicas}}" >/dev/null 2>&1 || true + curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/config/rule" \ + -X POST -d "{\"group_id\":\"pd\",\"id\":\"default\",\"start_key\":\"\",\"end_key\":\"\",\"role\":\"voter\",\"count\":${target_replicas}}" \ + >/dev/null 2>&1 || true + sleep 1 + local got_cfg + got_cfg=$(curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/config/replicate" 2>/dev/null \ + | python3 -c 'import sys,json;print(json.load(sys.stdin).get("max-replicas"))' 2>/dev/null) + local got_rule + got_rule=$(curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/config/rule/pd/default" 2>/dev/null \ + | python3 -c 'import sys,json;print(json.load(sys.stdin).get("count"))' 2>/dev/null) + if [ "$got_cfg" = "$target_replicas" ] && [ "$got_rule" = "$target_replicas" ]; then + echo " max-replicas=${target_replicas} set (attempt $attempt, config & rule verified)" + mr_ok=1 + break + fi + sleep 1 + done + if [ "$mr_ok" != "1" ]; then + echo " ERROR: Failed to set max-replicas=${target_replicas} after 30 attempts. Aborting." >&2 + return 1 + fi + + echo "TiKV cluster started ($node_count node(s))." +} + +tikv_stop() { + # Stop the first TiKV+PD instances. + local node_count="${1:-${#TIKV_HOSTS[@]}}" + echo "" + echo "=== Stopping $node_count TiKV instances ===" + + for (( i=0; i/dev/null || true" + done + + echo "TiKV instances stopped." +} + +tikv_switch_to_nocache() { + # Restart TiKV containers (NOT PD) with the nocache config, so that the search + # and insert phases use cold block cache. Data on disk is preserved because we + # reuse the same data-dir; PD keeps the cluster metadata. + local node_count="${1:-${#TIKV_HOSTS[@]}}" + if [[ ! -f "$SCRIPT_DIR/configs/tikv_nocache.toml" ]]; then + echo " ERROR: configs/tikv_nocache.toml not found; cannot switch to nocache" + return 1 + fi + echo "" + echo "=== Restarting $node_count TiKV instances with tikv_nocache.toml ===" + + # Reconstruct the shared pd-endpoints list (same as tikv_start). + local pd_endpoints="" + for (( i=0; i/dev/null; \ + docker rm -f sptag-tikv-$i 2>/dev/null; \ + docker run -d --name sptag-tikv-$i --net host \ + --ulimit nofile=1048576:1048576 \ + -v $DATA_DIR/tikv-data/tikv-$i:/data \ + -v $DATA_DIR/tikv-data/conf:/conf \ + -v ${BIN_DIR}:/sptag-bin:ro \ + --entrypoint /sptag-bin/tikv-server \ + ${TIKV_IMAGE} \ + --config=/conf/tikv.toml \ + --addr=0.0.0.0:${tikv_port} \ + --advertise-addr=${host}:${tikv_port} \ + --data-dir=/data \ + --pd-endpoints=${pd_endpoints}" + done + + echo "Waiting for TiKV stores to re-register..." + sleep 5 + local pd_host_first="${TIKV_HOSTS[0]}" + local pd_port_first="${TIKV_PD_CLIENT_PORTS[0]}" + for attempt in $(seq 1 60); do + local store_count + store_count=$(curl -sf "http://${pd_host_first}:${pd_port_first}/pd/api/v1/stores" 2>/dev/null \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('count',0))" 2>/dev/null || echo 0) + if [ "$store_count" -ge "$node_count" ]; then + echo " All ${store_count} TiKV stores re-registered" + break + fi + if [ "$attempt" -eq 60 ]; then + echo " WARNING: only ${store_count}/${node_count} stores re-registered after 60s" + fi + sleep 1 + done + echo "TiKV switched to nocache mode." +} + +tikv_clean() { + # Clean TiKV data for the first instances. + local node_count="${1:-${#TIKV_HOSTS[@]}}" + echo "" + echo "=== Cleaning TiKV data ($node_count instances) ===" + + for (( i=0; i/dev/null || true" + done +} + +# Legacy wrappers for the main case block +cmd_start_tikv() { tikv_start "${1:-${#TIKV_HOSTS[@]}}"; } +cmd_stop_tikv() { tikv_stop "${1:-${#TIKV_HOSTS[@]}}"; } + +# ─── Cache Management ─── + +drop_all_caches() { + # Drop OS page cache + dentries/inodes on the first nodes. + # This may take 30-60s per node if there are many dirty pages. + local node_count="${1:-1}" + if [[ "${SKIP_DROP_CACHES:-0}" == "1" ]]; then + echo "[SKIP_DROP_CACHES=1] skipping OS page-cache drop on $node_count node(s)" + return 0 + fi + echo "Dropping OS page cache on $node_count node(s) (timeout 10s per node)..." + for (( i=0; i /proc/sys/vm/drop_caches'" && echo "done" || echo "timeout/failed (non-fatal)" + done + echo "Cache drop complete." +} + +# ─── INI Generation ─── + +generate_ini() { + # Generate a benchmark INI from a template, filling in [Distributed] fields. + # Usage: generate_ini [overrides...] + local SCALE="$1" + local NODE_COUNT="$2" + shift 2 + + local IDX_PATH="$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index" + local KEY_PREFIX="bench${SCALE}_${NODE_COUNT}node" + + # Build comma-separated address lists from the first node_count entries + local dispatcher_addr="${NODE_HOSTS[0]}:30001" + local worker_addrs="" store_addrs="" pd_addrs="" + for (( i=0; i&2 + return 1 + fi + + local OUT="$SCRIPT_DIR/configs/benchmark_${SCALE}_${NODE_COUNT}node.ini" + cp "$BASE_INI" "$OUT" + + # Fill in placeholder fields + sed -i "s|^IndexPath=.*|IndexPath=${IDX_PATH}|" "$OUT" + sed -i "s|^TiKVKeyPrefix=.*|TiKVKeyPrefix=${KEY_PREFIX}|" "$OUT" + sed -i "s|^DispatcherAddr=.*|DispatcherAddr=${dispatcher_addr}|" "$OUT" + sed -i "s|^WorkerAddrs=.*|WorkerAddrs=${worker_addrs}|" "$OUT" + sed -i "s|^StoreAddrs=.*|StoreAddrs=${store_addrs}|" "$OUT" + sed -i "s|^PDAddrs=.*|PDAddrs=${pd_addrs}|" "$OUT" + + # Apply extra overrides (key=value pairs) + for override in "$@"; do + local key="${override%%=*}" + local val="${override#*=}" + if grep -q "^${key}=" "$OUT"; then + sed -i "s|^${key}=.*|${key}=${val}|" "$OUT" + else + # Append to [Benchmark] section + sed -i "/^\[Benchmark\]/a ${key}=${val}" "$OUT" + fi + done + + echo "$OUT" +} + +# ─── Worker Management ─── + +WORKER_SSH_PIDS=() + +start_remote_worker() { + # Start a worker on a remote node. Returns immediately; worker runs in background. + local NODE_IDX="$1" + local INI="$2" + local SCALE="$3" + local NODE_COUNT="$4" + local host="${NODE_HOSTS[$NODE_IDX]}" + local LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_worker${NODE_IDX}.log" + + # Copy INI + binary to remote + remote_sync "$host" "$INI" "$SPTAG_DIR/worker_n${NODE_IDX}.ini" + + # Start worker via SSH (foreground on remote, background locally). + # Use `ssh -n` to redirect stdin from /dev/null so SSH doesn't try to + # acquire a TTY when the parent script runs under `nohup`. Without -n, + # the SSH client sometimes silently re-points fd1 → /dev/null and fd2 + # → a deleted /tmp file, dropping the worker log. + ssh -n $(_ssh_opts) "$SSH_USER@$host" \ + "cd $SPTAG_DIR && LD_LIBRARY_PATH=$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH:-} \ + WORKER_INDEX=${NODE_IDX} BENCHMARK_CONFIG=worker_n${NODE_IDX}.ini \ + SPFRESH_SHARD_STRIDE=${SPFRESH_SHARD_STRIDE:-0} \ + ./Release/SPTAGTest --run_test=SPFreshTest/BenchmarkFromConfig 2>&1" \ + "$LOG" 2>&1 & + local ssh_pid=$! + WORKER_SSH_PIDS+=($ssh_pid) + echo " Worker n${NODE_IDX} on $host (SSH PID: $ssh_pid, log: $LOG)" +} + +wait_workers_ready() { + local SCALE="$1" + local NODE_COUNT="$2" + local TIMEOUT=120 + + echo "Waiting for ${#WORKER_SSH_PIDS[@]} workers to be ready..." + for attempt in $(seq 1 $TIMEOUT); do + local all_ready=true + for i in $(seq 1 $((NODE_COUNT - 1))); do + local LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_worker${i}.log" + if ! grep -q "Worker.*[Rr]eady\|Waiting for dispatch" "$LOG" 2>/dev/null; then + all_ready=false + fi + done + if $all_ready; then + echo " All workers ready (${attempt}s)" + return 0 + fi + # Check if any worker SSH process died + for idx in "${!WORKER_SSH_PIDS[@]}"; do + if ! kill -0 "${WORKER_SSH_PIDS[$idx]}" 2>/dev/null; then + echo " ERROR: Worker SSH PID ${WORKER_SSH_PIDS[$idx]} exited prematurely" + return 1 + fi + done + sleep 1 + done + echo " WARNING: Not all workers ready after ${TIMEOUT}s" + return 1 +} + +stop_remote_workers() { + # Wait for workers to self-exit (driver sends TCP Stop), then force-kill. + local TIMEOUT=${1:-30} + if [ ${#WORKER_SSH_PIDS[@]} -eq 0 ]; then return; fi + + echo "Waiting for ${#WORKER_SSH_PIDS[@]} remote workers to exit (${TIMEOUT}s timeout)..." + for pid in "${WORKER_SSH_PIDS[@]}"; do + local elapsed=0 + while kill -0 "$pid" 2>/dev/null && [ $elapsed -lt $TIMEOUT ]; do + sleep 1 + elapsed=$((elapsed + 1)) + done + if kill -0 "$pid" 2>/dev/null; then + echo " WARNING: SSH PID $pid still alive, force killing" + kill -9 "$pid" 2>/dev/null || true + wait "$pid" 2>/dev/null || true + else + echo " Worker (SSH PID $pid) exited gracefully" + fi + done + WORKER_SSH_PIDS=() +} + +# Watchdog: detect driver death (segfault, OOM, SIGKILL by oom_killer, ...) +# and tear down remote workers so they don't linger forever. +# The C++ heartbeat watchdog inside the worker is the primary defense (bounded +# at HeartbeatTimeoutSec, default 180s). This shell watchdog is a faster +# secondary path: as soon as the driver PID is gone we (a) kill the local SSH +# wrappers and (b) `pkill` the remote SPTAGTest processes. +DRIVER_WATCHDOG_PID="" + +start_driver_watchdog() { + local DRIVER_PID="$1" + local NODE_COUNT="$2" + if [ "$NODE_COUNT" -lt 2 ]; then return; fi + if [ ${#WORKER_SSH_PIDS[@]} -eq 0 ]; then return; fi + + # Snapshot what we need before backgrounding (subshell forks current env). + local _ssh_pids="${WORKER_SSH_PIDS[*]}" + local _hosts=() + for (( i=1; i/dev/null; do + sleep 5 + done + echo "[watchdog] Driver PID $DRIVER_PID is gone; tearing down remote workers" >&2 + for pid in $_ssh_pids; do + kill -TERM "$pid" 2>/dev/null || true + done + for host in $_hosts_str; do + ssh -n $_ssh_opts_str "$_ssh_user@$host" \ + "pkill -TERM -f 'SPTAGTest.*BenchmarkFromConfig' 2>/dev/null; \ + sleep 5; \ + pkill -KILL -f 'SPTAGTest.*BenchmarkFromConfig' 2>/dev/null; true" \ + /dev/null 2>&1 || true + done + for pid in $_ssh_pids; do + kill -0 "$pid" 2>/dev/null && kill -KILL "$pid" 2>/dev/null || true + done + ) & + DRIVER_WATCHDOG_PID=$! + echo " Driver watchdog started (PID: $DRIVER_WATCHDOG_PID, monitoring driver $DRIVER_PID)" +} + +stop_driver_watchdog() { + if [ -n "$DRIVER_WATCHDOG_PID" ] && kill -0 "$DRIVER_WATCHDOG_PID" 2>/dev/null; then + kill -TERM "$DRIVER_WATCHDOG_PID" 2>/dev/null || true + wait "$DRIVER_WATCHDOG_PID" 2>/dev/null || true + fi + DRIVER_WATCHDOG_PID="" +} + +# ─── Benchmark Run ─── + +distribute_head_index() { + # Copy the head index from driver to all worker nodes. + local SCALE="$1" + local NODE_COUNT="$2" + local SRC="$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index" + + echo "Distributing head index to $((NODE_COUNT - 1)) workers..." + for (( i=1; i +resolve_build_mode() { + local SCALE="$1" NODE_COUNT="$2" + local IDX_DIR="$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index" + local HEAD_DIR="$IDX_DIR/HeadIndex" + + BUILD_MODE_OVERRIDES=() + if [[ "${SKIP_HEAD_BUILD:-0}" == "1" ]] && [ -d "$HEAD_DIR" ] && [ -n "$(ls -A "$HEAD_DIR" 2>/dev/null)" ]; then + echo "HeadIndex found at $HEAD_DIR — using RebuildSSDOnly (skip SelectHead+BuildHead)" + BUILD_MODE_OVERRIDES=("RebuildSSDOnly=true") + else + if [[ "${SKIP_HEAD_BUILD:-0}" == "1" ]]; then + echo "SKIP_HEAD_BUILD=1 but HeadIndex not found at $HEAD_DIR — falling back to full build" + fi + BUILD_MODE_OVERRIDES=("Rebuild=true") + fi +} + +cmd_run() { + local SCALE="$1" + local NODE_COUNT="$2" + if [ -z "$SCALE" ] || [ -z "$NODE_COUNT" ]; then + echo "Usage: $0 run " + exit 1 + fi + + local BINARY="$SPTAG_DIR/Release/SPTAGTest" + + echo "" + echo "═══════════════════════════════════════════════════" + echo " ${SCALE}: ${NODE_COUNT}-node benchmark${NOCACHE:+ [NOCACHE]}" + echo " Start: $(date)" + echo "═══════════════════════════════════════════════════" + + if [ "$NODE_COUNT" -eq 1 ]; then + # ─── Single-node flow ─── + echo "" + echo "--- Phase 0: Prepare TiKV (1 instance) ---" + tikv_stop 1 + tikv_clean 1 + if ! tikv_start 1; then + echo "ERROR: tikv_start failed; aborting benchmark." >&2 + return 1 + fi + + # Resolve build mode before cleaning (SKIP_HEAD_BUILD needs existing dir) + resolve_build_mode "$SCALE" "$NODE_COUNT" + + if [[ " ${BUILD_MODE_OVERRIDES[*]} " != *"RebuildSSDOnly=true"* ]]; then + # Full build: clean old index dir + rm -rf "$DATA_DIR/proidx_${SCALE}_1node" + fi + mkdir -p "$DATA_DIR/proidx_${SCALE}_1node" + + if [[ "${NOCACHE:-0}" == "1" ]]; then + # NOCACHE: Split into build + cache-drop + search + local BUILD_VERSIONCACHE_OVERRIDES=("VersionCacheTTLMs=0" "VersionCacheMaxChunks=0") + if [[ "${BUILD_WITH_CACHE:-0}" == "1" ]]; then + # Build phase keeps caches enabled; the run phase below switches to nocache + BUILD_VERSIONCACHE_OVERRIDES=() + echo "" + echo "--- Phase 1: Build only (BUILD_WITH_CACHE=1, caches enabled) ---" + else + echo "" + echo "--- Phase 1: Build only (NOCACHE) ---" + fi + + if [[ "${SKIP_SAVE_LOAD:-0}" == "1" ]]; then + # Single-process flow: build + search + insert in one SPTAGTest invocation. + # SkipSaveLoadCycles=true bypasses the broken post-build SaveIndex and per-batch + # Load/Clone/Save. SPTAGTest itself drops OS pagecache after build, before query. + echo "[SKIP_SAVE_LOAD=1] running build + search + insert in a single SPTAGTest process" + local SINGLE_INI + SINGLE_INI=$(generate_ini "$SCALE" 1 "${BUILD_MODE_OVERRIDES[@]}" \ + "SkipSaveLoadCycles=true" "${BUILD_VERSIONCACHE_OVERRIDES[@]}") || exit 1 + + ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$SINGLE_INI" \ + BENCHMARK_OUTPUT="output_${SCALE}_1node.json" \ + "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \ + | tee "$LOGDIR/benchmark_${SCALE}_1node_driver.log" + + echo "Done: $(date)" + tikv_stop 1 + return 0 + fi + + local BUILD_INI + BUILD_INI=$(generate_ini "$SCALE" 1 "${BUILD_MODE_OVERRIDES[@]}" "BuildOnly=true" "${BUILD_VERSIONCACHE_OVERRIDES[@]}") || exit 1 + + ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$BUILD_INI" \ + BENCHMARK_OUTPUT="output_${SCALE}_1node_build.json" \ + "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \ + | tee "$LOGDIR/benchmark_${SCALE}_1node_build.log" + + echo "Build done: $(date)" + + if [[ "${BUILD_WITH_CACHE:-0}" == "1" && "${SKIP_TIKV_SWAP:-0}" != "1" ]]; then + echo "" + echo "--- Phase 1.4: Switch TiKV to nocache config ---" + tikv_switch_to_nocache 1 + elif [[ "${SKIP_TIKV_SWAP:-0}" == "1" ]]; then + echo "[SKIP_TIKV_SWAP=1] keeping TiKV containers running; relying on drop_caches + VersionCache=0" + fi + + echo "" + echo "--- Phase 1.5: Drop all caches (NOCACHE) ---" + drop_all_caches 1 + + echo "" + echo "--- Phase 2: Search+Insert (cold cache) ---" + local RUN_INI + RUN_INI=$(generate_ini "$SCALE" 1 "Rebuild=false" "VersionCacheTTLMs=0" "VersionCacheMaxChunks=0") || exit 1 + + ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$RUN_INI" \ + BENCHMARK_OUTPUT="output_${SCALE}_1node.json" \ + "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \ + | tee "$LOGDIR/benchmark_${SCALE}_1node_driver.log" + else + echo "" + echo "--- Phase 1: Single-node run ---" + local INI + INI=$(generate_ini "$SCALE" 1 "${BUILD_MODE_OVERRIDES[@]}") || exit 1 + + echo "Starting driver on ${NODE_HOSTS[0]}..." + ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$INI" \ + BENCHMARK_OUTPUT="output_${SCALE}_1node.json" \ + "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \ + | tee "$LOGDIR/benchmark_${SCALE}_1node_driver.log" + fi + + echo "Done: $(date)" + tikv_stop 1 + else + # ─── Multi-node flow ─── + echo "" + echo "--- Phase 0: Prepare TiKV ($NODE_COUNT instances) ---" + tikv_stop "$NODE_COUNT" + tikv_clean "$NODE_COUNT" + if ! tikv_start "$NODE_COUNT"; then + echo "ERROR: tikv_start failed; aborting benchmark." >&2 + return 1 + fi + + # --- Phase 1: Build index on driver --- + echo "" + echo "--- Phase 1: Build index on driver ---" + local BUILD_INI + local NOCACHE_OVERRIDES=() + local BUILD_NOCACHE_OVERRIDES=() + if [[ "${NOCACHE:-0}" == "1" ]]; then + NOCACHE_OVERRIDES=("VersionCacheTTLMs=0" "VersionCacheMaxChunks=0" "WorkerTimeout=14400") + if [[ "${BUILD_WITH_CACHE:-0}" == "1" ]]; then + # Build with cache, only run phase is nocache + BUILD_NOCACHE_OVERRIDES=() + echo "[BUILD_WITH_CACHE=1] build phase keeps caches; will switch before run phase" + else + BUILD_NOCACHE_OVERRIDES=("${NOCACHE_OVERRIDES[@]}") + fi + fi + + # Resolve build mode before cleaning (SKIP_HEAD_BUILD needs existing dir) + resolve_build_mode "$SCALE" "$NODE_COUNT" + + if [[ " ${BUILD_MODE_OVERRIDES[*]} " != *"RebuildSSDOnly=true"* ]]; then + # Full build: clean old index dirs on all nodes + for (( i=0; i "$BUILD_LOG" 2>&1 & + local BUILD_PID=$! + echo " Driver build PID: $BUILD_PID" + + # Shell-side watchdog: if the driver dies unexpectedly (segfault, OOM, + # SIGKILL) we want a fast failure path rather than hanging forever. + WORKER_SSH_PIDS=() + start_driver_watchdog "$BUILD_PID" "$NODE_COUNT" + + # Wait for the driver build to finish + echo " Waiting for driver build to complete..." + wait "$BUILD_PID" + local BUILD_RC=$? + echo "Driver build done (exit=$BUILD_RC): $(date)" + stop_driver_watchdog + + if [[ $BUILD_RC -ne 0 ]] || grep -q "===== SEGFAULT" "$BUILD_LOG"; then + echo "" + echo "ERROR: Build phase failed (exit=$BUILD_RC, segfault=$(grep -c '===== SEGFAULT' "$BUILD_LOG"))" + echo "Refusing to proceed to run phase with broken build state." + echo "Tail of build log:" + tail -30 "$BUILD_LOG" + tikv_stop "$NODE_COUNT" + exit 1 + fi + + echo "Build done: $(date)" + + # --- Phase 2: Distribute data --- + echo "" + echo "--- Phase 2: Distribute head index + data ---" + rm -f "$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index/checkpoint.txt" + + distribute_head_index "$SCALE" "$NODE_COUNT" + distribute_perftest_files "$NODE_COUNT" + + # Sync SPTAGTest binary + bundled runtime libs to all workers so + # they pick up the latest compiled changes. (cmd_deploy is a separate + # subcommand; without this step a stale binary on the worker silently + # diverges from the driver.) + echo "" + echo "Syncing SPTAGTest binary + runtime_libs to workers..." + for host in "${NODE_HOSTS[@]}"; do + if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi + remote_exec "$host" "mkdir -p $SPTAG_DIR/Release" + remote_sync "$host" "$SPTAG_DIR/Release/SPTAGTest" "$SPTAG_DIR/Release/SPTAGTest" + if [ -d "$SPTAG_DIR/Release/runtime_libs" ]; then + remote_exec "$host" "mkdir -p $SPTAG_DIR/Release/runtime_libs" + rsync -az -e "ssh $(_ssh_opts)" \ + "$SPTAG_DIR/Release/runtime_libs/" \ + "$SSH_USER@$host:$SPTAG_DIR/Release/runtime_libs/" + fi + done + + # Binary already pushed; nothing else to do here. + + # --- Phase 3: Start driver first (contains dispatcher), then workers --- + echo "" + + # Drop caches if NOCACHE mode + if [[ "${NOCACHE:-0}" == "1" ]]; then + if [[ "${BUILD_WITH_CACHE:-0}" == "1" && "${SKIP_TIKV_SWAP:-0}" != "1" ]]; then + echo "--- Phase 2.4: Switch TiKV to nocache config ---" + tikv_switch_to_nocache "$NODE_COUNT" + elif [[ "${SKIP_TIKV_SWAP:-0}" == "1" ]]; then + echo "[SKIP_TIKV_SWAP=1] keeping TiKV containers running; relying on drop_caches + VersionCache=0" + fi + echo "--- Phase 2.5: Drop all caches (NOCACHE) ---" + drop_all_caches "$NODE_COUNT" + fi + + echo "--- Phase 3: Distributed run ---" + + local RUN_INI + RUN_INI=$(generate_ini "$SCALE" "$NODE_COUNT" "Rebuild=false" "${NOCACHE_OVERRIDES[@]}") || exit 1 + + # Start driver in background first — it contains the dispatcher that + # workers need to connect to for ring registration. + local DRIVER_LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_driver.log" + echo "Starting driver (dispatcher+worker0) on ${NODE_HOSTS[0]}..." + ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$RUN_INI" \ + BENCHMARK_OUTPUT="output_${SCALE}_${NODE_COUNT}node.json" \ + "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig ) \ + > "$DRIVER_LOG" 2>&1 & + local DRIVER_PID=$! + echo " Driver PID: $DRIVER_PID" + + # Wait for dispatcher to start listening before launching workers + local DISP_PORT=30001 + echo " Waiting for dispatcher to listen on port $DISP_PORT..." + for attempt in $(seq 1 60); do + if ss -tlnp 2>/dev/null | grep -q ":${DISP_PORT} " || \ + netstat -tlnp 2>/dev/null | grep -q ":${DISP_PORT} "; then + echo " Dispatcher listening (${attempt}s)" + break + fi + if ! kill -0 "$DRIVER_PID" 2>/dev/null; then + echo " ERROR: Driver exited prematurely" + cat "$DRIVER_LOG" + return 1 + fi + if [ "$attempt" -eq 60 ]; then + echo " WARNING: Dispatcher not detected on port $DISP_PORT after 60s, proceeding anyway" + fi + sleep 1 + done + + # Now start remote workers — they can connect to the dispatcher + WORKER_SSH_PIDS=() + for (( i=1; i/dev/null || true + done + + tikv_stop "$NODE_COUNT" + fi + + echo "" + echo "═══════════════════════════════════════════════════" + echo " ${SCALE} ${NODE_COUNT}-node done: $(date)" + echo " Results: output_${SCALE}_${NODE_COUNT}node.json" + echo " Logs: $LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_*.log" + echo "═══════════════════════════════════════════════════" +} + +cmd_bench() { + # Run 1-node baseline + N-node distributed for each specified scale. + # Usage: cmd_bench [scale...] + # Special scale "all" expands to all scales with templates in configs/. + local scales=() + for arg in "$@"; do + if [ "$arg" = "all" ]; then + for tmpl in "$SCRIPT_DIR"/configs/benchmark_*_template.ini; do + local name + name="$(basename "$tmpl")" + name="${name#benchmark_}" + name="${name%_template.ini}" + scales+=("$name") + done + else + scales+=("$arg") + fi + done + + if [ ${#scales[@]} -eq 0 ]; then + echo "Usage: $0 bench [scale...] | all" + echo "Available scales:" + for tmpl in "$SCRIPT_DIR"/configs/benchmark_*_template.ini; do + local name + name="$(basename "$tmpl")" + name="${name#benchmark_}" + name="${name%_template.ini}" + echo " $name" + done + exit 1 + fi + + echo "" + echo "═══════════════════════════════════════════════════" + echo " Benchmark suite: ${scales[*]}" + echo " Cluster: $TOTAL_NODES nodes" + echo " Start: $(date)" + echo "═══════════════════════════════════════════════════" + + for scale in "${scales[@]}"; do + echo "" + echo "▶▶▶ Scale: $scale — 1-node baseline" + cmd_run "$scale" 1 + + if [ "$TOTAL_NODES" -gt 1 ]; then + echo "" + echo "▶▶▶ Scale: $scale — ${TOTAL_NODES}-node distributed" + cmd_run "$scale" "$TOTAL_NODES" + else + echo " (Skipping multi-node: cluster has only 1 node)" + fi + done + + echo "" + echo "═══════════════════════════════════════════════════" + echo " Benchmark suite complete: $(date)" + echo "═══════════════════════════════════════════════════" +} + +# ─── Cleanup ─── + +cmd_cleanup() { + echo "" + echo "=== Cleaning up remote nodes ===" + + for i in $(seq 1 $((${#NODE_HOSTS[@]} - 1))); do + local host="${NODE_HOSTS[$i]}" + echo " Cleaning $host..." + remote_exec "$host" "rm -rf $SPTAG_DIR/Release/SPTAGTest $SPTAG_DIR/perftest_* $SPTAG_DIR/worker_*.ini" + # Clean index directories + remote_exec "$host" "rm -rf $DATA_DIR/proidx_*" + done + echo "Cleanup complete." +} + +# ─── Main ─── + +CMD="$1" +CONF="$2" + +if [ -z "$CMD" ] || [ -z "$CONF" ]; then + echo "Usage: $0 [args...]" + echo "" + echo "Commands:" + echo " deploy Deploy binary and data to all nodes" + echo " start-tikv Start independent TiKV/PD instances" + echo " stop-tikv Stop TiKV/PD instances" + echo " run Run benchmark: $0 run cluster.conf " + echo " bench Run full benchmark suite: $0 bench cluster.conf [scale...] | all" + echo " cleanup Remove deployed files from remote nodes" + exit 1 +fi + +parse_config "$CONF" + +# Trap for cleanup on interrupt +trap 'echo ""; echo "Interrupted!"; stop_driver_watchdog; stop_remote_workers 5; cmd_stop_tikv; exit 1' INT TERM + +case "$CMD" in + deploy) + cmd_deploy + ;; + setup-bins) + cmd_setup_bins + ;; + start-tikv) + cmd_start_tikv "${3:-}" + ;; + stop-tikv) + cmd_stop_tikv "${3:-}" + ;; + run) + cmd_run "$3" "$4" + ;; + bench) + shift 2 # skip cmd and conf + cmd_bench "$@" + ;; + cleanup) + cmd_cleanup + ;; + *) + echo "Unknown command: $CMD" + echo "Valid commands: deploy, setup-bins, start-tikv, stop-tikv, run, bench, cleanup" + exit 1 + ;; +esac From 418674711afefef9a7548136618940061343f0de Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 07:21:04 +0000 Subject: [PATCH 02/48] Fix unneede diff --- .gitignore | 3 +- Test/src/main.cpp | 5 +- benchmark.ini | 19 ----- .../configs/benchmark_100m_1node.ini | 71 ------------------- .../configs/benchmark_100m_2node.ini | 71 ------------------- .../configs/benchmark_10m_1node.ini | 62 ---------------- .../configs/benchmark_10m_2node.ini | 62 ---------------- .../benchmark_insert_dominant_1node.ini | 58 --------------- .../benchmark_insert_dominant_2node.ini | 58 --------------- .../benchmark_insert_dominant_3node.ini | 59 --------------- 10 files changed, 5 insertions(+), 463 deletions(-) delete mode 100644 benchmark.ini delete mode 100644 evaluation/distributed/configs/benchmark_100m_1node.ini delete mode 100644 evaluation/distributed/configs/benchmark_100m_2node.ini delete mode 100644 evaluation/distributed/configs/benchmark_10m_1node.ini delete mode 100644 evaluation/distributed/configs/benchmark_10m_2node.ini delete mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_1node.ini delete mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_2node.ini delete mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_3node.ini diff --git a/.gitignore b/.gitignore index e3dc9796a..190ca29d3 100644 --- a/.gitignore +++ b/.gitignore @@ -464,5 +464,4 @@ FodyWeavers.xsd *.sln.iml # SPTAG benchmark generated artifacts -/perftest_* -/evaluation/2026-04-23/output_distributed_hostname_*.json +*perftest_* diff --git a/Test/src/main.cpp b/Test/src/main.cpp index ab8d1342c..49ca39950 100644 --- a/Test/src/main.cpp +++ b/Test/src/main.cpp @@ -7,7 +7,9 @@ #include #include +#ifdef TIKV #include +#endif using namespace boost::unit_test; @@ -36,8 +38,9 @@ struct GlobalFixture // adds GraphCycles bookkeeping under a global spinlock on every Lock(); // observed to consume ~12% CPU under high worker-thread parallelism in // gRPC client paths (perf-recorded 2026-05-06). +#ifdef TIKV absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); - +#endif SPTAGVisitor visitor; traverse_test_tree(framework::master_test_suite(), visitor, false); } diff --git a/benchmark.ini b/benchmark.ini deleted file mode 100644 index e2b400767..000000000 --- a/benchmark.ini +++ /dev/null @@ -1,19 +0,0 @@ -[Benchmark] -VectorPath=sift1b/base.100M.u8bin -QueryPath=sift1b/query.public.10K.u8bin -TruthPath=none -IndexPath=proidx/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=10000 -InsertVectorCount=10000 -DeleteVectorCount=0 -BatchNum=10 -TopK=5 -NumThreads=8 -NumQueries=100 -DistMethod=L2 -Rebuild=true -Resume=-1 -QuantizerFilePath=quantizer.bin -QuantizedDim=64 diff --git a/evaluation/distributed/configs/benchmark_100m_1node.ini b/evaluation/distributed/configs/benchmark_100m_1node.ini deleted file mode 100644 index 42ec07f49..000000000 --- a/evaluation/distributed/configs/benchmark_100m_1node.ini +++ /dev/null @@ -1,71 +0,0 @@ -; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload). -; 100× larger base index than insert_dominant. Tests how the system behaves when -; the head index is large (~tens of millions of heads on layer 0) and the insert -; rate is moderate. Layers=2, L2 distance, SIFT1B dataset. -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). -; -; Notes for 100M-scale operation: -; - First run MUST build the index (Rebuild=true). Build of 99M base takes hours; -; reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the -; HeadIndex on disk is intact. -; - Truth (top-5 over 99M) is recomputed at start each run; expect ~minutes. -; - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts; -; use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle. -; - TiKV data will grow to ~50-100GB per store at this scale; both nodes need -; plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G). -[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin -QueryPath=/mnt/nvme/sift1b/query.10K.u8bin -TruthPath=truth -IndexPath=/mnt/nvme/proidx_100m_1node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=99000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=true -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=bench100m_1node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=10000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=1000000 -AsyncRpcMaxInflight=512 - -[Distributed] -Enabled=true -DispatcherAddr=10.11.0.7:30001 -WorkerAddrs=10.11.0.7:30011 -StoreAddrs=10.11.0.7:20171 -PDAddrs=10.11.0.7:23791 diff --git a/evaluation/distributed/configs/benchmark_100m_2node.ini b/evaluation/distributed/configs/benchmark_100m_2node.ini deleted file mode 100644 index 01b9c3e81..000000000 --- a/evaluation/distributed/configs/benchmark_100m_2node.ini +++ /dev/null @@ -1,71 +0,0 @@ -; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload). -; 100× larger base index than insert_dominant. Tests how the system behaves when -; the head index is large (~tens of millions of heads on layer 0) and the insert -; rate is moderate. Layers=2, L2 distance, SIFT1B dataset. -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). -; -; Notes for 100M-scale operation: -; - First run MUST build the index (Rebuild=true). Build of 99M base takes hours; -; reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the -; HeadIndex on disk is intact. -; - Truth (top-5 over 99M) is recomputed at start each run; expect ~minutes. -; - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts; -; use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle. -; - TiKV data will grow to ~50-100GB per store at this scale; both nodes need -; plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G). -[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin -QueryPath=/mnt/nvme/sift1b/query.10K.u8bin -TruthPath=truth -IndexPath=/mnt/nvme/proidx_100m_2node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=99000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=false -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=bench100m_2node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=10000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=1000000 -AsyncRpcMaxInflight=512 - -[Distributed] -Enabled=true -DispatcherAddr=10.11.0.7:30001 -WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002 -StoreAddrs=10.11.0.7:20171,10.11.0.10:20171 -PDAddrs=10.11.0.7:23791,10.11.0.10:23791 diff --git a/evaluation/distributed/configs/benchmark_10m_1node.ini b/evaluation/distributed/configs/benchmark_10m_1node.ini deleted file mode 100644 index 56dbd9088..000000000 --- a/evaluation/distributed/configs/benchmark_10m_1node.ini +++ /dev/null @@ -1,62 +0,0 @@ -; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload). -; 10× larger base index than insert_dominant, 10× smaller than 100m. -; Useful for validating scaling between 1M and 100M without paying the -; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset -; (truncated to 10M of the 1B available). -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). -[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin -QueryPath=/mnt/nvme/sift1b/query.10K.u8bin -TruthPath=truth -IndexPath=/mnt/nvme/proidx_10m_1node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=9000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=true -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=bench10m_1node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=1000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=1000000 -AsyncRpcMaxInflight=512 - -[Distributed] -Enabled=true -DispatcherAddr=10.11.0.7:30001 -WorkerAddrs=10.11.0.7:30011 -StoreAddrs=10.11.0.7:20171 -PDAddrs=10.11.0.7:23791 diff --git a/evaluation/distributed/configs/benchmark_10m_2node.ini b/evaluation/distributed/configs/benchmark_10m_2node.ini deleted file mode 100644 index 4ed317ac3..000000000 --- a/evaluation/distributed/configs/benchmark_10m_2node.ini +++ /dev/null @@ -1,62 +0,0 @@ -; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload). -; 10× larger base index than insert_dominant, 10× smaller than 100m. -; Useful for validating scaling between 1M and 100M without paying the -; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset -; (truncated to 10M of the 1B available). -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). -[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin -QueryPath=/mnt/nvme/sift1b/query.10K.u8bin -TruthPath=truth -IndexPath=/mnt/nvme/proidx_10m_2node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=9000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=false -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=bench10m_2node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=1000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=1000000 -AsyncRpcMaxInflight=512 - -[Distributed] -Enabled=true -DispatcherAddr=10.11.0.7:30001 -WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002 -StoreAddrs=10.11.0.7:20171,10.11.0.10:20171 -PDAddrs=10.11.0.7:23791,10.11.0.10:23791 diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini deleted file mode 100644 index 30fe77bbe..000000000 --- a/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini +++ /dev/null @@ -1,58 +0,0 @@ -; insert_dominant: 1M base + 1M insert with concurrent search-during-insert. -; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M). -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). -[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin -QueryPath=/mnt/nvme/sift1b/query.10K.u8bin -TruthPath=truth -IndexPath=/mnt/nvme/proidx_insert_dominant_1node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=1000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=true -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=benchinsert_dominant_1node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=1000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=100000 - -[Distributed] -Enabled=true -DispatcherAddr=10.11.0.7:30001 -WorkerAddrs=10.11.0.7:30011 -StoreAddrs=10.11.0.7:20171 -PDAddrs=10.11.0.7:23791 diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini deleted file mode 100644 index d45870b50..000000000 --- a/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini +++ /dev/null @@ -1,58 +0,0 @@ -; insert_dominant: 1M base + 1M insert with concurrent search-during-insert. -; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M). -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). -[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin -QueryPath=/mnt/nvme/sift1b/query.10K.u8bin -TruthPath=truth -IndexPath=/mnt/nvme/proidx_insert_dominant_2node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=1000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=false -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=benchinsert_dominant_2node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=1000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=100000 - -[Distributed] -Enabled=true -DispatcherAddr=10.11.0.7:30001 -WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002 -StoreAddrs=10.11.0.7:20171,10.11.0.10:20171 -PDAddrs=10.11.0.7:23791,10.11.0.10:23791 diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini deleted file mode 100644 index a8050732d..000000000 --- a/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini +++ /dev/null @@ -1,59 +0,0 @@ -; insert_dominant: 1M base + 10M insert (10× scale-up) with concurrent search-during-insert. -; Tests how the index handles insertion-dominated workloads where insertion volume -; is much larger than the initial baseline. Layers=2, L2 distance, SIFT1B dataset. -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). -[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/data/sift1b/base.1B.u8bin -QueryPath=/mnt/data/sift1b/query.public.10K.u8bin -TruthPath=truth -IndexPath=/mnt/md0/proidx_insert_dominant_3node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=1000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=false -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=benchinsert_dominant_3node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=1000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=100000 - -[Distributed] -Enabled=true -DispatcherAddr=172.27.0.4:30001 -WorkerAddrs=172.27.0.4:30011,172.27.0.5:30002,172.27.0.6:30003 -StoreAddrs=172.27.0.4:20171,172.27.0.5:20171,172.27.0.6:20171 -PDAddrs=172.27.0.4:23791,172.27.0.5:23791,172.27.0.6:23791 From ee97d3ff732f69c91c2b35158219c5f3f1873187 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 08:21:07 +0000 Subject: [PATCH 03/48] Remove unused stride-shard experiment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Strip the SPFRESH_SHARD_STRIDE opt-in code path (4 helpers + plumbing through LoadAndInsertBatch/RunBenchmark/RunWorker). No active config sets the env var; we always use the contiguous slice partition. Test/CMakeLists.txt: explicitly link ${TiKV_LIBRARIES} into SPTAGTest so a clean build (no .o cache) resolves gpr_/grpc_ symbols pulled in by the kvproto generated stubs. ThirdParty/kvproto/.gitignore: stop tracking regenerated stubs going forward — they are environment-specific (must match the protoc/grpc in the build env); regenerate locally via generate_cpp.sh. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Test/CMakeLists.txt | 2 +- Test/src/SPFreshTest.cpp | 148 ++-------------------- ThirdParty/kvproto/.gitignore | 4 + evaluation/distributed/run_distributed.sh | 1 - 4 files changed, 19 insertions(+), 136 deletions(-) create mode 100644 ThirdParty/kvproto/.gitignore diff --git a/Test/CMakeLists.txt b/Test/CMakeLists.txt index 27bdeebb5..9db640da2 100644 --- a/Test/CMakeLists.txt +++ b/Test/CMakeLists.txt @@ -24,7 +24,7 @@ if (NOT LIBRARYONLY) file(GLOB TEST_HDR_FILES ${PROJECT_SOURCE_DIR}/Test/inc/Test.h) file(GLOB TEST_SRC_FILES ${PROJECT_SOURCE_DIR}/Test/src/*.cpp) add_executable(SPTAGTest ${TEST_SRC_FILES} ${TEST_HDR_FILES}) - target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES} absl_synchronization absl_cord absl_cordz_info absl_cord_internal absl_cordz_functions absl_cordz_handle) + target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES} ${TiKV_LIBRARIES} absl_synchronization absl_cord absl_cordz_info absl_cord_internal absl_cordz_functions absl_cordz_handle) install(TARGETS SPTAGTest RUNTIME DESTINATION bin diff --git a/Test/src/SPFreshTest.cpp b/Test/src/SPFreshTest.cpp index 9ab420db9..1a2140773 100644 --- a/Test/src/SPFreshTest.cpp +++ b/Test/src/SPFreshTest.cpp @@ -62,94 +62,6 @@ static __attribute__((constructor)) void install_segfault_handler() { using namespace SPTAG; -// --------------------------------------------------------------------------- -// Stride sharding (a.k.a. odd/even sharding) experiment -// --------------------------------------------------------------------------- -// When the env var SPFRESH_SHARD_STRIDE is set to "1"/"true", each node, instead -// of inserting a contiguous slice [n*B/N, (n+1)*B/N) of the per-iteration batch, -// inserts the strided rows {n, n+N, n+2*N, ...} where n=nodeIndex, N=numNodes. -// This breaks any spatial structure in the input dataset (e.g. SIFT files that -// are roughly sorted by visual feature), letting us check whether the layer-0 -// split skew (driver 71 vs worker 2 in v18) is caused by contiguous slicing -// landing similar vectors on the same node and overflowing a small set of heads. -// -// The total number of vectors inserted across all nodes per iteration is the -// same; only the assignment changes. Recall measurement still works because -// the dataset and ground truth are unchanged — only insert routing differs. -static bool IsStrideShardEnabled() { - const char* e = std::getenv("SPFRESH_SHARD_STRIDE"); - if (!e) return false; - std::string v(e); - return v == "1" || v == "true" || v == "TRUE" || v == "yes"; -} - -// Compute count of indices i in [0, total) with (i % stride) == offset. -static SizeType StrideCount(SizeType total, int stride, int offset) { - if (stride <= 1) return total; - if (offset < 0 || offset >= stride) return 0; - if (total <= offset) return 0; - return (total - 1 - offset) / stride + 1; -} - -// Build a strided sub-VectorSet by copying every `stride`-th vector starting -// at `offset` into a contiguous packed ByteArray. Returns a BasicVectorSet. -static std::shared_ptr ExtractStridedVectors( - const std::shared_ptr& full, int stride, int offset) -{ - if (!full) return nullptr; - SizeType totalCount = full->Count(); - SizeType outCount = StrideCount(totalCount, stride, offset); - auto vt = full->GetValueType(); - auto dim = full->Dimension(); - size_t perVecSize = full->PerVectorDataSize(); - if (outCount <= 0) { - return std::make_shared(ByteArray::Alloc(0), vt, dim, 0); - } - ByteArray buf = ByteArray::Alloc(static_cast(outCount) * perVecSize); - for (SizeType i = 0; i < outCount; ++i) { - SizeType srcIdx = static_cast(offset) + i * static_cast(stride); - std::memcpy(buf.Data() + static_cast(i) * perVecSize, - full->GetVector(srcIdx), - perVecSize); - } - return std::make_shared(buf, vt, dim, outCount); -} - -// Build a strided sub-MetadataSet. Two-pass: first compute offsets, then copy. -static std::shared_ptr ExtractStridedMetadata( - const std::shared_ptr& full, int stride, int offset) -{ - if (!full) return nullptr; - SizeType totalCount = full->Count(); - SizeType outCount = StrideCount(totalCount, stride, offset); - if (outCount <= 0) { - ByteArray emptyMeta = ByteArray::Alloc(0); - ByteArray offBuf = ByteArray::Alloc(sizeof(std::uint64_t)); - *reinterpret_cast(offBuf.Data()) = 0ULL; - return std::make_shared(emptyMeta, offBuf, 0); - } - std::vector offsets(static_cast(outCount) + 1, 0ULL); - std::uint64_t total = 0; - for (SizeType i = 0; i < outCount; ++i) { - SizeType srcIdx = static_cast(offset) + i * static_cast(stride); - ByteArray meta = full->GetMetadata(srcIdx); - offsets[i] = total; - total += meta.Length(); - } - offsets[outCount] = total; - ByteArray metaBuf = ByteArray::Alloc(total > 0 ? total : 1); - for (SizeType i = 0; i < outCount; ++i) { - SizeType srcIdx = static_cast(offset) + i * static_cast(stride); - ByteArray meta = full->GetMetadata(srcIdx); - if (meta.Length() > 0) { - std::memcpy(metaBuf.Data() + offsets[i], meta.Data(), meta.Length()); - } - } - ByteArray offBuf = ByteArray::Alloc((static_cast(outCount) + 1) * sizeof(std::uint64_t)); - std::memcpy(offBuf.Data(), offsets.data(), offsets.size() * sizeof(std::uint64_t)); - return std::make_shared(metaBuf, offBuf, outCount); -} - // Helper: parse "host:port,host:port,..." into vector of pairs. static std::vector> ParseNodeAddrs(const std::string& addrStr) { std::vector> result; @@ -1098,7 +1010,6 @@ void LoadAndInsertBatch(SPANN::Index* spannIndex, const std::string& paddmetaidx, int dimension, int insertStart, int loadCount, int perNodeBatch, - bool strideShard, int numNodes, int nodeIndex, int numInsertThreads, SPANN::WorkerNode* router, std::shared_ptr quantizer, @@ -1121,14 +1032,6 @@ void LoadAndInsertBatch(SPANN::Index* spannIndex, addFloat->Count()); } auto addmetaset = TestUtils::TestDataGenerator::LoadMetadataSet(paddmeta, paddmetaidx, insertStart, loadCount); - if (strideShard) { - addset = ExtractStridedVectors(addset, numNodes, nodeIndex); - addmetaset = ExtractStridedMetadata(addmetaset, numNodes, nodeIndex); - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "%s: stride-shard batchStart=%d loadCount=%d -> kept=%d (every %d-th, offset=%d)\n", - logPrefix, insertStart, loadCount, - (int)(addset ? addset->Count() : 0), numNodes, nodeIndex); - } InsertVectors(spannIndex, numInsertThreads, perNodeBatch, addset, addmetaset, searchDuringInsertThreads, queryset, numQueries, searchK, @@ -1225,23 +1128,12 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c // Use distributed config for multi-node partitioning int nodeIndex = distCfg.workerIndex; int numNodes = distCfg.GetNumWorkers(); - bool strideShard = IsStrideShardEnabled() && numNodes > 1; - int myInsertStart, myInsertEnd, perNodeBatch; - if (strideShard) { - // Stride mode: each node loads the FULL per-iter batch then keeps rows - // where (rowIdx % numNodes) == nodeIndex. myInsertStart/End span the - // full batch; perNodeBatch is the count of strided rows. - myInsertStart = 0; - myInsertEnd = insertBatchSize; - perNodeBatch = static_cast(StrideCount(insertBatchSize, numNodes, nodeIndex)); - } else { - myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0; - myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize; - perNodeBatch = myInsertEnd - myInsertStart; - } + int myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0; + int myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize; + int perNodeBatch = myInsertEnd - myInsertStart; SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "RunBenchmark: nodeIndex=%d numNodes=%d insertBatchSize=%d myInsertStart=%d myInsertEnd=%d perNodeBatch=%d strideShard=%d\n", - nodeIndex, numNodes, insertBatchSize, myInsertStart, myInsertEnd, perNodeBatch, strideShard ? 1 : 0); + "RunBenchmark: nodeIndex=%d numNodes=%d insertBatchSize=%d myInsertStart=%d myInsertEnd=%d perNodeBatch=%d\n", + nodeIndex, numNodes, insertBatchSize, myInsertStart, myInsertEnd, perNodeBatch); // Variables to collect JSON output data std::ostringstream tmpbenchmark; @@ -1585,19 +1477,16 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c SPANN::DispatchCommand::Type::Insert, static_cast(iter)); } - // Each node inserts its partition. Default mode: contiguous slice - // [iter*batchSize + myInsertStart, +perNodeBatch). Stride mode: - // every numNodes-th row of the full batch starting at nodeIndex - // (loads full batch then filters down to perNodeBatch rows). + // Each node inserts its contiguous slice + // [iter*batchSize + myInsertStart, +perNodeBatch). int insertStart = iter * insertBatchSize + myInsertStart; - int loadCount = strideShard ? insertBatchSize : perNodeBatch; + int loadCount = perNodeBatch; { std::string driverTag = "RunBenchmark iter=" + std::to_string(iter); start = std::chrono::high_resolution_clock::now(); LoadAndInsertBatch(static_cast*>(cloneIndex.get()), paddset, paddmeta, paddmetaidx, M, insertStart, loadCount, perNodeBatch, - strideShard, numNodes, nodeIndex, numInsertThreads, workerPtr, enableQuantization ? quantizer : nullptr, numSearchDuringInsertThreads, queryset, @@ -2914,17 +2803,9 @@ void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount, int nodeIndex = distCfg.workerIndex; int numNodes = distCfg.GetNumWorkers(); int insertBatchSize = insertVectorCount / std::max(batches, 1); - bool strideShard = IsStrideShardEnabled() && numNodes > 1; - int myInsertStart, myInsertEnd, perNodeBatch; - if (strideShard) { - myInsertStart = 0; - myInsertEnd = insertBatchSize; - perNodeBatch = static_cast(StrideCount(insertBatchSize, numNodes, nodeIndex)); - } else { - myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0; - myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize; - perNodeBatch = myInsertEnd - myInsertStart; - } + int myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0; + int myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize; + int perNodeBatch = myInsertEnd - myInsertStart; BOOST_TEST_MESSAGE("Worker node " << nodeIndex << ": Loading index from " << indexPath); std::shared_ptr index; @@ -3035,16 +2916,15 @@ void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount, if (cmd.m_type == SPANN::DispatchCommand::Type::Insert) { int insertStart = cmd.m_round * insertBatchSize + myInsertStart; - int loadCount = strideShard ? insertBatchSize : perNodeBatch; - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Batch %u - inserting %d vectors (offset %d, strideShard=%d)\n", - nodeIndex, cmd.m_round + 1, perNodeBatch, insertStart, strideShard ? 1 : 0); + int loadCount = perNodeBatch; + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Batch %u - inserting %d vectors (offset %d)\n", + nodeIndex, cmd.m_round + 1, perNodeBatch, insertStart); auto t1 = std::chrono::high_resolution_clock::now(); std::string workerTag = "Worker " + std::to_string(nodeIndex) + " batch=" + std::to_string(cmd.m_round + 1); LoadAndInsertBatch(spannIndex, paddset, paddmeta, paddmetaidx, dimension, insertStart, loadCount, perNodeBatch, - strideShard, numNodes, nodeIndex, numInsertThreads, router, /*quantizer=*/nullptr, /*searchDuringInsertThreads=*/0, diff --git a/ThirdParty/kvproto/.gitignore b/ThirdParty/kvproto/.gitignore new file mode 100644 index 000000000..b2dab26f7 --- /dev/null +++ b/ThirdParty/kvproto/.gitignore @@ -0,0 +1,4 @@ +# Generated C++ stubs are environment-specific (protoc/grpc versions must +# match the gRPC libs in the build env). Each developer should regenerate +# locally via generate_cpp.sh instead of consuming the committed snapshot. +generated/ diff --git a/evaluation/distributed/run_distributed.sh b/evaluation/distributed/run_distributed.sh index c383a7eed..bb982ab7d 100755 --- a/evaluation/distributed/run_distributed.sh +++ b/evaluation/distributed/run_distributed.sh @@ -744,7 +744,6 @@ start_remote_worker() { ssh -n $(_ssh_opts) "$SSH_USER@$host" \ "cd $SPTAG_DIR && LD_LIBRARY_PATH=$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH:-} \ WORKER_INDEX=${NODE_IDX} BENCHMARK_CONFIG=worker_n${NODE_IDX}.ini \ - SPFRESH_SHARD_STRIDE=${SPFRESH_SHARD_STRIDE:-0} \ ./Release/SPTAGTest --run_test=SPFreshTest/BenchmarkFromConfig 2>&1" \ "$LOG" 2>&1 & local ssh_pid=$! From 4df704f9897ede7997e6632568f7362ebe893449 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 08:36:46 +0000 Subject: [PATCH 04/48] InsertVectors: dedupe branches, log InsertThreadNum ignore in bulk path The previous if/else duplicated the thread launch+join. Restructure to a single launch with an optional search-during-insert thread: - launch insertThreadCount workers - if benchmarking, launch one search thread in parallel - join all, then compute stats (only when search ran) Also log a clear note when the bulk router path is used: the user- supplied InsertThreadNum is unused there (driver runs one launcher thread and parallelism comes from [BuildSSDIndex] AppendThreadNum inside ExtraDynamicSearcher's append/split pool). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Test/src/SPFreshTest.cpp | 50 ++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/Test/src/SPFreshTest.cpp b/Test/src/SPFreshTest.cpp index 1a2140773..5bef228a3 100644 --- a/Test/src/SPFreshTest.cpp +++ b/Test/src/SPFreshTest.cpp @@ -661,29 +661,39 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step if (useBulk) { func = bulkFunc; insertThreadCount = 1; + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "InsertVectors: bulk path - driver launcher=1, internal parallelism comes from " + "[BuildSSDIndex] AppendThreadNum (user-supplied InsertThreadNum=%d is unused on this path)\n", + insertThreads); } else { func = perVecFunc; insertThreadCount = insertThreads; } - if (searchThreads > 0 && queryset != nullptr && numQueries != 0 && benchmarkData != nullptr) { - std::vector latencies; - std::vector results; - double searchWallSeconds = 0.0; + bool withSearch = (searchThreads > 0 && queryset != nullptr && numQueries != 0 && benchmarkData != nullptr); - for (int j = 0; j < insertThreadCount; j++) - { - threads.emplace_back(func); - } - std::thread searchThread([&]() { + for (int j = 0; j < insertThreadCount; j++) + { + threads.emplace_back(func); + } + + std::vector latencies; + std::vector results; + double searchWallSeconds = 0.0; + std::thread searchThread; + if (withSearch) { + searchThread = std::thread([&]() { searchWallSeconds = ExecutePartitionedSearch( p_index, queryset, /*myStart=*/0, numQueries, k, searchThreads, results, &latencies, /*statsOut=*/nullptr); }); - for (auto &thread : threads) - { - thread.join(); - } + } + + for (auto &thread : threads) + { + thread.join(); + } + if (withSearch) { searchThread.join(); // Calculate statistics @@ -712,17 +722,6 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step *benchmarkData << " \"minLatency\": " << minLat << ",\n"; *benchmarkData << " \"maxLatency\": " << maxLat << ",\n"; *benchmarkData << " \"qps\": " << qps << ",\n"; - } else { - // No search-during-insert path: just run the insert threads. - // (Used by worker dispatch and any caller that doesn't need stats.) - for (int j = 0; j < insertThreadCount; j++) - { - threads.emplace_back(func); - } - for (auto &thread : threads) - { - thread.join(); - } } auto barrierStart = std::chrono::high_resolution_clock::now(); size_t barrierPolls = 0; @@ -743,9 +742,6 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step } - - - template void BenchmarkQueryPerformance(std::shared_ptr &index, std::shared_ptr &queryset, std::shared_ptr &truth, const std::string &truthPath, From c27a109ac297d350521478b15bcb2e33b7e1827a Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 09:10:14 +0000 Subject: [PATCH 05/48] Restore (layers+1) multiplier in BlockController IO queue size 87160070 removed the (m_layers+1) multiplier in the SPDK BlockController queue-depth formula. The change was based on an incorrect assumption that the distributed port collapses all per-layer SPDK pools into the single shared layer-0 pool. In practice only layer 0 + the RPC receiver share a pool; every inner layer (m_layer >= 1) still creates its own SPDKThreadPool in both BuildIndex and LoadIndex. With Layers=2 (current active configs) we therefore have ~2 independent pools each running insert + reassign + append worker threads, so the peak concurrent IO-submitter count remains the qianxi-original (layers+1)*(insert+reassign+append) plus search threads. Under-sizing the BlockController queue could stall IO submission under heavy split/reassign + search load; over-sizing is harmless. Restore the multiplier to match qianxi behaviour. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- AnnService/src/Core/SPANN/ExtraFileController.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AnnService/src/Core/SPANN/ExtraFileController.cpp b/AnnService/src/Core/SPANN/ExtraFileController.cpp index b5db83822..24c839455 100644 --- a/AnnService/src/Core/SPANN/ExtraFileController.cpp +++ b/AnnService/src/Core/SPANN/ExtraFileController.cpp @@ -25,7 +25,7 @@ bool FileIO::BlockController::Initialize(SPANN::Options &p_opt, int p_layer) #ifndef _MSC_VER O_RDWR | O_DIRECT, numblocks, 2, 2, max(p_opt.m_ioThreads, (2 * max(p_opt.m_searchThreadNum, p_opt.m_iSSDNumberOfThreads) + - p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum)), + (p_opt.m_layers + 1) * (p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum))), ((std::uint64_t)p_opt.m_startFileSize) << 30 #else GENERIC_READ | GENERIC_WRITE, numblocks, 2, 2, From f3a9de98da29a208ef8eeb7311ad6c433bcfd21b Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 09:23:17 +0000 Subject: [PATCH 06/48] SetVersionBatch: bypass LRU cache, read TiKV directly All distributed runs override VersionCacheMaxChunks=0 (set by run_distributed.sh in build/run/nocache phases), so the LRU cache is effectively disabled. Using ReadChunkCached inside SetVersionBatch adds bookkeeping noise (cache hit/miss path, refresh-mutex acquire) that produces no benefit. Switch to direct ReadChunk; the dirty-byte gating still saves the WriteChunk RPC when no version byte actually changes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- AnnService/inc/Core/Common/TiKVVersionMap.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/AnnService/inc/Core/Common/TiKVVersionMap.h b/AnnService/inc/Core/Common/TiKVVersionMap.h index 69191fe1b..ff30306e8 100644 --- a/AnnService/inc/Core/Common/TiKVVersionMap.h +++ b/AnnService/inc/Core/Common/TiKVVersionMap.h @@ -386,7 +386,10 @@ namespace SPTAG } // Group writes by chunk: 1 ReadChunk + N byte-modifications + 1 WriteChunk - // per chunk, instead of N × (ReadChunk + WriteChunk). + // per chunk, instead of N × (ReadChunk + WriteChunk). Bypasses the LRU + // cache because runs that exercise this path always have + // VersionCacheMaxChunks=0; reading TiKV directly removes a layer of + // bookkeeping (cache invalidate-on-write) we no longer benefit from. void SetVersionBatch(const std::vector& vids, const std::vector& versions) override { size_t n = std::min(vids.size(), versions.size()); @@ -408,7 +411,7 @@ namespace SPTAG SizeType cid = kv.first; auto& idxs = kv.second; std::lock_guard lock(ChunkMutex(cid)); - std::string chunk = ReadChunkCached(cid); + std::string chunk = ReadChunk(cid); if (chunk.empty()) { chunk.assign(m_chunkSize, static_cast(0xff)); } From f35ae85bdb46d25d51585061de47c63b312f48c1 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 09:42:39 +0000 Subject: [PATCH 07/48] Drop high-priority job queue from SPDKThreadPool The distributed port introduced a separate m_highJobs queue + add_high in ThreadPool plus 'urgent' parameters on AppendAsync/ReassignAsync. Receiver dispatch already discovered high-priority starved Split jobs and switched to high=false. The remaining urgent=true callers were: - AppendAsync in CollectReAssign's non-TiKV branch (dead under Storage::TIKVIO which is the only storage we use) - ReassignAsync on head-miss in Append/BatchAppend (same starvation risk against Split that motivated the receiver-side revert) Restore ThreadPool.h to the upstream deque+addfront shape (no semantic change vs. original) and drop the urgent parameter from AppendAsync/ ReassignAsync, the high flag from JobSubmitter, and the high path from WireJobSubmitterIfReady. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Core/SPANN/Distributed/RemotePostingOps.h | 27 ++++++--------- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 29 +++++----------- AnnService/inc/Helper/ThreadPool.h | 33 +++++-------------- 3 files changed, 28 insertions(+), 61 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h index 577b91876..0f032c2ba 100644 --- a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -95,7 +95,7 @@ namespace SPTAG::SPANN { // its own m_splitThreadPool, so BatchAppend items dispatch by the // request's m_layer to the matching pool. A single submitter would // pile both layers' remote appends into whichever pool wired last. - using JobSubmitter = std::function; + using JobSubmitter = std::function; void SetJobSubmitter(int layer, JobSubmitter submitter) { std::unique_lock lk(m_callbackLifetimeMutex); EnsureLayerSlot_NoLock(layer); @@ -756,13 +756,12 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "RemotePostingOps: Received batch of %u appends\n", batchReq->m_count); - // Submit each item as a high-priority Job to the searcher's - // shared compute pool. Pool workers run the local Append callback - // exactly like a local insert would. Last completion ACKs the - // sender. This puts remote work on the SAME concurrency budget - // as local Split/Merge/Reassign — eliminating the over-subscribed - // TiKV behaviour of the old separate bg executor + transient - // sub-worker threads. + // Submit each item as a Job to the searcher's shared compute pool. + // Pool workers run the local Append callback exactly like a local + // insert would. Last completion ACKs the sender. This puts remote + // work on the SAME concurrency budget as local Split/Merge/Reassign + // — eliminating the over-subscribed TiKV behaviour of the old + // separate bg executor + transient sub-worker threads. auto packetPtr = std::make_shared(std::move(packet)); const size_t total = batchReq->m_items.size(); if (total == 0) { @@ -810,15 +809,9 @@ namespace SPTAG::SPANN { // submitter we have. for (auto& s : m_jobSubmitters) { if (s) { sub = &s; break; } } } - // Normal priority. Per-layer routing (m_jobSubmitters[layer]) - // already isolates layer-N append items from other layers' - // pools. High priority starved split entirely (split:N - // in_flight, 0 completed) because once all 16 worker threads - // are running long-tail append items, fresh high-prio appends - // keep cutting in front of split. Append throughput per chunk - // is limited by pool concurrency × per-item RMW; widen the - // pool (AppendThreadNum) instead of using priority hacks. - if (sub) (*sub)(job, /*high=*/false); + // Per-layer routing (m_jobSubmitters[layer]) isolates layer-N + // append items from other layers' pools. + if (sub) (*sub)(job); else { delete job; failCount->fetch_add(1); remaining->fetch_sub(1); } } } diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 29129bdb4..b8ca98e85 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -395,10 +395,7 @@ namespace SPTAG::SPANN { if (!m_worker || !m_splitThreadPool) return; auto pool = m_splitThreadPool; m_worker->SetJobSubmitter(m_layer, - [pool](Helper::ThreadPool::Job* j, bool high) { - if (high) pool->add_high(j); - else pool->add(j); - }); + [pool](Helper::ThreadPool::Job* j) { pool->add(j); }); } /// Set the external WorkerNode pointer and bind all callbacks @@ -436,7 +433,7 @@ namespace SPTAG::SPANN { // Mirror sender's version map for the records we're about // to persist so MergePostings + SearchIndex don't drop - // them as "stale". See HEAD git history for rationale. + // them as "stale". { const uint8_t* basePtr = reinterpret_cast(appendPosting.data()); size_t totalRec = appendPosting.size() / m_vectorInfoSize; @@ -1713,28 +1710,20 @@ namespace SPTAG::SPANN { m_splitThreadPool->add(curJob); } - inline void AppendAsync(SizeType headID, std::shared_ptr postingList, bool urgent = false,std::function p_callback = nullptr) + inline void AppendAsync(SizeType headID, std::shared_ptr postingList, std::function p_callback = nullptr) { auto* curJob = new AppendAsyncJob(this, headID, std::move(postingList), p_callback); m_appendJobsInFlight++; m_totalAppendSubmitted++; - if (urgent) { - m_splitThreadPool->addfront(curJob); - } else { - m_splitThreadPool->add(curJob); - } + m_splitThreadPool->add(curJob); } - inline void ReassignAsync(std::shared_ptr vectorInfo, SizeType headPrev, bool urgent = false, std::function p_callback = nullptr) + inline void ReassignAsync(std::shared_ptr vectorInfo, SizeType headPrev, std::function p_callback = nullptr) { auto* curJob = new ReassignAsyncJob(this, std::move(vectorInfo), headPrev, p_callback); m_reassignJobsInFlight++; m_totalReassignSubmitted++; - if (urgent) { - m_splitThreadPool->addfront(curJob); - } else { - m_splitThreadPool->add(curJob); - } + m_splitThreadPool->add(curJob); } ErrorCode CollectReAssign(ExtraWorkSpace *p_exWorkSpace, SizeType headID, std::shared_ptr headVec, @@ -1901,7 +1890,7 @@ namespace SPTAG::SPANN { if (m_opt->m_storage == Storage::TIKVIO) ret = BatchAppend(p_exWorkSpace, batchReassign, "CollectReAssign"); else { for (auto& kv : batchReassign) { - AppendAsync(kv.first, std::make_shared(kv.second), true); + AppendAsync(kv.first, std::make_shared(kv.second)); } } if (batchReassignCount > 0) { @@ -2019,7 +2008,7 @@ namespace SPTAG::SPANN { if (m_versionMap->GetVersion(VID) == version) { // SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Head Miss To ReAssign: VID: %d, current version: %d\n", *(int*)(&appendPosting[idx]), version); m_stat.m_headMiss++; - ReassignAsync(vectorInfo, headID, true); + ReassignAsync(vectorInfo, headID); } // SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Head Miss Do Not To ReAssign: VID: %d, version: %d, current version: %d\n", *(int*)(&appendPosting[idx]), m_versionMap->GetVersion(*(int*)(&appendPosting[idx])), version); } @@ -2185,7 +2174,7 @@ namespace SPTAG::SPANN { uint8_t version = *(uint8_t*)(ptr + sizeof(SizeType)); if (m_versionMap->GetVersion(VID) == version) { m_stat.m_headMiss++; - ReassignAsync(std::make_shared((char*)ptr, m_vectorInfoSize), headID, true); + ReassignAsync(std::make_shared((char*)ptr, m_vectorInfoSize), headID); } } continue; diff --git a/AnnService/inc/Helper/ThreadPool.h b/AnnService/inc/Helper/ThreadPool.h index a351a75c8..01c82e2a7 100644 --- a/AnnService/inc/Helper/ThreadPool.h +++ b/AnnService/inc/Helper/ThreadPool.h @@ -5,7 +5,7 @@ #define _SPTAG_HELPER_THREADPOOL_H_ #include -#include +#include #include #include #include @@ -78,42 +78,28 @@ namespace SPTAG { { std::lock_guard lock(m_lock); - m_jobs.push(j); + m_jobs.push_back(j); } m_cond.notify_one(); } - // High-priority push: jobs in m_highJobs always run before m_jobs. - // Used by the distributed receiver to let inbound BatchAppend RPC - // work jump ahead of local Split/Merge/Reassign so the sender - // (driver) doesn't time out waiting for the chunk ack while the - // local pool drains long-running rebalance work. - void add_high(Job* j) + void addfront(Job* j) { { std::lock_guard lock(m_lock); - m_highJobs.push(j); + m_jobs.push_front(j); } m_cond.notify_one(); } - // Alias kept for compatibility with code that calls addfront() - // (e.g., split-async path). Same semantics as add_high. - void addfront(Job* j) { add_high(j); } - bool get(Job*& j) { std::unique_lock lock(m_lock); - while (m_jobs.empty() && m_highJobs.empty() && !m_abort.ShouldAbort()) m_cond.wait(lock); + while (m_jobs.empty() && !m_abort.ShouldAbort()) m_cond.wait(lock); if (!m_abort.ShouldAbort()) { - if (!m_highJobs.empty()) { - j = m_highJobs.front(); - m_highJobs.pop(); - } else { - j = m_jobs.front(); - m_jobs.pop(); - } + j = m_jobs.front(); currentJobs++; + m_jobs.pop_front(); return true; } return false; @@ -122,7 +108,7 @@ namespace SPTAG size_t jobsize() { std::lock_guard lock(m_lock); - return m_jobs.size() + m_highJobs.size(); + return m_jobs.size(); } inline uint32_t runningJobs() { return currentJobs; } @@ -136,8 +122,7 @@ namespace SPTAG protected: std::atomic_uint32_t currentJobs{ 0 }; - std::queue m_jobs; - std::queue m_highJobs; + std::deque m_jobs; Abort m_abort; std::mutex m_lock; std::condition_variable m_cond; From a49b26d5292b90c7ccd2ead91fb71176b8e5ae4b Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 09:58:06 +0000 Subject: [PATCH 08/48] Fix space --- Test/src/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Test/src/main.cpp b/Test/src/main.cpp index 49ca39950..c1a5cde60 100644 --- a/Test/src/main.cpp +++ b/Test/src/main.cpp @@ -39,7 +39,7 @@ struct GlobalFixture // observed to consume ~12% CPU under high worker-thread parallelism in // gRPC client paths (perf-recorded 2026-05-06). #ifdef TIKV - absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); + absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); #endif SPTAGVisitor visitor; traverse_test_tree(framework::master_test_suite(), visitor, false); From 689e5b23e45da738b7ff77830a59283d0a58c5e4 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 10:06:24 +0000 Subject: [PATCH 09/48] Fix distributed benchmark README + drop dead orchestrator code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run_distributed.sh: - Remove wait_workers_ready() — dead since the driver-listens-on-30001 handshake replaced log-grep readiness detection. - Drop the stale 'Binary already pushed; nothing else to do here' comment that sat immediately after the actual binary-push rsync block. README.md: - Correct the TiKV deployment model: the cluster is SHARED (all PDs in one raft group, all TiKVs registered as stores, max-replicas=1) — not one isolated PD+TiKV per node as the old text claimed. Architecture diagram, port table, and pre-split helper updated accordingly (one PD endpoint, not a per-node loop). - Fix Step 1 cluster-config path: configs/cluster_2node.conf (an actual shipped file), not the non-existent cluster.conf.example. - Update port defaults to match cluster_2node.conf (23791/23801/20171) and call out that the driver's router_port must not collide with the dispatcher port 30001 (cluster_2node.conf uses 30011 for this reason). - List all shipped configs (10m, 100m, insert_dominant, tikv.toml, cluster_*.conf) in the file table. - Document setup-bins subcommand alongside deploy. - Flag the Build / Distribute / Run split as a workaround for the missing distributed SelectHead/BuildHead implementation, so readers don't mistake it for the steady-state design. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- evaluation/distributed/README.md | 219 +++++++++++++--------- evaluation/distributed/run_distributed.sh | 33 ---- 2 files changed, 126 insertions(+), 126 deletions(-) diff --git a/evaluation/distributed/README.md b/evaluation/distributed/README.md index 1f24bc865..4717efc35 100644 --- a/evaluation/distributed/README.md +++ b/evaluation/distributed/README.md @@ -1,18 +1,26 @@ # Distributed Benchmark Evaluation — Insert Dominant Multi-machine SPTAG SPANN distributed benchmark for an **insert-dominant** workload -(1M base + 10M inserts in 10 batches, with concurrent search-during-insert) on -SIFT1B. Each physical node runs its own independent PD + TiKV (no shared Raft -replication — see "TiKV deployment model" below). +(1M base + 1M-10M inserts in batches, with concurrent search-during-insert) on +SIFT1B. All nodes share a single TiKV raft cluster (see "TiKV deployment model" +below). ## Files in this folder | File | Purpose | | --- | --- | -| `configs/benchmark_insert_dominant_template.ini` | Benchmark template; `run_distributed.sh` fills `IndexPath`, `TiKVPDAddresses`, `TiKVKeyPrefix`, and `[Distributed]` from `cluster.conf`. | -| `run_distributed.sh` | Orchestrator: `deploy` / `start-tikv` / `run` / `stop-tikv` / `cleanup`. | +| `configs/benchmark_insert_dominant_template.ini` | 1M base + 1M insert, search-during-insert workload. | +| `configs/benchmark_10m_template.ini` | 9M base + 1M insert, growing-index workload. | +| `configs/benchmark_100m_template.ini` | 99M base + 1M insert, steady-state/freshness workload. | +| `configs/cluster_2node.conf`, `configs/cluster_3node.conf` | Example cluster topologies. Pick one (or write your own) and pass to the orchestrator. | +| `configs/tikv.toml` | TiKV server config baked into the containers. | +| `run_distributed.sh` | Orchestrator: `deploy` / `setup-bins` / `start-tikv` / `run` / `bench` / `stop-tikv` / `cleanup`. | +| `bin/` | `tikv-server` + `pd-server` binaries used by the containers (`setup-bins` downloads them if missing). | | `README.md` | This file. | +`run_distributed.sh` fills the template's `IndexPath`, `TiKVPDAddresses`, +`TiKVKeyPrefix`, and `[Distributed]` section from the cluster config. + ## Architecture ``` @@ -29,35 +37,42 @@ replication — see "TiKV deployment model" below). │ + Router│ │ + Router│ │ + Router│ └────┬─────┘ └────┬─────┘ └────┬─────┘ │ │ │ - ▼ ▼ ▼ - ┌──────────┐ ┌──────────┐ ┌──────────┐ - │ TiKV 1 │ │ TiKV 2 │ │ TiKV N │ (one PD + one TiKV per node) - └──────────┘ └──────────┘ └──────────┘ + └────────────┼────────────┘ + ▼ + ┌───────────────────┐ + │ Shared TiKV raft │ N PDs (one raft group) + + │ cluster │ N TiKV stores (max-replicas=1) + └───────────────────┘ ``` -- **Driver** (node 0): Builds the index, sends Search/Insert/Stop commands via TCP dispatch. -- **Workers** (nodes 1..N): Receive commands, execute their shard locally, report results back. -- **TiKV (per node)**: Each node runs its own independent PD + TiKV pair. Postings - for a head live on the node that owns that head's hash partition. -- **PostingRouter**: Hash-based head routing, remote append, head sync, dispatch protocol. +- **Driver** (node 0): builds the index, sends Search/Insert/Stop commands via + TCP dispatch. +- **Workers** (nodes 1..N): receive commands, execute their shard locally, + report results back over the dispatch channel. +- **Shared TiKV cluster**: every node runs a PD + TiKV container; all PDs join + one raft group, all TiKVs point to all PDs. PD routes each key to the store + that owns its region. +- **PostingRouter**: hash-based head routing, remote append, head sync, + dispatch protocol. ## TiKV deployment model -Unlike a single-machine multi-docker TiKV (3 PD + 3 TiKV behind 127.0.0.1 ports -22791-3 / 20161-3 sharing one Raft cluster), in this multi-machine setup **each -node runs its own isolated PD + TiKV pair** under host networking. Heads are -routed to nodes by hash, and each node's TiKV stores only its own shard. There -is no Raft replication between nodes (no cross-node region quorum), which is -intentional for insert-dominated benchmarks where Raft log overhead would dominate. +All nodes share **one** TiKV raft cluster: every node's PD joins the same raft +group, every node's TiKV registers as a store in that cluster, and PD routes +reads/writes to whichever store owns the region. `max-replicas=1` is set so +each region lives on exactly one store — we measure benchmark performance +without 3-way Raft replication. Compute nodes are stateless TiKV clients; they +read any posting through the shared client, so there is no cross-compute fetch +RPC during RNGSelection. -Per-node ports (defaults from `cluster.conf`): +Per-node ports (defaults from `configs/cluster_2node.conf`): -| Service | Port | Notes | +| Service | Default port | Notes | | --- | --- | --- | -| PD client | `2379` | Local app uses `:2379`. | -| PD peer | `2380` | Inter-PD; isolated cluster of 1 PD per node. | -| TiKV client | `20161` | The node-local SPTAG worker connects here. | -| Router | `30001+` | TCP dispatch / posting routing between nodes. | +| PD client | `23791` | TiKV client + `pd-ctl` connect here. | +| PD peer | `23801` | Inter-PD raft traffic. | +| TiKV client | `20171` | Per-node TiKV listens here. | +| Router | `30002+` | TCP dispatch / posting routing between nodes. **Driver's `router_port` must NOT be `30001`** — the dispatcher listens on `30001` and a collision will silently break worker registration. The shipped 2-node config uses `30011` on the driver for this reason. | ## Prerequisites @@ -69,45 +84,47 @@ Per-node ports (defaults from `cluster.conf`): cmake .. -DTIKV=ON -DTBB=ON -DCMAKE_BUILD_TYPE=Release -DGPU=OFF cmake --build . --target SPTAGTest -j$(nproc) ``` - *Note: building the full project may fail on the Java wrapper (`JAVASPTAGFileIO`) - due to a pre-existing `FileIOInterface.h` signature mismatch — the `SPTAGTest` - target alone is sufficient.* -- Passwordless SSH from driver to every other node (configure `ssh_key` in `cluster.conf`). + *Note: building the full project may fail on the Java wrapper + (`JAVASPTAGFileIO`) due to a pre-existing `FileIOInterface.h` signature + mismatch — the `SPTAGTest` target alone is sufficient.* +- Passwordless SSH from driver to every other node (configure `ssh_key` in + the cluster config). - Docker installed on every node (TiKV/PD run as containers in host network mode). - Same dataset path on every node (default `/mnt/nvme/sift1b/`): - `/mnt/nvme/sift1b/bigann_base.u8bin` (1B × 128 × u8) - `/mnt/nvme/sift1b/query.10K.u8bin` -- Same fast-storage path for index + TiKV data on every node (`data_dir` in `cluster.conf`, - default `/mnt/nvme`). +- Same fast-storage path for index + TiKV data on every node (`data_dir` in + the cluster config, default `/mnt/nvme`). ## Step 1 — Cluster config +Pick one of the shipped templates and edit it for your hosts/paths: + ```bash -cp evaluation/distributed/cluster.conf.example cluster.conf -vim cluster.conf +cp evaluation/distributed/configs/cluster_2node.conf my_cluster.conf +vim my_cluster.conf ``` -Example: +Layout: ```ini [cluster] ssh_user=superbench +ssh_key=/home/superbench/.ssh/id_rsa sptag_dir=/home/superbench/zhangt/SPTAG data_dir=/mnt/nvme -tikv_version=v7.5.1 -pd_version=v7.5.1 +tikv_version=v8.5.1 +pd_version=v8.5.1 [nodes] -# host router_port -10.0.1.1 30001 # driver (always first) -10.0.1.2 30002 # worker 1 -10.0.1.3 30003 # worker 2 +# host router_port (driver is first; router_port must not equal 30001) +10.0.1.1 30011 # driver +10.0.1.2 30002 # worker 1 [tikv] -# host pd_client pd_peer tikv_port -10.0.1.1 2379 2380 20161 -10.0.1.2 2379 2380 20161 -10.0.1.3 2379 2380 20161 +# host pd_client_port pd_peer_port tikv_port +10.0.1.1 23791 23801 20171 +10.0.1.2 23791 23801 20171 ``` `run_distributed.sh` reads this file to fill the template's `[Distributed]`, @@ -116,50 +133,49 @@ pd_version=v7.5.1 ## Step 2 — Deploy ```bash -./evaluation/distributed/run_distributed.sh deploy cluster.conf +./evaluation/distributed/run_distributed.sh deploy my_cluster.conf +./evaluation/distributed/run_distributed.sh setup-bins my_cluster.conf ``` -This rsyncs `Release/SPTAGTest` (and required shared libs) to every node and -ensures the per-node TiKV / PD data directories exist under `data_dir`. +`deploy` rsyncs `Release/SPTAGTest` (and required shared libs) to every node +and ensures per-node TiKV / PD data directories exist under `data_dir`. +`setup-bins` downloads `tikv-server` / `pd-server` into `bin/` on every node +(idempotent; skipped automatically by `start-tikv` if binaries are already +present). -## Step 3 — Start TiKV (per-node, independent) +## Step 3 — Start the shared TiKV cluster ```bash -./evaluation/distributed/run_distributed.sh start-tikv cluster.conf +./evaluation/distributed/run_distributed.sh start-tikv my_cluster.conf ``` -This starts one PD + one TiKV per node in host-network containers. Single-replica -placement (`max-replicas=1`) is set so we measure benchmark performance without -3-way Raft replication. +This starts one PD + one TiKV container per node in host-network mode and +joins them into a single raft cluster (`max-replicas=1`, no 3-way replication). -Health check (run on driver, repeat per node): +Health check (single PD endpoint is enough — the cluster is shared): ```bash -for ip in 10.0.1.1 10.0.1.2 10.0.1.3; do - curl -s "http://$ip:2379/pd/api/v1/stores" \ - | python3 -c 'import json,sys; print([s["store"]["state_name"] for s in json.load(sys.stdin)["stores"]])' -done -# Each node should report ['Up']. +curl -s "http://10.0.1.1:23791/pd/api/v1/stores" \ + | python3 -c 'import json,sys; print([s["store"]["state_name"] for s in json.load(sys.stdin)["stores"]])' +# Expected: ['Up', 'Up'] (one entry per TiKV store). ``` ### Pre-split & scatter (optional but recommended) -For the insert-dominant workload to spread region writes evenly across regions -within a node's TiKV, pre-split the keyspace at boundaries derived from -`DBKey(headID) = MaxID*layer + headID` little-endian byte 0. The TiKV raw key is -`TiKVKeyPrefix + "_" + uint32_le(DBKey)`; for multi-chunk it appends `\x00` / -`\x02` for chunk / count keys, but we split *only* on the head-key prefix so all -chunk and count variants for a head share a region. Boundaries used: `0x02, 0x04, -…, 0xfe` (127 split points → 128 regions). +For the insert-dominant workload, pre-split the keyspace so writes spread +evenly across regions and stores. Boundaries derive from +`DBKey(headID) = MaxID*layer + headID` little-endian byte 0; the TiKV raw key +is `TiKVKeyPrefix + "_" + uint32_le(DBKey)`. We split *only* on the head-key +prefix so all chunk/count variants for a head share a region. Used split +points: `0x02, 0x04, …, 0xfe` (127 split points → 128 regions). -Driver-side helper (each PD is independent, so run per node): +Since the cluster is shared, run the helper **once** against any PD endpoint: ```bash -PREFIX="bench_insert_dominant_3node" # keep in sync with KEY_PREFIX in run_distributed.sh -for ip in 10.0.1.1 10.0.1.2 10.0.1.3; do - PD="http://$ip:2379" - PDCTL=(docker run --rm --network host --entrypoint /pd-ctl pingcap/pd:v7.5.1 -u "$PD") - python3 - "$PREFIX" "${PDCTL[@]}" <<'PY' +PREFIX="bench_insert_dominant_2node" # keep in sync with KEY_PREFIX in run_distributed.sh +PD="http://10.0.1.1:23791" +PDCTL=(docker run --rm --network host --entrypoint /pd-ctl pingcap/pd:v8.5.1 -u "$PD") +python3 - "$PREFIX" "${PDCTL[@]}" <<'PY' import json, subprocess, sys prefix = sys.argv[1].encode() + b'_' pdctl = sys.argv[2:] @@ -172,48 +188,65 @@ for b in range(2, 256, 2): for r in json.loads(run(['region', 'scan']))['regions']: run(['operator', 'add', 'scatter-region', str(r['id'])]) PY -done ``` -Skip this on the very first run if you don't have load skew — `start-tikv` works -without it. For 1B-scale insert-dominant runs on a single node it materially -reduces head-region hot-spotting. +Skip this on the very first run if you don't have load skew — `start-tikv` +works without it. For 1B-scale insert-dominant runs it materially reduces +head-region hot-spotting. ## Step 4 — Run the benchmark ```bash # Single scale, explicit node count (driver + (N-1) workers): -./evaluation/distributed/run_distributed.sh run cluster.conf insert_dominant 3 +./evaluation/distributed/run_distributed.sh run my_cluster.conf insert_dominant 2 # Or sweep 1-node baseline + N-node distributed for one or more scales: -./evaluation/distributed/run_distributed.sh bench cluster.conf insert_dominant +./evaluation/distributed/run_distributed.sh bench my_cluster.conf insert_dominant +./evaluation/distributed/run_distributed.sh bench my_cluster.conf all ``` What `run` does: 1. **Build** (driver only): driver builds the index locally with router - *disabled* (`Rebuild=true`, no `[Router]`). Output goes to `…_n0/spann_index`. + *disabled* (`Rebuild=true`, no `[Distributed]`). Output goes to + `…_n0/spann_index`. Because the TiKV cluster is shared, the driver writes + all postings straight to TiKV via PD-routed RPCs — there is no need for a + distributed build phase. 2. **Distribute**: rsync head index + perftest files from driver to each worker. -3. **Workers**: SSH-launches `SPTAGTest` on each worker with `WORKER_INDEX=i` and - the per-node ini (router enabled, `Rebuild=false`). -4. **Driver**: relaunches `SPTAGTest` with router enabled, `Rebuild=false`. The - driver dispatches Insert / Search commands across batches via TCP. +3. **Workers**: SSH-launches `SPTAGTest` on each worker with `WORKER_INDEX=i` + and the per-node ini (router enabled, `Rebuild=false`). +4. **Driver**: relaunches `SPTAGTest` with router enabled, `Rebuild=false`. + The driver dispatches Insert / Search commands across batches via TCP. 5. **Collect**: driver sends Stop, joins worker logs into `benchmark_logs/`. -Useful environment overrides (see header of `run_distributed.sh`): - -- `NOCACHE=1` — disable TiKV block cache, OS pagecache, and `VersionCacheMaxChunks`. -- `BUILD_WITH_CACHE=1` — build with caches, then drop caches before search/insert (NOCACHE only). -- `SKIP_TIKV_SWAP=1` — when using `BUILD_WITH_CACHE`, skip the destructive TiKV - container restart that has corrupted recall at 100M scale. -- `SKIP_SAVE_LOAD=1` — skip post-build SaveIndex / per-batch Load+Clone+Save (NOCACHE only). -- `SKIP_HEAD_BUILD=1` — reuse existing HeadIndex if present (RebuildSSDOnly). +> The "build on the driver, then distribute and run" split is a workaround: +> we don't yet have a real distributed SelectHead/BuildHead implementation, so +> Phase 1 is single-node-with-shared-TiKV. The `BuildOnly=true` / +> `RebuildSSDOnly=true` / `SkipSaveLoadCycles=true` / +> `tikv_switch_to_nocache` / `drop_caches` choreography exists because of +> this split; it is not a feature of the steady-state design. + +Useful environment overrides (see the header of `run_distributed.sh` for the +authoritative list): + +- `NOCACHE=1` — disable TiKV block cache, OS pagecache, and + `VersionCacheMaxChunks` for the search/insert phase. +- `BUILD_WITH_CACHE=1` — build with caches enabled, then drop caches before + search/insert (requires `NOCACHE=1`). Used at 100M scale where building + under nocache is impractical. +- `SKIP_TIKV_SWAP=1` — with `BUILD_WITH_CACHE`, skip the destructive TiKV + container restart that has corrupted recall at 100M scale. Relies on + drop_caches + `VersionCacheMaxChunks=0` for nocache semantics. +- `SKIP_SAVE_LOAD=1` — skip the post-build SaveIndex / per-batch + Load+Clone+Save cycle (`SkipSaveLoadCycles=true`). Required at 100M scale. +- `SKIP_HEAD_BUILD=1` — reuse existing HeadIndex if present + (`RebuildSSDOnly=true`); falls back to full build if HeadIndex is missing. ## Step 5 — Stop / cleanup ```bash -./evaluation/distributed/run_distributed.sh stop-tikv cluster.conf -./evaluation/distributed/run_distributed.sh cleanup cluster.conf # remove deployed files +./evaluation/distributed/run_distributed.sh stop-tikv my_cluster.conf +./evaluation/distributed/run_distributed.sh cleanup my_cluster.conf # remove deployed files ``` ## Key knobs in `benchmark_insert_dominant_template.ini` diff --git a/evaluation/distributed/run_distributed.sh b/evaluation/distributed/run_distributed.sh index bb982ab7d..28404c8a3 100755 --- a/evaluation/distributed/run_distributed.sh +++ b/evaluation/distributed/run_distributed.sh @@ -751,37 +751,6 @@ start_remote_worker() { echo " Worker n${NODE_IDX} on $host (SSH PID: $ssh_pid, log: $LOG)" } -wait_workers_ready() { - local SCALE="$1" - local NODE_COUNT="$2" - local TIMEOUT=120 - - echo "Waiting for ${#WORKER_SSH_PIDS[@]} workers to be ready..." - for attempt in $(seq 1 $TIMEOUT); do - local all_ready=true - for i in $(seq 1 $((NODE_COUNT - 1))); do - local LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_worker${i}.log" - if ! grep -q "Worker.*[Rr]eady\|Waiting for dispatch" "$LOG" 2>/dev/null; then - all_ready=false - fi - done - if $all_ready; then - echo " All workers ready (${attempt}s)" - return 0 - fi - # Check if any worker SSH process died - for idx in "${!WORKER_SSH_PIDS[@]}"; do - if ! kill -0 "${WORKER_SSH_PIDS[$idx]}" 2>/dev/null; then - echo " ERROR: Worker SSH PID ${WORKER_SSH_PIDS[$idx]} exited prematurely" - return 1 - fi - done - sleep 1 - done - echo " WARNING: Not all workers ready after ${TIMEOUT}s" - return 1 -} - stop_remote_workers() { # Wait for workers to self-exit (driver sends TCP Stop), then force-kill. local TIMEOUT=${1:-30} @@ -1140,8 +1109,6 @@ cmd_run() { fi done - # Binary already pushed; nothing else to do here. - # --- Phase 3: Start driver first (contains dispatcher), then workers --- echo "" From ee405d4ddff4ec218c6a827eb4084087d96432cc Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 10:09:08 +0000 Subject: [PATCH 10/48] README: clarify driver = worker 0 + dispatcher; workers peer-to-peer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous wording made it sound like the driver was a stateless coordinator and workers only talked back to it. Reality: node 0 runs as worker 0 (owns its hash shard like every other worker) and additionally hosts the dispatcher; workers talk to each other directly through PostingRouter for remote append, head sync, and merge hints — no driver-mediated forwarding. Diagram and 'What run does' steps updated. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- evaluation/distributed/README.md | 55 +++++++++++++++++++------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/evaluation/distributed/README.md b/evaluation/distributed/README.md index 4717efc35..2b9c0950e 100644 --- a/evaluation/distributed/README.md +++ b/evaluation/distributed/README.md @@ -24,20 +24,23 @@ below). ## Architecture ``` - ┌──────────────┐ - │ Driver │ (node 0) - │ RunBenchmark│ - │ + Router │ - └──┬───┬───┬──┘ - TCP Dispatch│ │ │ - ┌────────┘ │ └────────┐ - ▼ ▼ ▼ + ┌────────────────────┐ + │ Driver = Worker 0│ (node 0) + │ + Dispatcher │ + └─┬──┬──┬────────────┘ + TCP Dispatch │ │ │ ▲ ▲ ▲ + (broadcast) │ │ │ │ │ │ status replies + ┌──────┘ │ └──────┐│ │ │ + ▼ ▼ ▼│ │ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ Worker 1 │ │ Worker 2 │ │ Worker N │ - │ + Router│ │ + Router│ │ + Router│ - └────┬─────┘ └────┬─────┘ └────┬─────┘ - │ │ │ - └────────────┼────────────┘ + └──┬───▲───┘ └──┬───▲───┘ └──┬───▲───┘ + │ │ │ │ │ │ + └───┴────────┴───┴────────┴───┘ + PostingRouter peer-to-peer + (remote append / head sync / + merge hints, by hash owner) + │ ▼ ┌───────────────────┐ │ Shared TiKV raft │ N PDs (one raft group) + @@ -45,15 +48,19 @@ below). └───────────────────┘ ``` -- **Driver** (node 0): builds the index, sends Search/Insert/Stop commands via - TCP dispatch. -- **Workers** (nodes 1..N): receive commands, execute their shard locally, - report results back over the dispatch channel. +- **Driver** (node 0): also runs as **worker 0**. On top of the worker role, + it owns the dispatcher: builds the initial index, then broadcasts + Search/Insert/Stop commands to the other workers over TCP dispatch. +- **Workers** (nodes 0..N-1): each owns a shard of the head index by hash. + Workers talk to each other peer-to-peer through PostingRouter for remote + append, head sync, and merge hints — there is no driver-mediated forwarding. + On each `DispatchCommand` they execute the local part of the request and + report status back to the dispatcher. - **Shared TiKV cluster**: every node runs a PD + TiKV container; all PDs join one raft group, all TiKVs point to all PDs. PD routes each key to the store that owns its region. -- **PostingRouter**: hash-based head routing, remote append, head sync, - dispatch protocol. +- **PostingRouter**: hash-based head routing, remote append, head sync, and + the TCP dispatch transport used by the dispatcher. ## TiKV deployment model @@ -213,10 +220,14 @@ What `run` does: all postings straight to TiKV via PD-routed RPCs — there is no need for a distributed build phase. 2. **Distribute**: rsync head index + perftest files from driver to each worker. -3. **Workers**: SSH-launches `SPTAGTest` on each worker with `WORKER_INDEX=i` - and the per-node ini (router enabled, `Rebuild=false`). -4. **Driver**: relaunches `SPTAGTest` with router enabled, `Rebuild=false`. - The driver dispatches Insert / Search commands across batches via TCP. +3. **Workers**: SSH-launches `SPTAGTest` on each remote worker (nodes 1..N-1) + with `WORKER_INDEX=i` and the per-node ini (router enabled, + `Rebuild=false`). Workers wire PostingRouter so they can reach every peer + directly for remote append / head sync. +4. **Driver**: relaunches `SPTAGTest` on node 0 with router enabled, + `Rebuild=false`. The same process acts as **worker 0** (owns its hash + shard like any other worker) **and** as the dispatcher (broadcasts Insert + / Search / Stop over TCP and waits for status replies). 5. **Collect**: driver sends Stop, joins worker logs into `benchmark_logs/`. > The "build on the driver, then distribute and run" split is a workaround: From 6cf7d36e922d01a86163377a1bbc5cdc3f07f6e8 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 10:10:26 +0000 Subject: [PATCH 11/48] README: drop unused TiKV pre-split helper section We never actually ran the pre-split/scatter helper in our benchmark runs. Keeping it in the doc gives the false impression that it's part of the recommended setup. Remove the whole section. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- evaluation/distributed/README.md | 34 -------------------------------- 1 file changed, 34 deletions(-) diff --git a/evaluation/distributed/README.md b/evaluation/distributed/README.md index 2b9c0950e..7b2234908 100644 --- a/evaluation/distributed/README.md +++ b/evaluation/distributed/README.md @@ -167,40 +167,6 @@ curl -s "http://10.0.1.1:23791/pd/api/v1/stores" \ # Expected: ['Up', 'Up'] (one entry per TiKV store). ``` -### Pre-split & scatter (optional but recommended) - -For the insert-dominant workload, pre-split the keyspace so writes spread -evenly across regions and stores. Boundaries derive from -`DBKey(headID) = MaxID*layer + headID` little-endian byte 0; the TiKV raw key -is `TiKVKeyPrefix + "_" + uint32_le(DBKey)`. We split *only* on the head-key -prefix so all chunk/count variants for a head share a region. Used split -points: `0x02, 0x04, …, 0xfe` (127 split points → 128 regions). - -Since the cluster is shared, run the helper **once** against any PD endpoint: - -```bash -PREFIX="bench_insert_dominant_2node" # keep in sync with KEY_PREFIX in run_distributed.sh -PD="http://10.0.1.1:23791" -PDCTL=(docker run --rm --network host --entrypoint /pd-ctl pingcap/pd:v8.5.1 -u "$PD") -python3 - "$PREFIX" "${PDCTL[@]}" <<'PY' -import json, subprocess, sys -prefix = sys.argv[1].encode() + b'_' -pdctl = sys.argv[2:] -def run(args): return subprocess.check_output(pdctl + args, text=True) -def region_for(hex_key): return json.loads(run(['region', 'key', '--format=hex', hex_key]))['id'] -for b in range(2, 256, 2): - key = (prefix + bytes([b, 0, 0, 0])).hex() - rid = region_for(key) - run(['operator', 'add', 'split-region', str(rid), '--policy=usekey', '--keys', key]) -for r in json.loads(run(['region', 'scan']))['regions']: - run(['operator', 'add', 'scatter-region', str(r['id'])]) -PY -``` - -Skip this on the very first run if you don't have load skew — `start-tikv` -works without it. For 1B-scale insert-dominant runs it materially reduces -head-region hot-spotting. - ## Step 4 — Run the benchmark ```bash From 07bdc03a6b1c3e89944da005d96cc073b733acfd Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 10:11:38 +0000 Subject: [PATCH 12/48] Clean comment --- AnnService/inc/Core/Common/FineGrainedLock.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/AnnService/inc/Core/Common/FineGrainedLock.h b/AnnService/inc/Core/Common/FineGrainedLock.h index 5cfad7ac6..1f7d1eab4 100644 --- a/AnnService/inc/Core/Common/FineGrainedLock.h +++ b/AnnService/inc/Core/Common/FineGrainedLock.h @@ -56,10 +56,6 @@ namespace SPTAG return GetLock(idx); } - // Per-posting lock identity. Two indices share a lock iff they are - // the same posting, so external callers can use `hash_func(a) == - // hash_func(b)` as a self-lock guard (e.g. in Split, to skip - // re-locking the same head VID). static inline unsigned hash_func(unsigned idx) { return idx; From f0d8fe5d473262637dbec4ae23bbdb851bcddcd5 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 09:11:31 +0000 Subject: [PATCH 13/48] Extract IsRemoteOwnedHead predicate for owner-ring checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three Split/Merge/Append code paths duplicated the same check: m_worker && m_worker->IsEnabled() && each with their own (or missing) m_layer != 0 gate. Split() at L878 and MergePostings() at L1336 were missing the layer gate entirely, so on a hypothetical multi-layer cluster they would have skipped local inner-layer ops (which never use owner-ring routing). Unify on a single predicate IsRemoteOwnedHead(headID, &nodeIndex) and gate every callsite on it: - TryRouteRemoteAppend (routing — populates nodeIndex) - Split (drop remote splits early) - MergePostings (defense-in-depth net) - SplitAsync / MergeAsync (don't burn a pool slot for jobs we'll drop) Addresses PR #448 L553 review comment 'Can we find somewhere to just identify once'. Also folds the L1336 'if refine is not there, do we still need the filter' question — the filter at MergePostings is now only a safety net behind the MergeAsync enqueue-time gate, so future RefineIndex removal won't change anything. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 77 +++++++++++-------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index b8ca98e85..77c96843c 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -547,6 +547,26 @@ namespace SPTAG::SPANN { m_worker->QueueRemoteAppend(nodeIndex, std::move(req)); } + // Single source of truth for "this head lives on a different node". + // Only the outer (head) layer participates in the owner-ring route; + // inner layers (m_layer > 0) hold per-node-local state with no + // shared VID space and no cross-node TiKV key contract, so they + // always answer false. When true, outNodeIndex (if not null) is + // populated with the owner's node index. + // + // Every Split / Merge / Append code path that might touch a head + // it doesn't own MUST gate on this predicate so the invariant + // (only owners mutate their own postings) is enforced in exactly + // one place. + bool IsRemoteOwnedHead(SizeType headID, int* outNodeIndex = nullptr) { + if (m_layer != 0) return false; + if (!m_worker || !m_worker->IsEnabled()) return false; + auto target = m_worker->GetOwner(headID); + if (target.isLocal) return false; + if (outNodeIndex) *outNodeIndex = target.nodeIndex; + return true; + } + // If headID is owned by a remote node, queue the append for that // node and return true; otherwise return false (caller continues // with local write logic). @@ -554,18 +574,9 @@ namespace SPTAG::SPANN { int appendNum, std::string posting, const void* headVecBytes = nullptr) { - if (!m_worker || !m_worker->IsEnabled()) return false; - // Only the outer (head) layer participates in the owner-ring - // route. Inner layers (m_layer > 0) hold per-node-local state - // (no shared head VID space, no cross-node TiKV key naming - // contract), so each node services its own inner layer - // independently. Without this gate inner-layer appends would - // also dispatch RPCs that the receiver can't meaningfully - // apply. - if (m_layer != 0) return false; - auto target = m_worker->GetOwner(headID); - if (target.isLocal) return false; - EnqueueRemoteAppend(target.nodeIndex, headID, appendNum, + int ownerNode = -1; + if (!IsRemoteOwnedHead(headID, &ownerNode)) return false; + EnqueueRemoteAppend(ownerNode, headID, appendNum, std::move(posting), headVecBytes); return true; } @@ -875,13 +886,10 @@ namespace SPTAG::SPANN { // Only the OWNER of headID should run Split. Remote-issued // splits get dropped early so we don't mutate a posting that // doesn't live on this node. - if (m_worker && m_worker->IsEnabled()) { - auto target = m_worker->GetOwner(headID); - if (!target.isLocal) { - std::unique_lock tmplock(m_splitListLock); - m_splitList.unsafe_erase(headID); - return ErrorCode::Success; - } + if (IsRemoteOwnedHead(headID)) { + std::unique_lock tmplock(m_splitListLock); + m_splitList.unsafe_erase(headID); + return ErrorCode::Success; } // Owner-side: wait for any in-flight remote-initiated lock on @@ -1237,7 +1245,7 @@ namespace SPTAG::SPANN { auto updateHeadBegin = std::chrono::high_resolution_clock::now(); if ((ret = m_headIndex->AddHeadIndex(args.centers + k * args._D, newHeadVID, version, m_opt->m_dim, m_layer + 1, p_exWorkSpace)) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to update head index %lld\n", (std::int64_t)(newHeadVID)); - if (!remoteCreated && db->Delete(DBKey(newHeadVID)) != ErrorCode::Success) { + if (db->Delete(DBKey(newHeadVID)) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete gc posting %lld\n", (std::int64_t)(newHeadVID)); } return ret; @@ -1333,13 +1341,12 @@ namespace SPTAG::SPANN { { // The owner runs its own merge passes. Skip when this head is // owned by another node — we'd just be racing the owner. - if (m_worker && m_worker->IsEnabled()) { - auto target = m_worker->GetOwner(headID); - if (!target.isLocal) { - std::unique_lock tmplock(m_mergeListLock); - m_mergeList.unsafe_erase(headID); - return ErrorCode::Success; - } + // (Defense in depth: MergeAsync already filters at enqueue, but + // ownership can change between enqueue and execution.) + if (IsRemoteOwnedHead(headID)) { + std::unique_lock tmplock(m_mergeListLock); + m_mergeList.unsafe_erase(headID); + return ErrorCode::Success; } WaitForRemoteBucketUnlocked(headID); @@ -1667,13 +1674,10 @@ namespace SPTAG::SPANN { inline void SplitAsync(SizeType headID, int postingSize, std::function p_callback = nullptr) { - // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Into SplitAsync, current headID: %d, size: %d\n", headID, m_postingSizes.GetSize(headID)); - // tbb::concurrent_hash_map::const_accessor headIDAccessor; - // if (m_splitList.find(headIDAccessor, headID)) { - // return; - // } - // tbb::concurrent_hash_map::value_type workPair(headID, headID); - // m_splitList.insert(workPair); + // Don't enqueue split jobs for heads we don't own; the owner + // will detect oversize on its own. Skipping here avoids + // burning a thread-pool slot only to drop the job in Split(). + if (IsRemoteOwnedHead(headID)) return; { Helper::Concurrent::ConcurrentMap::value_type workPair(headID, postingSize); std::shared_lock tmplock(m_splitListLock); @@ -1694,6 +1698,11 @@ namespace SPTAG::SPANN { inline void MergeAsync(SizeType headID, std::function p_callback = nullptr) { + // Don't enqueue merge jobs for heads we don't own; the owner + // runs its own merge pass. Filtering here is the single + // upstream gate so MergePostings's owner check is only a + // defense-in-depth net. + if (IsRemoteOwnedHead(headID)) return; { std::shared_lock tmplock(m_mergeListLock); auto res = m_mergeList.insert(headID); From d55de5454e74c3fe74b4e8a2793f632f00eead9f Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 09:14:23 +0000 Subject: [PATCH 14/48] VersionMap extend: use stride formula capacity*numWorkers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Design specifies that when the local VersionMap lags behind a posting written by a remote peer, the lagging node catches up via AddBatch(capacity * numWorkers) This works because the global VID space is striped across worker nodes (VID % numWorkers == nodeID), so the peer's maxVID can be at most ~ localCount * numWorkers ahead of us. Extending in this large chunk amortizes many remote inserts into one capacity bump and keeps growth conflict-free. The previous EnsureVersionMapCoversPosting did AddBatch(maxVid+1-localCount), which is correct but causes thrashing — every remote append where maxVid happens to be slightly past localCount triggers a small extend. Floor at the exact-gap need so single-node builds (numWorkers <= 1) behave identically to before. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 77c96843c..c92630616 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -596,11 +596,27 @@ namespace SPTAG::SPANN { } if (maxVid >= localCount) { SizeType need = maxVid + 1 - localCount; - m_versionMap->AddBatch(need); + // Design contract: on the interleaved stride scheme each + // node owns globalVIDs satisfying VID % numWorkers == + // nodeID. The max VID a remote peer can have produced by + // now is approximately localCount * numWorkers, so when + // we lag behind we extend by capacity*numWorkers in one + // shot. This keeps capacity growth conflict-free (we + // amortize many remote inserts into one extension) and + // avoids the per-VID AddBatch(1) thrashing of the old + // exact-gap formula. + int numWorkers = GetNumWorkerNodes(); + SizeType extendBy = need; + if (numWorkers > 1) { + SizeType strideGrow = localCount * (SizeType)numWorkers; + if (strideGrow > extendBy) extendBy = strideGrow; + } + m_versionMap->AddBatch(extendBy); SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, - "%s: extended local versionMap by %lld (head=%lld maxVid=%lld localCount=%lld)\n", - p_caller, (std::int64_t)need, (std::int64_t)p_headID, - (std::int64_t)maxVid, (std::int64_t)localCount); + "%s: extended local versionMap by %lld (head=%lld maxVid=%lld localCount=%lld need=%lld numWorkers=%d)\n", + p_caller, (std::int64_t)extendBy, (std::int64_t)p_headID, + (std::int64_t)maxVid, (std::int64_t)localCount, + (std::int64_t)need, numWorkers); } } From 370386618cf6106d2a76a3ea5114d7bb0f0e327f Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 09:19:52 +0000 Subject: [PATCH 15/48] RemotePostingOps: move RPC chunk/retry/timeout/inflight into INI options The four magic constants buried in the RPC layer kChunkSize = 3000 (RemotePostingOps.h) attempt < 3 = retry (RemotePostingOps.h) wait_for(180s) = timeout (RemotePostingOps.h) kMaxInflightPerNode = 4 (WorkerNode.h) are now exposed as SPANN INI parameters under [SSDIndex]: RemoteAppendChunkSize (default 3000) RemoteAppendRetry (default 3) RemoteAppendTimeoutSec (default 180) RemoteAppendMaxInflight (default 4) Defaults preserve current behavior. Plumbing: - Options.h / ParameterDefinitionList.h: declare/register parameters - RemotePostingOps: hold values in atomics, expose Set/Get* setters - WorkerNode: forward setters; m_maxInflightPerNode is now atomic - ExtraDynamicSearcher::SetWorker: push m_opt->m_remoteAppend* once This unblocks per-deployment RPC tuning (e.g. larger chunks on low- latency clusters, shorter timeouts in CI) without recompiling, and removes the long historical comments documenting why the chunk size was changed 5 times during benchmarking. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Core/SPANN/Distributed/RemotePostingOps.h | 65 +++++++++++-------- .../inc/Core/SPANN/Distributed/WorkerNode.h | 15 ++++- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 13 ++++ AnnService/inc/Core/SPANN/Options.h | 6 ++ .../inc/Core/SPANN/ParameterDefinitionList.h | 6 ++ 5 files changed, 77 insertions(+), 28 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h index 0f032c2ba..03851df1c 100644 --- a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -87,6 +87,18 @@ namespace SPTAG::SPANN { void SetNetwork(NetworkAccess* net) { m_net = net; } + // RPC tuning. All knobs are configurable via SPANN INI options + // (RemoteAppend{ChunkSize,Retry,TimeoutSec,MaxInflight}). Defaults + // are baked here to keep single-node / unconfigured paths working; + // SPANN::ExtraDynamicSearcher::SetWorker() pushes the option-driven + // values once the index is bound to a worker. + void SetRpcChunkSize(int v) { if (v > 0) m_rpcChunkSize.store(v, std::memory_order_relaxed); } + void SetRpcRetry(int v) { if (v > 0) m_rpcRetry.store(v, std::memory_order_relaxed); } + void SetRpcTimeoutSec(int v) { if (v > 0) m_rpcTimeoutSec.store(v, std::memory_order_relaxed); } + int GetRpcChunkSize() const { return m_rpcChunkSize.load(std::memory_order_relaxed); } + int GetRpcRetry() const { return m_rpcRetry.load(std::memory_order_relaxed); } + int GetRpcTimeoutSec() const { return m_rpcTimeoutSec.load(std::memory_order_relaxed); } + // Inject the searcher's shared compute pool. Receiver-side BatchAppend // work runs as Jobs on this pool so it shares a single bounded- // concurrency budget with local Append/Split/Merge/Reassign (instead @@ -285,26 +297,14 @@ namespace SPTAG::SPANN { { if (items.empty()) return ErrorCode::Success; - // Chunk the batch so a single RPC never exceeds kChunkSize items. - // Large batches (millions of items) cannot be processed by the - // receiver within a single timeout window, causing data loss - // when the request is dropped. Chunking keeps each RPC bounded. - // [v38] Reduced 50000 → 10000 to (a) shrink end-of-batch drain - // tail (final chunk no longer 14s wide) and (b) let multiple - // chunks pipeline on the receiver pool. - // [v43] Back to 50000 — v42 (10k) was throughput-best (906/s) - // but during-insert p50 was 222ms; v43 (50k) trades throughput - // (-22% → 704/s) for during-insert p50 (-36% → 141ms) and big - // recovery in post-insert r1 QPS (47→85). v44 (100k) blew up - // tail drain: a single 100k chunk took 116s on the receiver, - // making end-of-batch drain run 40+ min (vs 8 min at 50k). - // 50k is the sweet spot. - // [v47] With shared-pool receiver (BatchAppendItemJob on - // m_splitThreadPool), 50k chunks still occasionally exceed the - // 180s wait_for window under contention → "Timeout waiting for - // batch response" + retries. Drop to 10k so each RPC's worst-case - // receiver wall-clock is ~6× smaller and stays under the timeout. - constexpr size_t kChunkSize = 3000; + // Chunk the batch so a single RPC never exceeds the configured + // chunk size. Large batches (millions of items) cannot be + // processed by the receiver within a single timeout window, + // causing data loss when the request is dropped. Chunking keeps + // each RPC bounded. Tunable via SPANN option + // RemoteAppendChunkSize (default 3000). + const size_t kChunkSize = + std::max(1, (size_t)m_rpcChunkSize.load(std::memory_order_relaxed)); const size_t total = items.size(); size_t offset = 0; std::vector chunk; @@ -337,13 +337,15 @@ namespace SPTAG::SPANN { { if (items.empty()) return ErrorCode::Success; - for (int attempt = 0; attempt < 3; attempt++) { + const int kMaxAttempts = std::max(1, m_rpcRetry.load(std::memory_order_relaxed)); + const int kTimeoutSec = std::max(1, m_rpcTimeoutSec.load(std::memory_order_relaxed)); + for (int attempt = 0; attempt < kMaxAttempts; attempt++) { Socket::ConnectionID connID = m_net->GetPeerConnection(targetNodeIndex); if (connID == Socket::c_invalidConnectionID) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "RemotePostingOps: Cannot connect to node %d for batch (%d items, attempt %d)\n", targetNodeIndex, (int)items.size(), attempt + 1); - if (attempt < 2) continue; + if (attempt < kMaxAttempts - 1) continue; return ErrorCode::Fail; } @@ -381,9 +383,12 @@ namespace SPTAG::SPANN { m_net->GetClient()->SendPacket(connID, std::move(packet), MakeSendFailHandler(resID)); - // Generous timeout: 50k items * (~10ms TiKV roundtrip / 16 worker threads) - // = ~31s typical; cap at 180s to allow for lock contention with merges/splits. - auto status = future.wait_for(std::chrono::seconds(180)); + // Wait window comes from SPANN option RemoteAppendTimeoutSec + // (default 180s). Sized so a normal-load chunk (chunk_size + // items at ~10ms TiKV roundtrip / 16 worker threads ≈ tens of + // seconds) completes well under the cap, leaving headroom for + // lock contention with merges/splits. + auto status = future.wait_for(std::chrono::seconds(kTimeoutSec)); auto waitMs = std::chrono::duration_cast( std::chrono::steady_clock::now() - waitStart).count(); if (status == std::future_status::timeout) { @@ -397,7 +402,7 @@ namespace SPTAG::SPANN { // are signalled via MakeSendFailHandler (which sets the // promise to Fail, taking the "result != Success" path // below). - if (attempt < 2) continue; + if (attempt < kMaxAttempts - 1) continue; return ErrorCode::Fail; } @@ -1194,6 +1199,14 @@ namespace SPTAG::SPANN { NetworkAccess* m_net = nullptr; + // RPC tuning knobs. See SetRpcChunkSize/Retry/TimeoutSec. Defaults + // match historical hardcoded values; overridden via SPANN options + // by ExtraDynamicSearcher::SetWorker(). Stored as atomics so the + // batch sender can read them lock-free. + std::atomic m_rpcChunkSize{3000}; + std::atomic m_rpcRetry{3}; + std::atomic m_rpcTimeoutSec{180}; + // Per-layer callback registries. Indexed by ExtraDynamicSearcher layer // (m_layer at the call site). Resized lazily by SetXxxCallback. The // empty/null entry at layer 0 is preserved so a single-layer caller diff --git a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h index 8af906fcc..d50edcfd5 100644 --- a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h +++ b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h @@ -134,6 +134,17 @@ namespace SPTAG::SPANN { void SetDispatchCallback(DispatchCallback cb) { m_dispatch.SetDispatchCallback(std::move(cb)); } void ClearDispatchCallback() { m_dispatch.ClearDispatchCallback(); } + // RPC tuning forwarders. See RemotePostingOps for semantics. + // MaxInflightPerNode caps how many auto-flush chunks may be on + // the wire to a given peer at once; chunk size/retry/timeout + // are forwarded directly into RemotePostingOps. + void SetRpcChunkSize(int v) { m_remoteOps.SetRpcChunkSize(v); } + void SetRpcRetry(int v) { m_remoteOps.SetRpcRetry(v); } + void SetRpcTimeoutSec(int v) { m_remoteOps.SetRpcTimeoutSec(v); } + void SetRpcMaxInflightPerNode(int v) { + if (v > 0) m_maxInflightPerNode.store(v, std::memory_order_relaxed); + } + // ---- Routing ---- RouteTarget GetOwner(SizeType headID) { @@ -246,7 +257,7 @@ namespace SPTAG::SPANN { // wave) can saturate the receiver's bg-executor pool instead of // queueing up serially behind a single per-node mutex. if (q.size() >= kAutoFlushThreshold - && m_perNodeInflight[nodeIndex] < kMaxInflightPerNode) { + && m_perNodeInflight[nodeIndex] < m_maxInflightPerNode.load(std::memory_order_relaxed)) { toFlush.swap(q); m_remoteQueueSize.fetch_sub(toFlush.size(), std::memory_order_relaxed); ++m_perNodeInflight[nodeIndex]; @@ -585,7 +596,7 @@ namespace SPTAG::SPANN { std::atomic m_inflightAppendFlushes{0}; std::unordered_map m_perNodeInflight; // guarded by m_appendQueueMutex static constexpr size_t kAutoFlushThreshold = 50000; - static constexpr int kMaxInflightPerNode = 4; + std::atomic m_maxInflightPerNode{4}; std::mutex& GetPerNodeAppendFlushMutex(int nodeIndex) { std::lock_guard lk(m_perNodeAppendFlushMutexMapLock); diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index c92630616..36d49bbfa 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -404,6 +404,19 @@ namespace SPTAG::SPANN { m_worker = router; if (!m_worker) return; + // Push RPC tuning from SPANN options (RemoteAppend*) so the + // hardcoded defaults in RemotePostingOps/WorkerNode get + // overridden by whatever the ini file specified. Pushing per + // SetWorker call (rather than once at WorkerNode construction) + // means a hot reconfigure via index reload picks up new + // values automatically. + if (m_opt) { + m_worker->SetRpcChunkSize(m_opt->m_remoteAppendChunkSize); + m_worker->SetRpcRetry(m_opt->m_remoteAppendRetry); + m_worker->SetRpcTimeoutSec(m_opt->m_remoteAppendTimeoutSec); + m_worker->SetRpcMaxInflightPerNode(m_opt->m_remoteAppendMaxInflight); + } + WireJobSubmitterIfReady(); // Claim ownership so the matching destructor's IfOwner check diff --git a/AnnService/inc/Core/SPANN/Options.h b/AnnService/inc/Core/SPANN/Options.h index 2c9c8865e..0bbe4a90a 100644 --- a/AnnService/inc/Core/SPANN/Options.h +++ b/AnnService/inc/Core/SPANN/Options.h @@ -127,6 +127,12 @@ namespace SPTAG { int m_versionCacheMaxChunks; int m_asyncRpcMaxInflight; + // Distributed RemotePostingOps RPC tuning + int m_remoteAppendChunkSize; + int m_remoteAppendRetry; + int m_remoteAppendTimeoutSec; + int m_remoteAppendMaxInflight; + // GPU building int m_gpuSSDNumTrees; int m_gpuSSDLeafSize; diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index 50823168d..c1b268c9b 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -125,6 +125,12 @@ DefineSSDParameter(m_versionCacheTTLMs, int, 0, "VersionCacheTTLMs") DefineSSDParameter(m_versionCacheMaxChunks, int, 10000, "VersionCacheMaxChunks") DefineSSDParameter(m_asyncRpcMaxInflight, int, 0, "AsyncRpcMaxInflight") +// Distributed RemotePostingOps RPC tuning +DefineSSDParameter(m_remoteAppendChunkSize, int, 3000, "RemoteAppendChunkSize") +DefineSSDParameter(m_remoteAppendRetry, int, 3, "RemoteAppendRetry") +DefineSSDParameter(m_remoteAppendTimeoutSec, int, 180, "RemoteAppendTimeoutSec") +DefineSSDParameter(m_remoteAppendMaxInflight, int, 4, "RemoteAppendMaxInflight") + // GPU Building DefineSSDParameter(m_gpuSSDNumTrees, int, 100, "GPUSSDNumTrees") DefineSSDParameter(m_gpuSSDLeafSize, int, 200, "GPUSSDLeafSize") From 9619b2fda2134bcb5c9718833fc5422fde1763c8 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 09:23:27 +0000 Subject: [PATCH 16/48] Async Split/Merge jobs: retry counter + re-enqueue on failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Design says async Split/MergeAsync jobs must be safe-to-retry from any compute node (Section: Async Job Fault Tolerance). Previous code recorded a non-Success ret into m_asyncStatus and silently dropped the job — a transient failure (TiKV blip, remote-lock timeout, etc.) permanently lost the split/merge. Both MergeAsyncJob and SplitAsyncJob now carry an attempts counter. On non-Success, if attempts+1 < m_asyncJobMaxRetry (new SPANN option, default 3), the job re-adds itself to m_splitThreadPool without touching the in-flight counter, so the outer drain loop still accounts for it. After MaxRetry exhaustion the failure surfaces via m_asyncStatus as before, plus a clear LL_Error log identifying the head and attempt count. Idempotency requirements for safe retry are already in place: - Owner check (IsRemoteOwnedHead) drops remote heads immediately - ContainSample liveness gate inside Split/MergePostings - Re-locking the per-head RWLock on each entry - Read-deduplicate during the next split attempt for partial writes Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 43 ++++++++++++++++++- AnnService/inc/Core/SPANN/Options.h | 5 +++ .../inc/Core/SPANN/ParameterDefinitionList.h | 1 + 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 36d49bbfa..4044afad7 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -63,6 +63,7 @@ namespace SPTAG::SPANN { ExtraDynamicSearcher* m_extraIndex; SizeType m_headID; std::function m_callback; + int m_attempts = 0; public: MergeAsyncJob(ExtraDynamicSearcher* extraIndex, SizeType headID, std::function p_callback) : m_extraIndex(extraIndex), m_headID(headID), m_callback(std::move(p_callback)) {} @@ -73,8 +74,28 @@ namespace SPTAG::SPANN { } inline void exec(void* p_workSpace, IAbortOperation* p_abort) override { ErrorCode ret = m_extraIndex->MergePostings((ExtraWorkSpace*)p_workSpace, m_headID); - if (ret != ErrorCode::Success) + if (ret != ErrorCode::Success) { + int maxRetry = m_extraIndex->m_opt + ? m_extraIndex->m_opt->m_asyncJobMaxRetry : 0; + if (m_attempts + 1 < maxRetry) { + // Async-job fault-tolerance contract: merges are + // safe to retry idempotently (the owner check, the + // ContainSample liveness gate, and the locked RMW + // all re-evaluate on each attempt). Re-enqueue + // without touching m_mergeJobsInFlight so the + // outer "wait for in-flight" loop still sees us. + ++m_attempts; + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "MergeAsyncJob: head=%lld attempt=%d failed ret=%d, re-enqueueing\n", + (std::int64_t)m_headID, m_attempts, (int)ret); + m_extraIndex->m_splitThreadPool->add(this); + return; // skip cleanup; Job lives on + } m_extraIndex->m_asyncStatus = ret; + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "MergeAsyncJob: head=%lld giving up after %d attempts ret=%d\n", + (std::int64_t)m_headID, m_attempts + 1, (int)ret); + } m_extraIndex->m_mergeJobsInFlight--; m_extraIndex->m_totalMergeCompleted++; if (m_callback != nullptr) { @@ -89,6 +110,7 @@ namespace SPTAG::SPANN { ExtraDynamicSearcher* m_extraIndex; SizeType m_headID; std::function m_callback; + int m_attempts = 0; public: SplitAsyncJob(ExtraDynamicSearcher* extraIndex, SizeType headID, std::function p_callback) : m_extraIndex(extraIndex), m_headID(headID), m_callback(std::move(p_callback)) {} @@ -105,8 +127,25 @@ namespace SPTAG::SPANN { m_extraIndex->m_totalSplitTimeUs += elapsedUs; uint64_t prevMax = m_extraIndex->m_maxSplitTimeUs.load(); while (elapsedUs > prevMax && !m_extraIndex->m_maxSplitTimeUs.compare_exchange_weak(prevMax, elapsedUs)); - if (ret != ErrorCode::Success) + if (ret != ErrorCode::Success) { + int maxRetry = m_extraIndex->m_opt + ? m_extraIndex->m_opt->m_asyncJobMaxRetry : 0; + if (m_attempts + 1 < maxRetry) { + // See MergeAsyncJob: splits are designed safe to + // retry from any compute node (read-deduplicate + // during the next attempt handles partial writes). + ++m_attempts; + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "SplitAsyncJob: head=%lld attempt=%d failed ret=%d, re-enqueueing\n", + (std::int64_t)m_headID, m_attempts, (int)ret); + m_extraIndex->m_splitThreadPool->add(this); + return; + } m_extraIndex->m_asyncStatus = ret; + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "SplitAsyncJob: head=%lld giving up after %d attempts ret=%d\n", + (std::int64_t)m_headID, m_attempts + 1, (int)ret); + } m_extraIndex->m_splitJobsInFlight--; m_extraIndex->m_totalSplitCompleted++; if (m_callback != nullptr) { diff --git a/AnnService/inc/Core/SPANN/Options.h b/AnnService/inc/Core/SPANN/Options.h index 0bbe4a90a..6542069c9 100644 --- a/AnnService/inc/Core/SPANN/Options.h +++ b/AnnService/inc/Core/SPANN/Options.h @@ -133,6 +133,11 @@ namespace SPTAG { int m_remoteAppendTimeoutSec; int m_remoteAppendMaxInflight; + // Async Split/Merge job retry count. Distributed design + // requires async jobs to be safe-to-retry — see Async Job + // Fault Tolerance section. + int m_asyncJobMaxRetry; + // GPU building int m_gpuSSDNumTrees; int m_gpuSSDLeafSize; diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index c1b268c9b..481947ca1 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -130,6 +130,7 @@ DefineSSDParameter(m_remoteAppendChunkSize, int, 3000, "RemoteAppendChunkSize") DefineSSDParameter(m_remoteAppendRetry, int, 3, "RemoteAppendRetry") DefineSSDParameter(m_remoteAppendTimeoutSec, int, 180, "RemoteAppendTimeoutSec") DefineSSDParameter(m_remoteAppendMaxInflight, int, 4, "RemoteAppendMaxInflight") +DefineSSDParameter(m_asyncJobMaxRetry, int, 3, "AsyncJobMaxRetry") // GPU Building DefineSSDParameter(m_gpuSSDNumTrees, int, 100, "GPUSSDNumTrees") From 864e2688a8fdddc94b985b673bc2bd9c0a514434 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 09:36:27 +0000 Subject: [PATCH 17/48] DispatchResult: carry SPTAG::ErrorCode back to driver Previously the dispatch result only signalled Success/Failed via a 1-byte enum, so any worker-side failure (TiKV unavailable, KeyNotFound during search, append rejection, etc.) collapsed into a generic 'Failed' that the driver couldn't distinguish or react to differently. Bump DispatchResult MirrorVersion 1 -> 2 and add m_errorCode (int32). Read/Write gated on mirror >= 2 so older peers stay compatible (they leave the field at 0). Driver-side HandleDispatchResult now logs the errorCode at LL_Error on failed paths, and the existing log line for every result echoes the code so post-mortem traces show exactly what each worker reported. Sample wiring: SPFreshTest's worker dispatch callback sets m_errorCode on its Unknown-command fallback. Other code paths (Search/Insert) already only fail through exceptions in the helpers, which the driver treats as crash-class events; the field is ready for future failure propagation work in those paths. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Core/SPANN/Distributed/DispatchCoordinator.h | 9 +++++++-- .../Core/SPANN/Distributed/DistributedProtocol.h | 15 +++++++++++++-- Test/src/SPFreshTest.cpp | 1 + 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h b/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h index 8bb32a7eb..ffd02f05c 100644 --- a/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h +++ b/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h @@ -306,9 +306,10 @@ namespace SPTAG::SPANN { } SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "DispatchCoordinator: Result id=%llu round=%u node=%d status=%d wallTime=%.3f\n", + "DispatchCoordinator: Result id=%llu round=%u node=%d status=%d errorCode=%d wallTime=%.3f\n", (unsigned long long)result.m_dispatchId, result.m_round, - result.m_nodeIndex, (int)result.m_status, result.m_wallTime); + result.m_nodeIndex, (int)result.m_status, (int)result.m_errorCode, + result.m_wallTime); std::shared_ptr state; { @@ -325,6 +326,10 @@ namespace SPTAG::SPANN { if (result.m_status != DispatchResult::Status::Success) { state->errors++; + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "DispatchCoordinator: dispatch %llu node=%d failed errorCode=%d\n", + (unsigned long long)result.m_dispatchId, result.m_nodeIndex, + (int)result.m_errorCode); } { diff --git a/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h b/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h index b4da82fcc..963ca6b35 100644 --- a/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h +++ b/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h @@ -392,9 +392,11 @@ namespace SPTAG::SPANN { }; /// Result from worker back to driver after executing a dispatch command. + /// MirrorVersion 2 added m_errorCode so failures can carry SPTAG::ErrorCode + /// detail back to the driver instead of collapsing into a boolean. struct DispatchResult { static constexpr std::uint16_t MajorVersion() { return 1; } - static constexpr std::uint16_t MirrorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 2; } enum class Status : std::uint8_t { Success = 0, Failed = 1 }; Status m_status = Status::Success; @@ -402,11 +404,16 @@ namespace SPTAG::SPANN { std::uint32_t m_round = 0; double m_wallTime = 0.0; std::int32_t m_nodeIndex = -1; // which worker sent this result + // SPTAG::ErrorCode cast to int32 (Success == 0). Populated by the + // worker's dispatch callback so the driver can distinguish e.g. + // KeyNotFound from disk-full from network-fail. Older peers (mirror + // 1) leave this at 0 even when m_status == Failed. + std::int32_t m_errorCode = 0; std::size_t EstimateBufferSize() const { return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t) + sizeof(std::uint64_t) + sizeof(std::uint32_t) + sizeof(double) - + sizeof(std::int32_t); + + sizeof(std::int32_t) * 2; } std::uint8_t* Write(std::uint8_t* p_buffer) const { @@ -418,6 +425,7 @@ namespace SPTAG::SPANN { p_buffer = SimpleWriteBuffer(m_round, p_buffer); p_buffer = SimpleWriteBuffer(m_wallTime, p_buffer); p_buffer = SimpleWriteBuffer(m_nodeIndex, p_buffer); + p_buffer = SimpleWriteBuffer(m_errorCode, p_buffer); return p_buffer; } @@ -436,6 +444,9 @@ namespace SPTAG::SPANN { if (mirrorVer >= 1) { p_buffer = SimpleReadBuffer(p_buffer, m_nodeIndex); } + if (mirrorVer >= 2) { + p_buffer = SimpleReadBuffer(p_buffer, m_errorCode); + } return p_buffer; } }; diff --git a/Test/src/SPFreshTest.cpp b/Test/src/SPFreshTest.cpp index 5bef228a3..2c754635e 100644 --- a/Test/src/SPFreshTest.cpp +++ b/Test/src/SPFreshTest.cpp @@ -2941,6 +2941,7 @@ void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount, SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Worker %d: Unknown command type %d\n", nodeIndex, (int)cmd.m_type); result.m_status = SPANN::DispatchResult::Status::Failed; + result.m_errorCode = static_cast(SPTAG::ErrorCode::Undefined); return result; }); From 1cd19f10a679b7c823accdb3697720b8c1552419 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 09:39:49 +0000 Subject: [PATCH 18/48] AppendCallback: HandleRaceCondition gate against in-flight split/merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Design's receive-side flow specifies a HandleRaceCondition step before the local Append callback runs: 'check whether the target HeadID is currently being split or merged on this node; if so, the append waits for the structural operation to commit before proceeding.' Without this, the existing wasMissing branch (which re-materializes a missing head from the sender's headVec) can resurrect a head that local Merge just deleted. The race is real but small — the per-head RWLock used by Append/Split/Merge already serializes RMW, but the head-index ContainSample check + AddHeadIndex resurrection happens outside that lock. Add ExtraDynamicSearcher::HandleRaceCondition(headID) that: 1. Peeks m_splitList / m_mergeList for the head. 2. If present, briefly acquires the per-head RWLock to wait for the structural op to commit. 3. Returns; the callback continues with a stable view, and the normal Append re-acquires the RWLock for the actual RMW. When the head is genuinely gone after the wait, the sender's later retry will see the updated head index (via HeadSync) and re-route to the new owner — exactly the path the design's Append-vs-Merge race section describes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 4044afad7..c97229af6 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -427,9 +427,41 @@ namespace SPTAG::SPANN { return m_initialVectorSize + (localVID - m_initialVectorSize) * numWorkers + GetWorkerNodeIndex(); } - // Idempotent: wires the receiver's BatchAppend Jobs onto our shared + // Receive-side race coordination: before applying a remote Append + // for headID, make sure no local Split or Merge is currently + // mutating the same head. Splits delete the original head and + // create new ones; merges delete a loser head. If we let the + // append's wasMissing branch run while a Split/Merge holds the + // RWLock, the AddHeadIndex resurrection would race the local + // DeleteIndex and we'd briefly bring a dead head back to life + // (only papered over by the eventual HeadSync from the structural + // op). Briefly acquiring the RWLock here serializes us behind + // the in-flight structural op without forking an explicit + // condition-variable channel. After the structural op completes + // its bookkeeping (lists drained, head index updated, HeadSync + // broadcast), the callback re-checks ContainSample with a stable + // view. When the head is genuinely gone, sender retries against + // the updated head index and routes to the new owner. + void HandleRaceCondition(SizeType headID) { + bool inSplit = false, inMerge = false; + { + std::shared_lock sl(m_splitListLock); + inSplit = (m_splitList.find(headID) != m_splitList.end()); + } + { + std::shared_lock sl(m_mergeListLock); + inMerge = (m_mergeList.find(headID) != m_mergeList.end()); + } + if (!inSplit && !inMerge) return; + // Wait until the structural op releases the per-head RWLock. + // Acquire-and-immediately-release; the Append below re-locks. + std::unique_lock w(m_rwLocks[headID]); + (void)w; + } + // SPDKThreadPool. Called both after pool creation and from // SetWorker(); whichever happens last actually binds the submitter. + // Idempotent: wires the receiver's BatchAppend Jobs onto our shared void WireJobSubmitterIfReady() { if (!m_worker || !m_splitThreadPool) return; auto pool = m_splitThreadPool; @@ -467,6 +499,13 @@ namespace SPTAG::SPANN { m_worker->SetAppendCallback(m_layer, [this](SizeType headID, std::shared_ptr headVec, int appendNum, std::string& appendPosting) -> ErrorCode { + // Per-design HandleRaceCondition: wait for any local + // Split/Merge on this head to commit before we look at + // the head index. Otherwise the wasMissing branch + // below can resurrect a head that the structural op + // just deleted. + HandleRaceCondition(headID); + // Reuse SPDKThreadPool's per-worker pre-allocated workspace // when called from BatchAppendItemJob on m_splitThreadPool. ExtraWorkSpace localWorkSpace; From dca197ba0c2ca580f57b01800ebe7953ccbead26 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 09:44:12 +0000 Subject: [PATCH 19/48] SPANN distributed: TTL-based remote lock lease MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the per-bucket atomic remote-lock cache with a dedicated RemoteLeaseTable that tracks per-bucket expiry timestamps. This lets the owner auto-reclaim a remote lock when the holder crashes or stalls beyond RemoteLockTtlMs (default 30s) instead of blocking Split/Merge forever. New file: AnnService/inc/Core/SPANN/Distributed/RemoteLeaseTable.h. Fencing tokens deferred — they require a protocol-mirror bump on RemoteLock{Request,Response} and a callback signature change; will be added when the watchdog/resend path lands. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Core/SPANN/Distributed/RemoteLeaseTable.h | 109 ++++++++++++++++++ .../inc/Core/SPANN/ExtraDynamicSearcher.h | 31 ++--- AnnService/inc/Core/SPANN/Options.h | 6 + .../inc/Core/SPANN/ParameterDefinitionList.h | 1 + 4 files changed, 134 insertions(+), 13 deletions(-) create mode 100644 AnnService/inc/Core/SPANN/Distributed/RemoteLeaseTable.h diff --git a/AnnService/inc/Core/SPANN/Distributed/RemoteLeaseTable.h b/AnnService/inc/Core/SPANN/Distributed/RemoteLeaseTable.h new file mode 100644 index 000000000..2d6881c7e --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/RemoteLeaseTable.h @@ -0,0 +1,109 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// +// RemoteLeaseTable +// ---------------- +// Owner-side bookkeeping for cross-node merge / structural-op locks. +// Backs the per-bucket advisory flag that local Split / Merge consult via +// WaitForRemoteBucketUnlocked before mutating a head whose ownership is +// shared with a remote candidate. +// +// Design contract (see Async Job Fault Tolerance): +// * Each acquired lock carries a bounded TTL. If the holder crashes or +// stops responding, the lease auto-expires and the owner is free to +// proceed (or grant the bucket to another holder). +// * No keepalive: structural ops are expected to complete in under one +// TTL. If they exceed the TTL, the holder must retry the whole job; +// the owner has already released the lease. +// +// The TTL is the single configurable knob (default 30s, matching the +// design's lease-TTL recommendation). A future iteration can add a +// fencing token so a zombie holder that resumes after expiry has its +// late unlock rejected — that requires a protocol bump on +// RemoteLockRequest/Response, which we'll do once a real owner-restart +// test exists to validate the change. For now the in-memory lease +// table provides the safety net the design requires: zombie holders +// never indefinitely block the owner. + +#ifndef _SPTAG_SPANN_DISTRIBUTED_REMOTELEASETABLE_H_ +#define _SPTAG_SPANN_DISTRIBUTED_REMOTELEASETABLE_H_ + +#include +#include +#include +#include + +namespace SPTAG::SPANN { + + class RemoteLeaseTable { + public: + using Clock = std::chrono::steady_clock; + + // bucketCount must match the searcher's lock-pool bucket count + // (FineGrainedRWLock::BucketIndex range). Allocates one slot per + // bucket; slots start in the unlocked state (expiry == 0). + explicit RemoteLeaseTable(std::size_t bucketCount, int ttlMs = 30000) + : m_count(bucketCount + 1), m_ttlMs(ttlMs) + { + m_expiry = std::make_unique[]>(m_count); + for (std::size_t i = 0; i < m_count; ++i) m_expiry[i].store(0, std::memory_order_relaxed); + } + + void SetTtlMs(int ttlMs) { if (ttlMs > 0) m_ttlMs.store(ttlMs, std::memory_order_relaxed); } + int GetTtlMs() const { return m_ttlMs.load(std::memory_order_relaxed); } + + // Try to grant a lease for bucket. Succeeds iff bucket is unlocked + // OR the previous holder's lease has expired (auto-reclamation). + // Records the new expiry deadline. + bool TryAcquire(unsigned bucket) { + if (bucket >= m_count) return false; + const std::int64_t nowNs = NowNs(); + const std::int64_t ttlNs = (std::int64_t)m_ttlMs.load(std::memory_order_relaxed) * 1'000'000LL; + std::int64_t current = m_expiry[bucket].load(std::memory_order_acquire); + for (;;) { + if (current != 0 && current > nowNs) return false; // still held by live lease + const std::int64_t newExpiry = nowNs + ttlNs; + if (m_expiry[bucket].compare_exchange_weak(current, newExpiry, + std::memory_order_acq_rel)) return true; + // CAS lost: re-evaluate with the updated `current`. + } + } + + // Release the lease unconditionally. In the current protocol the + // caller is trusted (holder cooperates). When a fencing token is + // added, this becomes a token-validated release. + void Release(unsigned bucket) { + if (bucket >= m_count) return; + m_expiry[bucket].store(0, std::memory_order_release); + } + + // True iff the lease is currently held AND not expired. Auto-clears + // expired entries so a stuck holder doesn't permanently block the + // owner's Split/Merge path. + bool IsLocked(unsigned bucket) { + if (bucket >= m_count) return false; + std::int64_t current = m_expiry[bucket].load(std::memory_order_acquire); + if (current == 0) return false; + if (current > NowNs()) return true; + // Expired: try to clear (best-effort; loss of race is OK because + // a concurrent holder either renewed or is also expired). + std::int64_t expected = current; + m_expiry[bucket].compare_exchange_strong(expected, 0, + std::memory_order_acq_rel); + return false; + } + + private: + static std::int64_t NowNs() { + return std::chrono::duration_cast( + Clock::now().time_since_epoch()).count(); + } + + std::size_t m_count; + std::atomic m_ttlMs; + std::unique_ptr[]> m_expiry; + }; + +} // namespace SPTAG::SPANN + +#endif // _SPTAG_SPANN_DISTRIBUTED_REMOTELEASETABLE_H_ diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index c97229af6..a97369d08 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -20,6 +20,7 @@ #include "inc/Core/Common/TiKVVersionMap.h" #include "ExtraFileController.h" #include "Distributed/WorkerNode.h" +#include "Distributed/RemoteLeaseTable.h" #include #include #include @@ -266,9 +267,11 @@ namespace SPTAG::SPANN { COMMON::FineGrainedRWLock m_rwLocks; - // Per-bucket flags for remote (cross-node) locking. + // Per-bucket lease table for remote (cross-node) locking. Each + // entry carries a TTL so a crashed/disconnected holder doesn't + // permanently block Split/Merge here. See RemoteLeaseTable.h. static constexpr int kRemoteLockPoolSize = 32767; - std::unique_ptr[]> m_remoteBucketLocked; + std::unique_ptr m_remoteLeaseTable; IndexStats m_stat; @@ -394,8 +397,11 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "[CONFIG] layer=%d DistributedVersionMap=%s SearchCheckVersionMapOnlyLayer0=%s UseMultiChunkPosting=%s PostingPageLimit=%d\n", layer, p_opt.m_distributedVersionMap ? "true" : "false", p_opt.m_searchCheckVersionMapOnlyLayer0 ? "true" : "false", p_opt.m_useMultiChunkPosting ? "true" : "false", p_opt.m_postingPageLimit); - // Initialize per-bucket remote lock flags - m_remoteBucketLocked.reset(new std::atomic[kRemoteLockPoolSize + 1]{}); + // Initialize per-bucket remote lease table. TTL is picked up + // from SPANN option RemoteLockTtlMs (default 30000ms = 30s). + m_remoteLeaseTable = std::make_unique( + kRemoteLockPoolSize, + p_opt.m_remoteLockTtlMs > 0 ? p_opt.m_remoteLockTtlMs : 30000); } ~ExtraDynamicSearcher() { @@ -570,22 +576,19 @@ namespace SPTAG::SPANN { } }); - // Remote lock callback: per-bucket atomic flags + // Remote lock callback: per-bucket leases with TTL auto-release. m_worker->SetRemoteLockCallback(m_layer, [this](SizeType headID, bool lock) -> bool { unsigned bucket = COMMON::FineGrainedRWLock::BucketIndex(static_cast(headID)); if (lock) { - bool expected = false; - if (!m_remoteBucketLocked[bucket].compare_exchange_strong(expected, true)) { - return false; - } + if (!m_remoteLeaseTable->TryAcquire(bucket)) return false; if (!m_rwLocks[headID].try_lock()) { - m_remoteBucketLocked[bucket].store(false); + m_remoteLeaseTable->Release(bucket); return false; } m_rwLocks[headID].unlock(); return true; } else { - m_remoteBucketLocked[bucket].store(false); + m_remoteLeaseTable->Release(bucket); return true; } }); @@ -600,14 +603,16 @@ namespace SPTAG::SPANN { } // Owner-side wait for any in-flight remote lock on this bucket. + // RemoteLeaseTable::IsLocked auto-clears expired leases, so a + // zombie holder beyond TTL doesn't stall Split/Merge here. void WaitForRemoteBucketUnlocked(SizeType headID) const { if (!m_worker || !m_worker->IsEnabled()) return; unsigned bucket = COMMON::FineGrainedRWLock::BucketIndex(static_cast(headID)); - if (!m_remoteBucketLocked[bucket].load(std::memory_order_acquire)) return; + if (!m_remoteLeaseTable->IsLocked(bucket)) return; constexpr int kMaxRemoteBucketWaitMs = 5000; auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(kMaxRemoteBucketWaitMs); - while (m_remoteBucketLocked[bucket].load(std::memory_order_acquire)) { + while (m_remoteLeaseTable->IsLocked(bucket)) { if (std::chrono::steady_clock::now() > deadline) { SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "WaitForRemoteBucketUnlocked: headID=%lld bucket=%u stuck for %d ms, proceeding\n", diff --git a/AnnService/inc/Core/SPANN/Options.h b/AnnService/inc/Core/SPANN/Options.h index 6542069c9..e8546c17f 100644 --- a/AnnService/inc/Core/SPANN/Options.h +++ b/AnnService/inc/Core/SPANN/Options.h @@ -138,6 +138,12 @@ namespace SPTAG { // Fault Tolerance section. int m_asyncJobMaxRetry; + // Remote lock lease TTL in milliseconds (default 30000). + // Bounds how long a crashed or disconnected holder can block + // the owner's Split/Merge path; the owner auto-reclaims the + // lease on expiry. Match this to your structural-op p99. + int m_remoteLockTtlMs; + // GPU building int m_gpuSSDNumTrees; int m_gpuSSDLeafSize; diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index 481947ca1..700a5d592 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -131,6 +131,7 @@ DefineSSDParameter(m_remoteAppendRetry, int, 3, "RemoteAppendRetry") DefineSSDParameter(m_remoteAppendTimeoutSec, int, 180, "RemoteAppendTimeoutSec") DefineSSDParameter(m_remoteAppendMaxInflight, int, 4, "RemoteAppendMaxInflight") DefineSSDParameter(m_asyncJobMaxRetry, int, 3, "AsyncJobMaxRetry") +DefineSSDParameter(m_remoteLockTtlMs, int, 30000, "RemoteLockTtlMs") // GPU Building DefineSSDParameter(m_gpuSSDNumTrees, int, 100, "GPUSSDNumTrees") From 489ff4ee043a0eec6b8b2dfc51fd75ccfb8a2aa7 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 09:47:24 +0000 Subject: [PATCH 20/48] SPANN distributed: watchdog for failed async append batches QueueRemoteAppend's auto-flush is fire-and-forget: when the receiver is briefly unreachable the batch was previously dropped after a single log line. This breaks the distributed design's at-least-once async job contract. Add AsyncJobWatchdog (new file under Distributed/) that owns timeout- driven, bounded exponential-backoff resends in a single background thread. Wire WorkerNode's auto-flush failure path to hand the batch to the watchdog instead of dropping it. RemoteAppend is idempotent on the receive side (per-posting RMW), so at-least-once is safe. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Core/SPANN/Distributed/AsyncJobWatchdog.h | 177 ++++++++++++++++++ .../inc/Core/SPANN/Distributed/WorkerNode.h | 28 ++- 2 files changed, 203 insertions(+), 2 deletions(-) create mode 100644 AnnService/inc/Core/SPANN/Distributed/AsyncJobWatchdog.h diff --git a/AnnService/inc/Core/SPANN/Distributed/AsyncJobWatchdog.h b/AnnService/inc/Core/SPANN/Distributed/AsyncJobWatchdog.h new file mode 100644 index 000000000..31bb8627f --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/AsyncJobWatchdog.h @@ -0,0 +1,177 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef _SPTAG_SPANN_DISTRIBUTED_ASYNCJOBWATCHDOG_H_ +#define _SPTAG_SPANN_DISTRIBUTED_ASYNCJOBWATCHDOG_H_ + +#include "inc/Helper/Logging.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace SPTAG { +namespace SPANN { +namespace Distributed { + +// AsyncJobWatchdog tracks async (fire-and-forget) inter-node dispatches +// and resends them on timeout or transport failure. +// +// Today the only fire-and-forget path is QueueRemoteAppend auto-flush in +// WorkerNode: it ships a batch of RemoteAppendRequests to a peer with no +// synchronous error propagation. Without a watchdog, transient network +// or peer-crash failures silently lose those appends. +// +// The watchdog is intentionally small: callers register a batch with a +// resend callback; the watchdog reschedules the callback up to +// MaxAttempts with exponential backoff. RemoteAppend is idempotent on +// the receive side (HandleRemoteAppend de-dups via per-posting RMW), so +// at-least-once delivery is safe. +class AsyncJobWatchdog { +public: + using ResendFn = std::function; // returns true on success + + AsyncJobWatchdog(int maxAttempts = 3, + int initialBackoffMs = 200) + : m_maxAttempts(maxAttempts), + m_initialBackoffMs(initialBackoffMs), + m_stop(false) { + m_worker = std::thread([this]() { Loop(); }); + } + + ~AsyncJobWatchdog() { + { + std::lock_guard lk(m_mutex); + m_stop = true; + } + m_cv.notify_all(); + if (m_worker.joinable()) m_worker.join(); + } + + // Submit a fire-and-forget dispatch. The watchdog calls `resend` if + // and only if a prior attempt has failed; the caller is responsible + // for the initial attempt. After success, call MarkSuccess(id). + uint64_t Track(ResendFn resend, std::string tag = "") { + std::lock_guard lk(m_mutex); + uint64_t id = ++m_nextId; + Entry e; + e.resend = std::move(resend); + e.attempts = 0; + e.tag = std::move(tag); + e.nextDeadline = std::chrono::steady_clock::time_point::max(); + m_entries.emplace(id, std::move(e)); + return id; + } + + void MarkSuccess(uint64_t id) { + std::lock_guard lk(m_mutex); + m_entries.erase(id); + } + + // Schedule a resend after backoff for entry `id`. Called by producer + // when its synchronous attempt fails. Gives up after MaxAttempts. + void MarkFailureAndScheduleResend(uint64_t id) { + std::unique_lock lk(m_mutex); + auto it = m_entries.find(id); + if (it == m_entries.end()) return; + if (++it->second.attempts >= m_maxAttempts) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "AsyncJobWatchdog: %s giving up after %d attempts\n", + it->second.tag.c_str(), it->second.attempts); + m_entries.erase(it); + return; + } + int backoffMs = m_initialBackoffMs << (it->second.attempts - 1); + it->second.nextDeadline = + std::chrono::steady_clock::now() + + std::chrono::milliseconds(backoffMs); + lk.unlock(); + m_cv.notify_all(); + } + + size_t OutstandingCount() const { + std::lock_guard lk(m_mutex); + return m_entries.size(); + } + +private: + struct Entry { + ResendFn resend; + int attempts; + std::string tag; + std::chrono::steady_clock::time_point nextDeadline; + }; + + void Loop() { + std::unique_lock lk(m_mutex); + while (!m_stop) { + auto now = std::chrono::steady_clock::now(); + auto nextWake = now + std::chrono::seconds(1); + std::vector due; + for (auto& kv : m_entries) { + if (kv.second.nextDeadline <= now) { + due.push_back(kv.first); + } else if (kv.second.nextDeadline < nextWake) { + nextWake = kv.second.nextDeadline; + } + } + for (uint64_t id : due) { + auto it = m_entries.find(id); + if (it == m_entries.end()) continue; + ResendFn fn = it->second.resend; + std::string tag = it->second.tag; + int attempt = it->second.attempts; + it->second.nextDeadline = + std::chrono::steady_clock::time_point::max(); + lk.unlock(); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "AsyncJobWatchdog: resending %s attempt=%d\n", + tag.c_str(), attempt + 1); + bool ok = false; + try { ok = fn(); } catch (...) { ok = false; } + lk.lock(); + if (ok) { + m_entries.erase(id); + } else { + auto it2 = m_entries.find(id); + if (it2 != m_entries.end()) { + if (++it2->second.attempts >= m_maxAttempts) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "AsyncJobWatchdog: %s giving up after %d attempts\n", + it2->second.tag.c_str(), it2->second.attempts); + m_entries.erase(it2); + } else { + int backoffMs = + m_initialBackoffMs << (it2->second.attempts - 1); + it2->second.nextDeadline = + std::chrono::steady_clock::now() + + std::chrono::milliseconds(backoffMs); + } + } + } + } + m_cv.wait_until(lk, nextWake, [this]() { return m_stop; }); + } + } + + mutable std::mutex m_mutex; + std::condition_variable m_cv; + std::unordered_map m_entries; + uint64_t m_nextId = 0; + int m_maxAttempts; + int m_initialBackoffMs; + bool m_stop; + std::thread m_worker; +}; + +} // namespace Distributed +} // namespace SPANN +} // namespace SPTAG + +#endif // _SPTAG_SPANN_DISTRIBUTED_ASYNCJOBWATCHDOG_H_ diff --git a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h index d50edcfd5..40d537379 100644 --- a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h +++ b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h @@ -5,6 +5,7 @@ #define _SPTAG_SPANN_WORKERNODE_H_ #include "inc/Core/SPANN/Distributed/NetworkNode.h" +#include "inc/Core/SPANN/Distributed/AsyncJobWatchdog.h" #include "inc/Helper/KeyValueIO.h" #include "inc/Helper/CommonHelper.h" #include "inc/Socket/SimpleSerialization.h" @@ -279,8 +280,25 @@ namespace SPTAG::SPANN { while (true) { ErrorCode ret = SendBatchRemoteAppend(nodeIndex, *items); if (ret != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "QueueRemoteAppend auto-flush: batch to node %d failed (%zu items)\n", + // Hand the failed batch to the watchdog. It owns + // backoff/retry until MaxAttempts; RemoteAppend is + // idempotent on the receive side so at-least-once + // delivery is safe. + auto retryItems = + std::make_shared>(*items); + int n = nodeIndex; + auto self = this; + std::string tag = "QueueRemoteAppend node=" + + std::to_string(n) + " items=" + + std::to_string(retryItems->size()); + uint64_t id = m_asyncWatchdog.Track( + [self, n, retryItems]() { + return self->SendBatchRemoteAppend(n, *retryItems) + == ErrorCode::Success; + }, std::move(tag)); + m_asyncWatchdog.MarkFailureAndScheduleResend(id); + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "QueueRemoteAppend auto-flush: batch to node %d failed (%zu items), handed to watchdog\n", nodeIndex, items->size()); } items->clear(); @@ -598,6 +616,12 @@ namespace SPTAG::SPANN { static constexpr size_t kAutoFlushThreshold = 50000; std::atomic m_maxInflightPerNode{4}; + // Resends failed async fire-and-forget batches with exponential + // backoff (see AsyncJobWatchdog.h). Constructed last so it tears + // down before the queues; declared here so destruction order + // matches the design's fault-tolerance contract. + Distributed::AsyncJobWatchdog m_asyncWatchdog{3, 200}; + std::mutex& GetPerNodeAppendFlushMutex(int nodeIndex) { std::lock_guard lk(m_perNodeAppendFlushMutexMapLock); auto it = m_perNodeAppendFlushMutex.find(nodeIndex); From 7093d4060f0221a55d9a413fb051b4b579d3bf34 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 09:52:13 +0000 Subject: [PATCH 21/48] SPANN distributed: durable HeadSync log + Split WAL scaffolding Adds two TiKV-backed durability primitives matching the distributed design's HeadSync Job Fault Tolerance and Split Path WAL sections: * HeadSyncLog (new file Distributed/HeadSyncLog.h) Per-shard monotonically-versioned log of HeadSyncEntry, keyed by 'hs/e//', with 'hs/v/' as the published tip and 'hs/c//' as each node's applied cursor. Exposes Append/ReadSince/LoadCursor/StoreCursor and an optional background reconciler thread. Raw KV (no txn) per design guidance; producer-side per-shard mutex serializes version bumps and the next reader catches up via cursor replay. * SplitWAL (new file Distributed/SplitWAL.h) Stage-tracked record under 'wal/split//' so that a cross-owner split can be GC'd after partial failure (one side written, the other not). Wire-in: ExtraDynamicSearcher's BroadcastHeadSync now persists entries to HeadSyncLog before issuing the in-memory broadcast. Broadcast remains the latency path; TiKV is the source of truth so lost or duplicated broadcasts no longer threaten correctness. SplitWAL Begin/Commit hooks at the split site, and reconciler thread activation, are scaffolded behind the new members but not yet wired into the split flow; they are sequential follow-ups that require distributed integration testing. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/Distributed/HeadSyncLog.h | 282 ++++++++++++++++++ .../inc/Core/SPANN/Distributed/SplitWAL.h | 105 +++++++ .../inc/Core/SPANN/ExtraDynamicSearcher.h | 29 ++ 3 files changed, 416 insertions(+) create mode 100644 AnnService/inc/Core/SPANN/Distributed/HeadSyncLog.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/SplitWAL.h diff --git a/AnnService/inc/Core/SPANN/Distributed/HeadSyncLog.h b/AnnService/inc/Core/SPANN/Distributed/HeadSyncLog.h new file mode 100644 index 000000000..eb71f666e --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/HeadSyncLog.h @@ -0,0 +1,282 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef _SPTAG_SPANN_DISTRIBUTED_HEADSYNCLOG_H_ +#define _SPTAG_SPANN_DISTRIBUTED_HEADSYNCLOG_H_ + +#include "inc/Core/Common.h" +#include "inc/Helper/KeyValueIO.h" +#include "inc/Helper/Logging.h" +#include "inc/Core/SPANN/Distributed/DistributedProtocol.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace SPTAG { +namespace SPANN { +namespace Distributed { + +// HeadSyncLog: durable per-shard log of HeadSync entries in TiKV. +// +// Per the distributed design, the canonical source of truth for head +// topology changes is TiKV, not the in-memory broadcast. Each shard +// (today: per owner node) holds: +// * `hs/v/` little-endian uint64 latest version +// * `hs/e//` serialized HeadSyncEntry payload +// * `hs/c//` little-endian uint64 applied version +// +// Versions are monotonically increasing per shard. Producers serialize +// their version-bump under `m_appendMutex` and write entry-then-version; +// readers tolerate a transient lag where version points slightly past +// the last entry (treat the missing entry as not-yet-visible and retry). +// TiKV's raw KV API gives no multi-key atomicity; the design (per user +// direction) accepts this and relies on idempotent apply + cursor +// catch-up to converge. +// +// This header is intentionally self-contained; it does not depend on +// any SPANN searcher type. ExtraDynamicSearcher wires it up by +// constructing one instance per layer-0 ExtraDynamicSearcher, calling +// Append() in BroadcastHeadSync, and supplying an ApplyFn callback for +// the reconciler. +class HeadSyncLog { +public: + // Decoded entry returned by ReadSince. Carries the version so the + // reconciler can advance its cursor strictly past it on success. + struct VersionedEntry { + std::uint64_t version; + HeadSyncEntry entry; + }; + + using ApplyFn = std::function; + + HeadSyncLog(std::shared_ptr db, + int nodeIndex, + int reconcileIntervalMs = 2000) + : m_db(std::move(db)), + m_nodeIndex(nodeIndex), + m_reconcileIntervalMs(reconcileIntervalMs), + m_stop(false) {} + + ~HeadSyncLog() { Stop(); } + + // Append a batch of entries to the given shard's log. Returns the + // version of the last written entry (>= 1 on success, 0 on failure). + std::uint64_t Append(int shard, const std::vector& entries) { + if (!m_db || entries.empty()) return 0; + std::lock_guard lk(GetShardAppendMutex(shard)); + std::uint64_t base = LoadLatestVersion(shard); + std::vector keys; + std::vector values; + keys.reserve(entries.size()); + values.reserve(entries.size()); + std::uint64_t v = base; + for (const auto& e : entries) { + ++v; + keys.push_back(MakeEntryKey(shard, v)); + values.push_back(EncodeEntry(e)); + } + auto ec = m_db->MultiPut(keys, values, kTimeout, nullptr); + if (ec != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "HeadSyncLog::Append shard=%d entries=%zu MultiPut failed (%d)\n", + shard, entries.size(), (int)ec); + return 0; + } + ec = m_db->Put(MakeVersionKey(shard), + EncodeUint64(v), + kTimeout, nullptr); + if (ec != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "HeadSyncLog::Append shard=%d version Put failed (%d), entries durable but version lag\n", + shard, (int)ec); + // Entries are durable; the next Append (or reconciler in + // another node) will discover them via probe. + return v; + } + return v; + } + + // Read latest version that the shard publisher has advanced to. + // Returns 0 if no version is published yet or on read failure. + std::uint64_t GetLatestVersion(int shard) const { return LoadLatestVersion(shard); } + + // Read entries (cursor, latest], one at a time. Stops at the first + // missing version (which indicates writer lag). + std::vector ReadSince(int shard, + std::uint64_t cursor, + std::uint64_t latest, + size_t maxBatch = 256) const { + std::vector out; + if (!m_db || cursor >= latest) return out; + size_t want = std::min(maxBatch, + static_cast(latest - cursor)); + std::vector keys; + keys.reserve(want); + for (size_t i = 0; i < want; ++i) { + keys.push_back(MakeEntryKey(shard, cursor + 1 + i)); + } + std::vector values; + auto ec = m_db->MultiGet(keys, &values, kTimeout, nullptr); + if (ec != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "HeadSyncLog::ReadSince shard=%d MultiGet failed (%d)\n", + shard, (int)ec); + return out; + } + for (size_t i = 0; i < values.size(); ++i) { + if (values[i].empty()) break; // writer lag; stop here + VersionedEntry ve; + ve.version = cursor + 1 + i; + if (!DecodeEntry(values[i], ve.entry)) break; + out.push_back(std::move(ve)); + } + return out; + } + + // Cursor I/O for a (node, shard) pair. + std::uint64_t LoadCursor(int shard) const { + if (!m_db) return 0; + std::string out; + auto ec = m_db->Get(MakeCursorKey(m_nodeIndex, shard), &out, kTimeout, nullptr); + if (ec != ErrorCode::Success || out.size() < sizeof(std::uint64_t)) return 0; + return DecodeUint64(out); + } + + bool StoreCursor(int shard, std::uint64_t version) { + if (!m_db) return false; + auto ec = m_db->Put(MakeCursorKey(m_nodeIndex, shard), + EncodeUint64(version), + kTimeout, nullptr); + return ec == ErrorCode::Success; + } + + // Start a background reconciler that wakes every interval and, for + // each known shard, fetches missing entries since the local cursor + // and feeds them to `apply`. `apply` must be idempotent. + void StartReconciler(std::vector shards, ApplyFn apply) { + if (m_reconciler.joinable()) return; + m_shards = std::move(shards); + m_apply = std::move(apply); + m_stop = false; + m_reconciler = std::thread([this]() { ReconcileLoop(); }); + } + + void Stop() { + { + std::lock_guard lk(m_cvMutex); + m_stop = true; + } + m_cv.notify_all(); + if (m_reconciler.joinable()) m_reconciler.join(); + } + +private: + static constexpr auto kTimeout = std::chrono::microseconds(2'000'000); + + static std::string EncodeUint64(std::uint64_t v) { + std::string s(sizeof(v), '\0'); + memcpy(&s[0], &v, sizeof(v)); + return s; + } + static std::uint64_t DecodeUint64(const std::string& s) { + std::uint64_t v = 0; + if (s.size() >= sizeof(v)) memcpy(&v, s.data(), sizeof(v)); + return v; + } + static std::string MakeVersionKey(int shard) { + return "hs/v/" + std::to_string(shard); + } + static std::string MakeEntryKey(int shard, std::uint64_t version) { + // Big-endian version so byte-range scans (if added later) are + // monotonically sorted. + std::string s = "hs/e/" + std::to_string(shard) + "/"; + char be[8]; + for (int i = 0; i < 8; ++i) be[i] = static_cast((version >> ((7 - i) * 8)) & 0xff); + s.append(be, 8); + return s; + } + static std::string MakeCursorKey(int node, int shard) { + return "hs/c/" + std::to_string(node) + "/" + std::to_string(shard); + } + + static std::string EncodeEntry(const HeadSyncEntry& e) { + std::string s(e.EstimateBufferSize(), '\0'); + std::uint8_t* end = e.Write(reinterpret_cast(&s[0])); + s.resize(static_cast(end - reinterpret_cast(&s[0]))); + return s; + } + static bool DecodeEntry(const std::string& s, HeadSyncEntry& e) { + if (s.empty()) return false; + e.Read(reinterpret_cast(s.data())); + return true; + } + + std::uint64_t LoadLatestVersion(int shard) const { + std::string out; + auto ec = m_db->Get(MakeVersionKey(shard), &out, kTimeout, nullptr); + if (ec != ErrorCode::Success) return 0; + return DecodeUint64(out); + } + + std::mutex& GetShardAppendMutex(int shard) { + std::lock_guard lk(m_appendMutexMapLock); + auto& slot = m_appendMutexes[shard]; + if (!slot) slot = std::make_unique(); + return *slot; + } + + void ReconcileLoop() { + std::unique_lock lk(m_cvMutex); + while (!m_stop) { + lk.unlock(); + for (int shard : m_shards) { + std::uint64_t cursor = LoadCursor(shard); + std::uint64_t latest = LoadLatestVersion(shard); + if (latest <= cursor) continue; + auto entries = ReadSince(shard, cursor, latest); + if (entries.empty()) continue; + std::uint64_t advanced = cursor; + for (const auto& ve : entries) { + if (!m_apply(ve)) break; + advanced = ve.version; + } + if (advanced > cursor) { + StoreCursor(shard, advanced); + } + } + lk.lock(); + m_cv.wait_for(lk, std::chrono::milliseconds(m_reconcileIntervalMs), + [this]() { return m_stop; }); + } + } + + std::shared_ptr m_db; + int m_nodeIndex; + int m_reconcileIntervalMs; + + std::mutex m_appendMutexMapLock; + std::unordered_map> m_appendMutexes; + + std::vector m_shards; + ApplyFn m_apply; + + mutable std::mutex m_cvMutex; + std::condition_variable m_cv; + bool m_stop; + std::thread m_reconciler; +}; + +} // namespace Distributed +} // namespace SPANN +} // namespace SPTAG + +#endif // _SPTAG_SPANN_DISTRIBUTED_HEADSYNCLOG_H_ diff --git a/AnnService/inc/Core/SPANN/Distributed/SplitWAL.h b/AnnService/inc/Core/SPANN/Distributed/SplitWAL.h new file mode 100644 index 000000000..1bc84b052 --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/SplitWAL.h @@ -0,0 +1,105 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef _SPTAG_SPANN_DISTRIBUTED_SPLITWAL_H_ +#define _SPTAG_SPANN_DISTRIBUTED_SPLITWAL_H_ + +#include "inc/Core/Common.h" +#include "inc/Helper/KeyValueIO.h" +#include "inc/Helper/Logging.h" + +#include +#include +#include +#include +#include + +namespace SPTAG { +namespace SPANN { +namespace Distributed { + +// SplitWAL: durable write-ahead log entry for a cross-owner split. +// +// Per the distributed design's Split Happy Path, when a split produces +// two child heads owned by different nodes, the split writes the local +// child via PutPostingToDB and the remote child via the remote queue. +// If either write fails after the other succeeded, a WAL-driven GC job +// must clean up the orphan posting under the partner head. +// +// Key schema: +// wal/split// → encoded SplitWALRecord +// Garbage-collection (background): scan `wal/split/` prefix; if a +// record is older than `kStaleSec` and not marked committed, it +// represents either an in-flight split or a crashed one — issue +// best-effort deletes against both children using the recorded headIDs. +// +// Today this is scaffolding: Begin/Commit hooks should wrap the split's +// cross-owner write path in ExtraDynamicSearcher. GC sweep can run on +// the existing RefineIndex cadence. +class SplitWAL { +public: + enum class Stage : std::uint8_t { + Begin = 0, // both children allocated, neither written + LocalDone = 1, // local write succeeded; remote pending + RemoteDone = 2, // remote write succeeded; local pending + BothDone = 3, // both written; safe to remove WAL + delete src + }; + + struct Record { + std::uint64_t jobID; + SizeType srcHeadID; + SizeType localChildHeadID; + SizeType remoteChildHeadID; + int remoteOwnerNodeIndex; + std::int64_t startTimestampSec; + Stage stage; + + std::string Encode() const { + std::string s(sizeof(Record), '\0'); + memcpy(&s[0], this, sizeof(Record)); + return s; + } + bool Decode(const std::string& s) { + if (s.size() < sizeof(Record)) return false; + memcpy(this, s.data(), sizeof(Record)); + return true; + } + }; + + explicit SplitWAL(std::shared_ptr db) : m_db(std::move(db)) {} + + // Write or update a WAL record. Stage transitions are monotonic. + bool Write(const Record& r) { + if (!m_db) return false; + auto ec = m_db->Put(MakeKey(r.srcHeadID, r.jobID), r.Encode(), kTimeout, nullptr); + if (ec != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "SplitWAL::Write head=%lld job=%llu stage=%u failed (%d)\n", + (long long)r.srcHeadID, (unsigned long long)r.jobID, + (unsigned)r.stage, (int)ec); + return false; + } + return true; + } + + // Remove a completed WAL record after both writes succeeded. + bool Clear(SizeType srcHeadID, std::uint64_t jobID) { + if (!m_db) return false; + std::vector k{ MakeKey(srcHeadID, jobID) }; + return m_db->MultiDelete(k, kTimeout) == ErrorCode::Success; + } + + static std::string MakeKey(SizeType srcHeadID, std::uint64_t jobID) { + return "wal/split/" + std::to_string(srcHeadID) + "/" + std::to_string(jobID); + } + +private: + static constexpr auto kTimeout = std::chrono::microseconds(2'000'000); + std::shared_ptr m_db; +}; + +} // namespace Distributed +} // namespace SPANN +} // namespace SPTAG + +#endif // _SPTAG_SPANN_DISTRIBUTED_SPLITWAL_H_ diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index a97369d08..6e7f35eb3 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -21,6 +21,8 @@ #include "ExtraFileController.h" #include "Distributed/WorkerNode.h" #include "Distributed/RemoteLeaseTable.h" +#include "Distributed/HeadSyncLog.h" +#include "Distributed/SplitWAL.h" #include #include #include @@ -273,6 +275,13 @@ namespace SPTAG::SPANN { static constexpr int kRemoteLockPoolSize = 32767; std::unique_ptr m_remoteLeaseTable; + // Durable HeadSync log + per-owner split WAL. Populated by + // SetWorker once we have the shared TiKV handle. See + // Distributed/HeadSyncLog.h and Distributed/SplitWAL.h. + std::unique_ptr m_headSyncLog; + std::unique_ptr m_splitWAL; + std::atomic m_splitJobIdCounter{ 0 }; + IndexStats m_stat; std::shared_ptr m_wal; @@ -494,6 +503,17 @@ namespace SPTAG::SPANN { m_worker->SetRpcMaxInflightPerNode(m_opt->m_remoteAppendMaxInflight); } + // Initialize durable HeadSync log + SplitWAL once we know the + // worker (and therefore the node identity). Both are layer-0 + // concerns: only layer 0 actually broadcasts HeadSync and + // performs cross-owner splits. See Distributed/HeadSyncLog.h + // and Distributed/SplitWAL.h. + if (m_layer == 0 && db) { + m_headSyncLog = std::make_unique( + db, m_worker->GetWorkerNodeIndex()); + m_splitWAL = std::make_unique(db); + } + WireJobSubmitterIfReady(); // Claim ownership so the matching destructor's IfOwner check @@ -1406,6 +1426,15 @@ namespace SPTAG::SPANN { headSyncEntries.push_back(std::move(entry)); } if (!headSyncEntries.empty()) { + // Durably persist to TiKV first, then broadcast. + // Per design, broadcast is a best-effort latency + // optimization; TiKV is the source of truth. + // Shard = owning node so each owner advances its + // own version counter independently. + if (m_headSyncLog) { + int shard = m_worker->GetWorkerNodeIndex(); + m_headSyncLog->Append(shard, headSyncEntries); + } m_worker->BroadcastHeadSync(headSyncEntries); } } From 111d37c555999d8f3ccd1a11e7d193c85944e1c1 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 11:07:16 +0000 Subject: [PATCH 22/48] SPANN distributed: full lease-fencing with monotonic fencing tokens Per the design's Async Job Fault Tolerance section, lease-based locks need an accompanying fencing token so that a zombie holder which resumes after its lease expired cannot mutate state now protected by a newer holder. Protocol bumps (backwards compatible via mirror-version gates): * RemoteAppendRequest mirror 1 -> 2: m_fencingToken (uint64). Token 0 = unfenced (normal owner-ring route). * RemoteLockRequest mirror 1 -> 2: m_token (uint64). Lock sends 0; Unlock sends issued token. * RemoteLockResponse mirror 0 -> 1: m_token (uint64). Owner returns issued fencing token on Lock. API changes: * RemoteLeaseTable: TryAcquire returns uint64_t token (0=denied); Release(bucket, token) only succeeds if token matches; Validate used by receiver-side fence check. * RemoteLockCallback: bool -> uint64_t signature carrying the token. * SendRemoteLock returns uint64_t (issued token on Lock). * New FenceValidator callback + RemotePostingOps fence-check on inbound RemoteAppend; rejected if token stale. * New WorkerNode::SendFencedRemoteAppend synchronous helper for split's cross-owner write path (unblocks split-atomicity). The ExtraDynamicSearcher lock callback now plumbs tokens end-to-end through RemoteLeaseTable. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../SPANN/Distributed/DistributedProtocol.h | 42 ++++++- .../Core/SPANN/Distributed/RemoteLeaseTable.h | 93 ++++++++------- .../Core/SPANN/Distributed/RemotePostingOps.h | 107 ++++++++++++++++-- .../inc/Core/SPANN/Distributed/WorkerNode.h | 23 +++- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 34 ++++-- 5 files changed, 233 insertions(+), 66 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h b/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h index 963ca6b35..082bdb373 100644 --- a/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h +++ b/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h @@ -15,15 +15,20 @@ namespace SPTAG::SPANN { /// Serializable request for remote Append operations sent between compute nodes. /// MirrorVersion 1 added m_layer to disambiguate which ExtraDynamicSearcher on /// the receiver side handles the request. Version 0 packets default m_layer=0. + /// MirrorVersion 2 added m_fencingToken: when nonzero the receiver must + /// validate the token against its RemoteLeaseTable for the head's bucket + /// before applying. Token 0 means "no fencing required" (used by the + /// normal owner-ring auto-route path that does not hold any remote lock). struct RemoteAppendRequest { static constexpr std::uint16_t MajorVersion() { return 1; } - static constexpr std::uint16_t MirrorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 2; } SizeType m_headID = 0; std::string m_headVec; // raw head vector bytes std::int32_t m_appendNum = 0; std::string m_appendPosting; // serialized posting data std::int32_t m_layer = 0; // originating ExtraDynamicSearcher layer + std::uint64_t m_fencingToken = 0; // 0 = unfenced (legacy path) std::size_t EstimateBufferSize() const { std::size_t size = 0; @@ -33,6 +38,7 @@ namespace SPTAG::SPANN { size += sizeof(std::int32_t); // appendNum size += sizeof(std::uint32_t) + m_appendPosting.size(); // appendPosting (len-prefixed) size += sizeof(std::int32_t); // layer (mirrorVer >= 1) + size += sizeof(std::uint64_t); // fencingToken (mirrorVer >= 2) return size; } @@ -45,6 +51,7 @@ namespace SPTAG::SPANN { p_buffer = SimpleWriteBuffer(m_appendNum, p_buffer); p_buffer = SimpleWriteBuffer(m_appendPosting, p_buffer); p_buffer = SimpleWriteBuffer(m_layer, p_buffer); + p_buffer = SimpleWriteBuffer(m_fencingToken, p_buffer); return p_buffer; } @@ -67,6 +74,11 @@ namespace SPTAG::SPANN { } else { m_layer = 0; } + if (mirrorVer >= 2) { + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, m_fencingToken); + } else { + m_fencingToken = 0; + } return p_buffer; } }; @@ -454,18 +466,22 @@ namespace SPTAG::SPANN { /// Request to lock/unlock a headID on its owner node (for cross-node Merge). /// MirrorVersion 1 added m_layer so multi-layer setups dispatch to the /// correct lock pool (each ExtraDynamicSearcher owns its own bucket flags). + /// MirrorVersion 2 added m_token for fencing: Lock requests send token=0; + /// Unlock requests send the token issued by the prior Lock so a zombie + /// holder whose lease expired cannot release a lock now held by someone else. struct RemoteLockRequest { static constexpr std::uint16_t MajorVersion() { return 1; } - static constexpr std::uint16_t MirrorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 2; } enum class Op : std::uint8_t { Lock = 0, Unlock = 1 }; Op m_op = Op::Lock; SizeType m_headID = 0; std::int32_t m_layer = 0; + std::uint64_t m_token = 0; std::size_t EstimateBufferSize() const { return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t) - + sizeof(SizeType) + sizeof(std::int32_t); + + sizeof(SizeType) + sizeof(std::int32_t) + sizeof(std::uint64_t); } std::uint8_t* Write(std::uint8_t* p_buffer) const { @@ -475,6 +491,7 @@ namespace SPTAG::SPANN { p_buffer = SimpleWriteBuffer(static_cast(m_op), p_buffer); p_buffer = SimpleWriteBuffer(m_headID, p_buffer); p_buffer = SimpleWriteBuffer(m_layer, p_buffer); + p_buffer = SimpleWriteBuffer(m_token, p_buffer); return p_buffer; } @@ -493,20 +510,29 @@ namespace SPTAG::SPANN { } else { m_layer = 0; } + if (mirrorVer >= 2) { + p_buffer = SimpleReadBuffer(p_buffer, m_token); + } else { + m_token = 0; + } return p_buffer; } }; /// Response for remote lock operations. + /// MirrorVersion 1 added m_token: the owner returns the issued fencing + /// token on a successful Lock so the holder can attach it to subsequent + /// lock-protected operations. Unlock responses return token=0. struct RemoteLockResponse { static constexpr std::uint16_t MajorVersion() { return 1; } - static constexpr std::uint16_t MirrorVersion() { return 0; } + static constexpr std::uint16_t MirrorVersion() { return 1; } enum class Status : std::uint8_t { Granted = 0, Denied = 1 }; Status m_status = Status::Granted; + std::uint64_t m_token = 0; std::size_t EstimateBufferSize() const { - return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t); + return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t) + sizeof(std::uint64_t); } std::uint8_t* Write(std::uint8_t* p_buffer) const { @@ -514,6 +540,7 @@ namespace SPTAG::SPANN { p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); p_buffer = SimpleWriteBuffer(static_cast(m_status), p_buffer); + p_buffer = SimpleWriteBuffer(m_token, p_buffer); return p_buffer; } @@ -526,6 +553,11 @@ namespace SPTAG::SPANN { std::uint8_t rawOp = 0; p_buffer = SimpleReadBuffer(p_buffer, rawOp); m_status = static_cast(rawOp); + if (mirrorVer >= 1) { + p_buffer = SimpleReadBuffer(p_buffer, m_token); + } else { + m_token = 0; + } return p_buffer; } }; diff --git a/AnnService/inc/Core/SPANN/Distributed/RemoteLeaseTable.h b/AnnService/inc/Core/SPANN/Distributed/RemoteLeaseTable.h index 2d6881c7e..ed95903fd 100644 --- a/AnnService/inc/Core/SPANN/Distributed/RemoteLeaseTable.h +++ b/AnnService/inc/Core/SPANN/Distributed/RemoteLeaseTable.h @@ -4,27 +4,18 @@ // RemoteLeaseTable // ---------------- // Owner-side bookkeeping for cross-node merge / structural-op locks. -// Backs the per-bucket advisory flag that local Split / Merge consult via -// WaitForRemoteBucketUnlocked before mutating a head whose ownership is -// shared with a remote candidate. +// Each bucket has a TTL-bounded lease AND a monotonically increasing +// fencing token so a zombie holder that resumes after lease expiry has +// its late operations rejected (see Async Job Fault Tolerance in the +// design doc). // -// Design contract (see Async Job Fault Tolerance): -// * Each acquired lock carries a bounded TTL. If the holder crashes or -// stops responding, the lease auto-expires and the owner is free to -// proceed (or grant the bucket to another holder). -// * No keepalive: structural ops are expected to complete in under one -// TTL. If they exceed the TTL, the holder must retry the whole job; -// the owner has already released the lease. +// API: +// TryAcquire(bucket) -> uint64_t token (0 = denied) +// Validate(bucket, token) -> bool, the held token still matches +// Release(bucket, token) -> bool, only releases if token matches +// IsLocked(bucket) -> bool, auto-clears expired entries // -// The TTL is the single configurable knob (default 30s, matching the -// design's lease-TTL recommendation). A future iteration can add a -// fencing token so a zombie holder that resumes after expiry has its -// late unlock rejected — that requires a protocol bump on -// RemoteLockRequest/Response, which we'll do once a real owner-restart -// test exists to validate the change. For now the in-memory lease -// table provides the safety net the design requires: zombie holders -// never indefinitely block the owner. - +// The TTL knob is `RemoteLockTtlMs` in SPANN options (default 30s). #ifndef _SPTAG_SPANN_DISTRIBUTED_REMOTELEASETABLE_H_ #define _SPTAG_SPANN_DISTRIBUTED_REMOTELEASETABLE_H_ @@ -39,57 +30,73 @@ namespace SPTAG::SPANN { public: using Clock = std::chrono::steady_clock; - // bucketCount must match the searcher's lock-pool bucket count - // (FineGrainedRWLock::BucketIndex range). Allocates one slot per - // bucket; slots start in the unlocked state (expiry == 0). explicit RemoteLeaseTable(std::size_t bucketCount, int ttlMs = 30000) : m_count(bucketCount + 1), m_ttlMs(ttlMs) { m_expiry = std::make_unique[]>(m_count); - for (std::size_t i = 0; i < m_count; ++i) m_expiry[i].store(0, std::memory_order_relaxed); + m_tokens = std::make_unique[]>(m_count); + for (std::size_t i = 0; i < m_count; ++i) { + m_expiry[i].store(0, std::memory_order_relaxed); + m_tokens[i].store(0, std::memory_order_relaxed); + } } void SetTtlMs(int ttlMs) { if (ttlMs > 0) m_ttlMs.store(ttlMs, std::memory_order_relaxed); } int GetTtlMs() const { return m_ttlMs.load(std::memory_order_relaxed); } - // Try to grant a lease for bucket. Succeeds iff bucket is unlocked + // Try to grant a lease for bucket. Succeeds iff bucket is unlocked // OR the previous holder's lease has expired (auto-reclamation). - // Records the new expiry deadline. - bool TryAcquire(unsigned bucket) { - if (bucket >= m_count) return false; + // Returns the fencing token (>= 1) on success, 0 on denial. + std::uint64_t TryAcquire(unsigned bucket) { + if (bucket >= m_count) return 0; const std::int64_t nowNs = NowNs(); const std::int64_t ttlNs = (std::int64_t)m_ttlMs.load(std::memory_order_relaxed) * 1'000'000LL; std::int64_t current = m_expiry[bucket].load(std::memory_order_acquire); for (;;) { - if (current != 0 && current > nowNs) return false; // still held by live lease + if (current != 0 && current > nowNs) return 0; // still held by live lease const std::int64_t newExpiry = nowNs + ttlNs; if (m_expiry[bucket].compare_exchange_weak(current, newExpiry, - std::memory_order_acq_rel)) return true; - // CAS lost: re-evaluate with the updated `current`. + std::memory_order_acq_rel)) { + std::uint64_t tok = m_nextToken.fetch_add(1, std::memory_order_acq_rel) + 1; + m_tokens[bucket].store(tok, std::memory_order_release); + return tok; + } } } - // Release the lease unconditionally. In the current protocol the - // caller is trusted (holder cooperates). When a fencing token is - // added, this becomes a token-validated release. - void Release(unsigned bucket) { - if (bucket >= m_count) return; + // True iff bucket currently holds `token` AND lease not expired. + bool Validate(unsigned bucket, std::uint64_t token) const { + if (bucket >= m_count || token == 0) return false; + std::int64_t exp = m_expiry[bucket].load(std::memory_order_acquire); + if (exp == 0 || exp <= NowNs()) return false; + return m_tokens[bucket].load(std::memory_order_acquire) == token; + } + + // Release the lease only if the caller's token still matches. + // Late unlocks from a zombie holder whose lease expired (and was + // reacquired by another holder) silently no-op. + bool Release(unsigned bucket, std::uint64_t token) { + if (bucket >= m_count) return false; + std::uint64_t held = m_tokens[bucket].load(std::memory_order_acquire); + if (token == 0 || held != token) return false; + // Clear token first so a concurrent Validate sees the release + // before the expiry window closes. + m_tokens[bucket].store(0, std::memory_order_release); m_expiry[bucket].store(0, std::memory_order_release); + return true; } - // True iff the lease is currently held AND not expired. Auto-clears - // expired entries so a stuck holder doesn't permanently block the - // owner's Split/Merge path. + // True iff the lease is currently held AND not expired. bool IsLocked(unsigned bucket) { if (bucket >= m_count) return false; std::int64_t current = m_expiry[bucket].load(std::memory_order_acquire); if (current == 0) return false; if (current > NowNs()) return true; - // Expired: try to clear (best-effort; loss of race is OK because - // a concurrent holder either renewed or is also expired). std::int64_t expected = current; - m_expiry[bucket].compare_exchange_strong(expected, 0, - std::memory_order_acq_rel); + if (m_expiry[bucket].compare_exchange_strong(expected, 0, + std::memory_order_acq_rel)) { + m_tokens[bucket].store(0, std::memory_order_release); + } return false; } @@ -102,6 +109,8 @@ namespace SPTAG::SPANN { std::size_t m_count; std::atomic m_ttlMs; std::unique_ptr[]> m_expiry; + std::unique_ptr[]> m_tokens; + std::atomic m_nextToken{0}; }; } // namespace SPTAG::SPANN diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h index 03851df1c..1b39f5bc2 100644 --- a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -52,7 +52,18 @@ namespace SPTAG::SPANN { std::string& appendPosting)>; using HeadSyncCallback = std::function; - using RemoteLockCallback = std::function; + // RemoteLockCallback: + // For Lock op: token argument is 0; returns issued fencing token + // (>=1 on success, 0 on denial). + // For Unlock op: token argument is the previously-issued token; + // returns 1 on accepted release, 0 if the token is + // stale (lease already expired / re-issued). + using RemoteLockCallback = std::function; + // Validator for fenced RemoteAppend: receiver checks the request's + // m_fencingToken against the lease table for headID's bucket. + // Return true to allow the append, false to reject (the response + // will carry Failed status). Unfenced appends (token=0) bypass. + using FenceValidator = std::function; /// Callback for cross-node merge: search on a peer node observed /// that posting `headID` (which we own) looks underfull. The peer @@ -152,6 +163,14 @@ namespace SPTAG::SPANN { EnsureLayerSlot_NoLock(layer); m_remoteLockCallbacks[layer] = std::move(cb); } + void SetFenceValidator(int layer, FenceValidator cb) { + std::unique_lock lk(m_callbackLifetimeMutex); + EnsureLayerSlot_NoLock(layer); + if (static_cast(layer) >= m_fenceValidators.size()) { + m_fenceValidators.resize(layer + 1); + } + m_fenceValidators[layer] = std::move(cb); + } void SetMergeCallback(int layer, MergeCallback cb) { std::unique_lock lk(m_callbackLifetimeMutex); EnsureLayerSlot_NoLock(layer); @@ -169,6 +188,7 @@ namespace SPTAG::SPANN { m_headSyncCallbacks.clear(); m_remoteLockCallbacks.clear(); m_mergeCallbacks.clear(); + m_fenceValidators.clear(); m_callbackOwners = std::vector>(); } @@ -200,6 +220,9 @@ namespace SPTAG::SPANN { if (layer >= 0 && static_cast(layer) < m_mergeCallbacks.size()) { m_mergeCallbacks[layer] = nullptr; } + if (layer >= 0 && static_cast(layer) < m_fenceValidators.size()) { + m_fenceValidators[layer] = nullptr; + } m_callbackOwners[layer].store(nullptr, std::memory_order_release); return true; } @@ -220,6 +243,11 @@ namespace SPTAG::SPANN { const auto& cb = m_remoteLockCallbacks[layer]; return cb ? &cb : nullptr; } + const FenceValidator* LookupFenceValidator_Locked(int layer) const { + if (layer < 0 || static_cast(layer) >= m_fenceValidators.size()) return nullptr; + const auto& cb = m_fenceValidators[layer]; + return cb ? &cb : nullptr; + } // PutPosting/FetchPosting/DeletePosting RPCs lived here historically. // With shared TiKV every node reads and writes the posting store // directly (PD routes the key), so the cross-node scatter-gather @@ -240,7 +268,8 @@ namespace SPTAG::SPANN { SizeType headID, const std::shared_ptr& headVec, int appendNum, - std::string& appendPosting) + std::string& appendPosting, + std::uint64_t fencingToken = 0) { Socket::ConnectionID connID = m_net->GetPeerConnection(targetNodeIndex); if (connID == Socket::c_invalidConnectionID) { @@ -256,6 +285,7 @@ namespace SPTAG::SPANN { req.m_headVec = *headVec; req.m_appendNum = appendNum; req.m_appendPosting = appendPosting; + req.m_fencingToken = fencingToken; Socket::ResourceID resID = m_nextResourceId.fetch_add(1); auto [future, _] = CreatePendingResponse(resID); @@ -632,18 +662,25 @@ namespace SPTAG::SPANN { // RemoteLock — synchronous request/response // ================================================================== - bool SendRemoteLock(int nodeIndex, int layer, SizeType headID, bool lock) { + // SendRemoteLock: synchronous lock/unlock RPC. + // For Lock (lock=true, token=0): returns issued fencing token, + // 0 on denial/timeout. + // For Unlock (lock=false, token=t): returns 1 on accepted release, + // 0 on rejection/timeout. + std::uint64_t SendRemoteLock(int nodeIndex, int layer, SizeType headID, + bool lock, std::uint64_t token = 0) { Socket::ConnectionID connID = m_net->GetPeerConnection(nodeIndex); if (connID == Socket::c_invalidConnectionID) { SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "RemotePostingOps: Cannot send remote lock to node %d\n", nodeIndex); - return false; + return 0; } RemoteLockRequest req; req.m_op = lock ? RemoteLockRequest::Op::Lock : RemoteLockRequest::Op::Unlock; req.m_headID = headID; req.m_layer = layer; + req.m_token = token; Socket::ResourceID rid = m_nextResourceId.fetch_add(1); auto [future, _] = CreatePendingResponse(rid); @@ -666,12 +703,18 @@ namespace SPTAG::SPANN { auto status = future.wait_for(std::chrono::milliseconds(5000)); if (status != std::future_status::ready) { ErasePending(rid); + TakePendingLockToken(rid); SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "RemotePostingOps: Lock timeout for headID %lld on node %d\n", (std::int64_t)headID, nodeIndex); - return false; + return 0; } - return future.get() == ErrorCode::Success; + ErrorCode ec = future.get(); + std::uint64_t returnedToken = TakePendingLockToken(rid); + if (ec != ErrorCode::Success) return 0; + // On Unlock the owner returns token=0 but Success status; map + // to a sentinel 1 so callers can distinguish from failure. + return lock ? returnedToken : (returnedToken == 0 ? 1 : returnedToken); } // ================================================================== @@ -701,6 +744,24 @@ namespace SPTAG::SPANN { ErrorCode result = ErrorCode::Fail; { std::shared_lock cbLock(m_callbackLifetimeMutex); + // Fence validation: if the request carries a nonzero + // fencing token, the writer claimed they held the remote + // lock for this head when they sent the RPC. Validate + // against our lease table before applying so a zombie + // holder's late write (after its lease expired) is + // rejected. + if (req.m_fencingToken != 0) { + const auto* fv = LookupFenceValidator_Locked(req.m_layer); + if (fv && !(*fv)(req.m_headID, req.m_fencingToken)) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: AppendRequest fencing token " + "%llu rejected for headID %lld (stale lease)\n", + (unsigned long long)req.m_fencingToken, + (std::int64_t)req.m_headID); + SendAppendResponse(packet, RemoteAppendResponse::Status::Failed); + return; + } + } const auto* cb = LookupAppendCallback_Locked(req.m_layer); if (cb) { auto headVec = std::make_shared(std::move(req.m_headVec)); @@ -967,14 +1028,18 @@ namespace SPTAG::SPANN { RemoteLockResponse resp; resp.m_status = RemoteLockResponse::Status::Denied; + resp.m_token = 0; { std::shared_lock cbLock(m_callbackLifetimeMutex); const auto* cb = LookupRemoteLockCallback_Locked(req.m_layer); if (cb) { bool isLock = (req.m_op == RemoteLockRequest::Op::Lock); - bool success = (*cb)(req.m_headID, isLock); - if (success) resp.m_status = RemoteLockResponse::Status::Granted; + std::uint64_t out = (*cb)(req.m_headID, isLock, req.m_token); + if (out != 0) { + resp.m_status = RemoteLockResponse::Status::Granted; + resp.m_token = isLock ? out : 0; + } } else { SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "RemotePostingOps: RemoteLockRequest layer=%d has no callback registered\n", @@ -1007,6 +1072,13 @@ namespace SPTAG::SPANN { return; } + // Stash the issued fencing token so SendRemoteLock can pick + // it up after the future signals. + if (resp.m_status == RemoteLockResponse::Status::Granted && resp.m_token != 0) { + std::lock_guard lk(m_pendingLockTokensMutex); + m_pendingLockTokens[rid] = resp.m_token; + } + promise->set_value(resp.m_status == RemoteLockResponse::Status::Granted ? ErrorCode::Success : ErrorCode::Fail); } @@ -1026,6 +1098,15 @@ namespace SPTAG::SPANN { m_pendingResponses.erase(resID); } + std::uint64_t TakePendingLockToken(Socket::ResourceID rid) { + std::lock_guard lk(m_pendingLockTokensMutex); + auto it = m_pendingLockTokens.find(rid); + if (it == m_pendingLockTokens.end()) return 0; + std::uint64_t tok = it->second; + m_pendingLockTokens.erase(it); + return tok; + } + /// Take a pending promise out of the map (returns nullptr if not found). std::unique_ptr> TakePendingResponse(Socket::ResourceID resID) { std::lock_guard lock(m_pendingMutex); @@ -1221,6 +1302,7 @@ namespace SPTAG::SPANN { std::vector m_headSyncCallbacks; std::vector m_remoteLockCallbacks; std::vector m_mergeCallbacks; + std::vector m_fenceValidators; // Per-layer ownership tokens. Each ExtraDynamicSearcher claims its // layer slot at SetWorker time and releases it on destruction; this @@ -1239,6 +1321,15 @@ namespace SPTAG::SPANN { std::mutex m_pendingMutex; std::unordered_map> m_pendingResponses; + // Side table populated by HandleRemoteLockResponse: maps the + // outstanding RPC resource id to the fencing token returned by + // the owner. SendRemoteLock reads this immediately after the + // future signals to retrieve the token without needing to widen + // the m_pendingResponses promise type (which is shared with the + // Append/HeadSync RPCs). + std::mutex m_pendingLockTokensMutex; + std::unordered_map m_pendingLockTokens; + // Per-item Job: each remote append request becomes one Job submitted // to the searcher's shared SPDKThreadPool. The last completing Job // ACKs the sender. Identical to how a local insert thread would call diff --git a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h index 40d537379..4675383b1 100644 --- a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h +++ b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h @@ -38,6 +38,7 @@ namespace SPTAG::SPANN { using DispatchCallback = DispatchCoordinator::DispatchCallback; using HeadSyncCallback = RemotePostingOps::HeadSyncCallback; using RemoteLockCallback = RemotePostingOps::RemoteLockCallback; + using FenceValidator = RemotePostingOps::FenceValidator; /// Initialize with separate dispatcher/worker/store addresses. /// workerIndex is 0-based (0 = driver/local, 1+ = remote). @@ -111,6 +112,7 @@ namespace SPTAG::SPANN { void SetAppendCallback(int layer, AppendCallback cb) { m_remoteOps.SetAppendCallback(layer, std::move(cb)); } void SetHeadSyncCallback(int layer, HeadSyncCallback cb) { m_remoteOps.SetHeadSyncCallback(layer, std::move(cb)); } void SetRemoteLockCallback(int layer, RemoteLockCallback cb) { m_remoteOps.SetRemoteLockCallback(layer, std::move(cb)); } + void SetFenceValidator(int layer, FenceValidator cb) { m_remoteOps.SetFenceValidator(layer, std::move(cb)); } // Inject the searcher's shared compute pool so receiver-side // BatchAppend work runs there (high-priority Jobs) instead of in a // separate executor. Idempotent: safe to call multiple times. @@ -226,9 +228,24 @@ namespace SPTAG::SPANN { m_remoteOps.NoteHeadSyncApplyDelete(); } - bool SendRemoteLock(int nodeIndex, int layer, SizeType headID, bool lock) { - if (!m_enabled) return false; - return m_remoteOps.SendRemoteLock(nodeIndex, layer, headID, lock); + // Returns issued fencing token on Lock success (0 = denied), + // or 1 on Unlock accepted (0 = rejected / stale token). + std::uint64_t SendRemoteLock(int nodeIndex, int layer, SizeType headID, + bool lock, std::uint64_t token = 0) { + if (!m_enabled) return 0; + return m_remoteOps.SendRemoteLock(nodeIndex, layer, headID, lock, token); + } + + // Synchronous, fenced remote append: includes the fencing token + // so the owner can validate that the writer still holds the + // bucket lease before applying. Returns Success/Fail. + ErrorCode SendFencedRemoteAppend(int nodeIndex, int layer, SizeType headID, + const std::shared_ptr& headVec, + int appendNum, std::string& appendPosting, + std::uint64_t fencingToken) { + if (!m_enabled) return ErrorCode::Fail; + return m_remoteOps.SendRemoteAppend(nodeIndex, layer, headID, headVec, + appendNum, appendPosting, fencingToken); } void SetMergeCallback(int layer, RemotePostingOps::MergeCallback cb) { diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 6e7f35eb3..8720a63c4 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -596,23 +596,41 @@ namespace SPTAG::SPANN { } }); - // Remote lock callback: per-bucket leases with TTL auto-release. - m_worker->SetRemoteLockCallback(m_layer, [this](SizeType headID, bool lock) -> bool { + // Remote lock callback: per-bucket leases with TTL auto-release + // AND a fencing token. The owner returns a monotonically + // increasing token on Lock; subsequent fenced operations + // (RemoteAppend with m_fencingToken set) carry that token + // and the owner validates it against this lease table before + // applying. A zombie holder whose lease has expired (and + // bucket been re-acquired) will have its late operations + // rejected. + m_worker->SetRemoteLockCallback(m_layer, + [this](SizeType headID, bool lock, std::uint64_t token) -> std::uint64_t { unsigned bucket = COMMON::FineGrainedRWLock::BucketIndex(static_cast(headID)); if (lock) { - if (!m_remoteLeaseTable->TryAcquire(bucket)) return false; + std::uint64_t tok = m_remoteLeaseTable->TryAcquire(bucket); + if (tok == 0) return 0; if (!m_rwLocks[headID].try_lock()) { - m_remoteLeaseTable->Release(bucket); - return false; + m_remoteLeaseTable->Release(bucket, tok); + return 0; } m_rwLocks[headID].unlock(); - return true; + return tok; } else { - m_remoteLeaseTable->Release(bucket); - return true; + return m_remoteLeaseTable->Release(bucket, token) ? 1 : 0; } }); + // Fenced RemoteAppend validator: the receive-side gate for + // split's cross-owner posting writes. A nonzero fencing + // token in the request must match the current lease for + // that head's bucket. + m_worker->SetFenceValidator(m_layer, + [this](SizeType headID, std::uint64_t token) -> bool { + unsigned bucket = COMMON::FineGrainedRWLock::BucketIndex(static_cast(headID)); + return m_remoteLeaseTable->Validate(bucket, token); + }); + // Cross-node merge hint callback m_worker->SetMergeCallback(m_layer, [this](SizeType headID) { MergeAsync(headID); From 74c0350cdb7fd2154839f1e082246487bdd0be05 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 11:10:36 +0000 Subject: [PATCH 23/48] SPANN distributed: wire split path through fenced cross-owner write Replace the two TryRouteRemoteAppend call sites in Split (existing-head merge path and new-head create path) with the synchronous TryWriteRemoteSplitChildFenced helper when the new head is remote-owned. The helper performs try-lock-both, writes a SplitWAL Begin record, sends a fenced RemoteAppend with the lock's monotonic token, then releases the lock and clears the WAL on success. On fenced-write failure (lock contention or RPC error), fall back to the legacy async TryRouteRemoteAppend so the posting is not stranded; the WAL + watchdog converge eventually. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 159 ++++++++++++++++-- 1 file changed, 143 insertions(+), 16 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 8720a63c4..f0a95ae93 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -715,6 +715,99 @@ namespace SPTAG::SPANN { return true; } + // Synchronous, fenced cross-owner write used by the Split path. + // Per the design's Split Happy Path: + // * The split holder already holds the local source-head lock. + // * For the remote child it must acquire the remote lock with a + // try-and-backoff protocol (try-lock-both). Failure here + // means another node is racing; abort so the caller can + // re-enqueue via SplitAsync. + // * The remote posting write is fenced (token attached) so a + // zombie holder past lease expiry cannot resurrect this + // write after another holder took over. + // * A WAL record is written before the cross-owner posting + // write and cleared on success. On failure the WAL drives a + // GC pass to delete the orphan partner posting (see + // SplitWAL.h); GC is best-effort and only affects recall. + // + // Returns Success on both-locked-and-written, Fail otherwise. + // On failure the caller should leave any partial state to the + // GC pass and re-enqueue the split. + ErrorCode TryWriteRemoteSplitChildFenced(SizeType srcHeadID, + SizeType remoteChildHeadID, + const void* remoteChildHeadVecBytes, + int appendNum, + std::string& posting) { + int ownerNode = -1; + if (!IsRemoteOwnedHead(remoteChildHeadID, &ownerNode)) { + return ErrorCode::Fail; + } + if (!m_worker || !m_worker->IsEnabled()) return ErrorCode::Fail; + + // Try-lock-both: acquire remote lock with bounded retry. + std::uint64_t token = 0; + constexpr int kMaxLockRetries = 5; + for (int attempt = 0; attempt < kMaxLockRetries; ++attempt) { + token = m_worker->SendRemoteLock(ownerNode, m_layer, + remoteChildHeadID, true, 0); + if (token != 0) break; + std::this_thread::sleep_for( + std::chrono::milliseconds(5 * (attempt + 1))); + } + if (token == 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "Split: failed to acquire remote lock for child %lld on node %d " + "after %d retries; abort and re-enqueue\n", + (std::int64_t)remoteChildHeadID, ownerNode, kMaxLockRetries); + return ErrorCode::Fail; + } + + // Write WAL Begin so a crash after the remote write but + // before completion is recoverable via GC. + std::uint64_t jobID = m_splitJobIdCounter.fetch_add(1) + 1; + if (m_splitWAL) { + Distributed::SplitWAL::Record r; + r.jobID = jobID; + r.srcHeadID = srcHeadID; + r.localChildHeadID = 0; + r.remoteChildHeadID = remoteChildHeadID; + r.remoteOwnerNodeIndex = ownerNode; + r.startTimestampSec = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + r.stage = Distributed::SplitWAL::Stage::Begin; + m_splitWAL->Write(r); + } + + // Fenced sync remote append. Receiver validates the token + // against its lease table before applying. + auto headVec = std::make_shared( + static_cast(remoteChildHeadVecBytes), + m_vectorDataSize); + ErrorCode ec = m_worker->SendFencedRemoteAppend( + ownerNode, m_layer, remoteChildHeadID, headVec, + appendNum, posting, token); + + // Release the remote lock with the issued token. If our + // lease has expired in the meantime, Release will no-op on + // the owner side (the new holder's token won't match ours). + m_worker->SendRemoteLock(ownerNode, m_layer, remoteChildHeadID, + false, token); + + if (ec == ErrorCode::Success) { + // Clear WAL: both writes done. (The local-side Put + // happens in the caller's loop using the existing + // PutPostingToDB path.) + if (m_splitWAL) m_splitWAL->Clear(srcHeadID, jobID); + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "Split: fenced remote append failed for child %lld " + "on node %d (ec=%d); WAL kept for GC\n", + (std::int64_t)remoteChildHeadID, ownerNode, (int)ec); + } + return ec; + } + // Validate (and lazily extend) the local version map so that // every (vid, ver) tuple in a posting we are about to write is // representable. Without this, remote-originated postings carrying @@ -1269,15 +1362,35 @@ namespace SPTAG::SPANN { m_stat.m_splitExistingHeadMergeCount.fetch_add(1, std::memory_order_relaxed); // If newHeadVID's owner is a remote node, route - // the new posting via RemoteAppend; the owner - // will merge it into the existing posting list. - if (TryRouteRemoteAppend( - newHeadVID, + // the new posting via a fenced cross-owner write: + // acquire the remote lock, send a fenced + // RemoteAppend (sync), and let the owner merge + // it into the existing posting list. See + // TryWriteRemoteSplitChildFenced for the + // try-lock-both + WAL + fencing protocol. + if (IsRemoteOwnedHead(newHeadVID)) { + ErrorCode fec = TryWriteRemoteSplitChildFenced( + headID, newHeadVID, + args.centers + k * args._D, (int)(newPostingLists[k].size() / m_vectorInfoSize), - newPostingLists[k], - args.centers + k * args._D)) { - if (m_rwLocks.hash_func(newHeadVID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); - continue; + newPostingLists[k]); + if (fec == ErrorCode::Success) { + if (m_rwLocks.hash_func(newHeadVID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); + continue; + } + // Fall through: on remote-lock contention + // or send failure, fall back to the legacy + // async TryRouteRemoteAppend so we don't + // strand the posting. Watchdog + WAL GC + // converge eventually. + if (TryRouteRemoteAppend( + newHeadVID, + (int)(newPostingLists[k].size() / m_vectorInfoSize), + newPostingLists[k], + args.centers + k * args._D)) { + if (m_rwLocks.hash_func(newHeadVID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); + continue; + } } std::string mergedPostingList; @@ -1366,19 +1479,33 @@ namespace SPTAG::SPANN { SplitAsync(newHeadVID, currentLength); } } else { - // If newHeadVID's owner is a remote node, route - // the initial posting via RemoteAppend so it - // ends up in the owner's TiKV. We still add the + // If newHeadVID's owner is a remote node, do the + // fenced cross-owner write: try-lock-both + WAL + // + sync fenced RemoteAppend. We still add the // head locally and rely on BroadcastHeadSync // (after this loop) to spread the head index // update to all nodes. The receiver's // AppendCallback materializes the head if its // HeadSync hasn't arrived yet. - bool remoteCreated = TryRouteRemoteAppend( - newHeadVID, - (int)(newPostingLists[k].size() / m_vectorInfoSize), - newPostingLists[k], - args.centers + k * args._D); + bool remoteCreated = false; + if (IsRemoteOwnedHead(newHeadVID)) { + ErrorCode fec = TryWriteRemoteSplitChildFenced( + headID, newHeadVID, + args.centers + k * args._D, + (int)(newPostingLists[k].size() / m_vectorInfoSize), + newPostingLists[k]); + if (fec == ErrorCode::Success) { + remoteCreated = true; + } else { + // Fall back to async queue: WAL + + // watchdog converge eventually. + remoteCreated = TryRouteRemoteAppend( + newHeadVID, + (int)(newPostingLists[k].size() / m_vectorInfoSize), + newPostingLists[k], + args.centers + k * args._D); + } + } if (!remoteCreated) { auto splitPutBegin = std::chrono::high_resolution_clock::now(); From de3fa64a2e8449dd234caf54674704762a8056ec Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 12:45:52 +0000 Subject: [PATCH 24/48] SPANN distributed: route inner layers, retire async-job UAF, larger RPC chunks, remote stats * Drop the m_layer != 0 short-circuit in IsRemoteOwnedHead. Both layers store postings in the same TiKV cluster (DBKey = m_maxID*m_layer+postingID) and need owner-ring routing, fencing, and SplitWAL just like layer 0. HeadSync broadcast stays layer-0-only since layer-1 centroids are derived from layer-0 splits and reach peers via that broadcast. SplitWAL keys now carry the layer to avoid collisions across layers. * Fix MergeAsyncJob / SplitAsyncJob retry use-after-free: the SPDKThreadPool worker unconditionally deletes the Job after exec() returns, so the prior 'add this; return;' retry path freed the Job while it was still queued. Enqueue a fresh Job carrying the bumped attempt count instead. * Bump RemoteAppendChunkSize 3000->10000 and RemoteAppendMaxInflight 4->8. Per-chunk grpc framing was dominating, and with replica fan-out =8 the outbound queue at 1M+1M scale ships ~8M items; larger chunks amortize send overhead ~3x. * Add remote queue stats to layer progress + ALL DONE logs and gate the ALL DONE boundary on the outbound queue draining. Previously ALL DONE fired as soon as local SPDK pool was empty, even though the network pump was still shipping millions of fan-out items, making runs look stuck for tens of minutes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/Distributed/SplitWAL.h | 13 +-- .../inc/Core/SPANN/Distributed/WorkerNode.h | 17 ++++ .../inc/Core/SPANN/ExtraDynamicSearcher.h | 89 ++++++++++++++----- .../inc/Core/SPANN/ParameterDefinitionList.h | 10 ++- 4 files changed, 98 insertions(+), 31 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/SplitWAL.h b/AnnService/inc/Core/SPANN/Distributed/SplitWAL.h index 1bc84b052..3cd642a13 100644 --- a/AnnService/inc/Core/SPANN/Distributed/SplitWAL.h +++ b/AnnService/inc/Core/SPANN/Distributed/SplitWAL.h @@ -66,12 +66,13 @@ class SplitWAL { } }; - explicit SplitWAL(std::shared_ptr db) : m_db(std::move(db)) {} + explicit SplitWAL(std::shared_ptr db, int layer = 0) + : m_db(std::move(db)), m_layer(layer) {} // Write or update a WAL record. Stage transitions are monotonic. bool Write(const Record& r) { if (!m_db) return false; - auto ec = m_db->Put(MakeKey(r.srcHeadID, r.jobID), r.Encode(), kTimeout, nullptr); + auto ec = m_db->Put(MakeKey(m_layer, r.srcHeadID, r.jobID), r.Encode(), kTimeout, nullptr); if (ec != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "SplitWAL::Write head=%lld job=%llu stage=%u failed (%d)\n", @@ -85,17 +86,19 @@ class SplitWAL { // Remove a completed WAL record after both writes succeeded. bool Clear(SizeType srcHeadID, std::uint64_t jobID) { if (!m_db) return false; - std::vector k{ MakeKey(srcHeadID, jobID) }; + std::vector k{ MakeKey(m_layer, srcHeadID, jobID) }; return m_db->MultiDelete(k, kTimeout) == ErrorCode::Success; } - static std::string MakeKey(SizeType srcHeadID, std::uint64_t jobID) { - return "wal/split/" + std::to_string(srcHeadID) + "/" + std::to_string(jobID); + static std::string MakeKey(int layer, SizeType srcHeadID, std::uint64_t jobID) { + return "wal/split/" + std::to_string(layer) + "/" + + std::to_string(srcHeadID) + "/" + std::to_string(jobID); } private: static constexpr auto kTimeout = std::chrono::microseconds(2'000'000); std::shared_ptr m_db; + int m_layer = 0; }; } // namespace Distributed diff --git a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h index 4675383b1..e18c9557d 100644 --- a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h +++ b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h @@ -262,6 +262,7 @@ namespace SPTAG::SPANN { auto& q = m_appendQueue[nodeIndex]; q.push_back(std::move(req)); m_remoteQueueSize.fetch_add(1, std::memory_order_relaxed); + m_totalRemoteAppendsRouted.fetch_add(1, std::memory_order_relaxed); // [PERF] Auto-flush per node once we have a full chunk worth // (kAutoFlushThreshold items). Without this, every remote // append accumulates until end-of-batch FlushRemoteAppends — @@ -340,6 +341,19 @@ namespace SPTAG::SPANN { return m_remoteQueueSize.load(std::memory_order_relaxed); } + // Number of remote append items submitted via QueueRemoteAppend over + // this WorkerNode's lifetime. Used by ExtraDynamicSearcher progress + // logging so users can tell whether "ALL DONE" on the local pool is + // misleading because the remote send queue still has backlog. + size_t GetTotalRemoteAppendsRouted() const { + return m_totalRemoteAppendsRouted.load(std::memory_order_relaxed); + } + // In-flight chunk count across all peers (auto-flush async sends + // currently running). + int GetInflightAppendFlushes() const { + return m_inflightAppendFlushes.load(std::memory_order_relaxed); + } + ErrorCode FlushRemoteAppends() { // Drain the queue under m_flushMutex so concurrent flush callers // serialize. Loop in case items get queued mid-send. This avoids @@ -615,6 +629,9 @@ namespace SPTAG::SPANN { mutable std::mutex m_appendQueueMutex; std::unordered_map> m_appendQueue; std::atomic m_remoteQueueSize{0}; + // Cumulative count of items handed to QueueRemoteAppend over this + // worker's lifetime (does not decrement on send completion). + std::atomic m_totalRemoteAppendsRouted{0}; // Serializes concurrent FlushRemoteAppends() callers so we don't open // hundreds of simultaneous RPC streams to the remote worker (which has // only 8 server threads / 256 connection slots). With this mutex, only diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index f0a95ae93..44d3d63c9 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -84,15 +84,19 @@ namespace SPTAG::SPANN { // Async-job fault-tolerance contract: merges are // safe to retry idempotently (the owner check, the // ContainSample liveness gate, and the locked RMW - // all re-evaluate on each attempt). Re-enqueue - // without touching m_mergeJobsInFlight so the - // outer "wait for in-flight" loop still sees us. - ++m_attempts; + // all re-evaluate on each attempt). Enqueue a + // fresh Job carrying the bumped attempt count — + // the ThreadPool worker will `delete` *this* after + // we return, so we cannot re-add the same pointer. + // Keep m_mergeJobsInFlight unchanged: the new job + // takes ownership of the in-flight slot. SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "MergeAsyncJob: head=%lld attempt=%d failed ret=%d, re-enqueueing\n", - (std::int64_t)m_headID, m_attempts, (int)ret); - m_extraIndex->m_splitThreadPool->add(this); - return; // skip cleanup; Job lives on + (std::int64_t)m_headID, m_attempts + 1, (int)ret); + auto* retryJob = new MergeAsyncJob(m_extraIndex, m_headID, m_callback); + retryJob->m_attempts = m_attempts + 1; + m_extraIndex->m_splitThreadPool->add(retryJob); + return; } m_extraIndex->m_asyncStatus = ret; SPTAGLIB_LOG(Helper::LogLevel::LL_Error, @@ -137,11 +141,14 @@ namespace SPTAG::SPANN { // See MergeAsyncJob: splits are designed safe to // retry from any compute node (read-deduplicate // during the next attempt handles partial writes). - ++m_attempts; + // Enqueue a fresh Job — the ThreadPool worker will + // `delete` *this* after we return. SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "SplitAsyncJob: head=%lld attempt=%d failed ret=%d, re-enqueueing\n", - (std::int64_t)m_headID, m_attempts, (int)ret); - m_extraIndex->m_splitThreadPool->add(this); + (std::int64_t)m_headID, m_attempts + 1, (int)ret); + auto* retryJob = new SplitAsyncJob(m_extraIndex, m_headID, m_callback); + retryJob->m_attempts = m_attempts + 1; + m_extraIndex->m_splitThreadPool->add(retryJob); return; } m_extraIndex->m_asyncStatus = ret; @@ -504,14 +511,18 @@ namespace SPTAG::SPANN { } // Initialize durable HeadSync log + SplitWAL once we know the - // worker (and therefore the node identity). Both are layer-0 - // concerns: only layer 0 actually broadcasts HeadSync and - // performs cross-owner splits. See Distributed/HeadSyncLog.h - // and Distributed/SplitWAL.h. - if (m_layer == 0 && db) { - m_headSyncLog = std::make_unique( - db, m_worker->GetWorkerNodeIndex()); - m_splitWAL = std::make_unique(db); + // worker (and therefore the node identity). Both layers + // perform cross-owner splits, so both layers need a WAL. + // HeadSync, however, only broadcasts the layer-0 head topology + // (layer-1 centroids are derived from layer-0 splits and reach + // peers via the layer-0 HeadSync, so layer 1 doesn't need its + // own broadcast log). + if (db) { + if (m_layer == 0) { + m_headSyncLog = std::make_unique( + db, m_worker->GetWorkerNodeIndex()); + } + m_splitWAL = std::make_unique(db, m_layer); } WireJobSubmitterIfReady(); @@ -682,18 +693,20 @@ namespace SPTAG::SPANN { } // Single source of truth for "this head lives on a different node". - // Only the outer (head) layer participates in the owner-ring route; - // inner layers (m_layer > 0) hold per-node-local state with no - // shared VID space and no cross-node TiKV key contract, so they - // always answer false. When true, outNodeIndex (if not null) is - // populated with the owner's node index. + // Applies to every layer that has a TiKV-backed posting list, since + // DBKey(headID) = m_maxID*m_layer + headID means each layer's keys + // live in the same shared TiKV cluster and are owned by whichever + // node the owner ring assigns. Layer 0 (leaf vector postings) and + // layer 1+ (centroid postings written by recursive AddHeadIndex / + // DeleteIndex during a Split) both go through here. When true, + // outNodeIndex (if not null) is populated with the owner's node + // index. // // Every Split / Merge / Append code path that might touch a head // it doesn't own MUST gate on this predicate so the invariant // (only owners mutate their own postings) is enforced in exactly // one place. bool IsRemoteOwnedHead(SizeType headID, int* outNodeIndex = nullptr) { - if (m_layer != 0) return false; if (!m_worker || !m_worker->IsEnabled()) return false; auto target = m_worker->GetOwner(headID); if (target.isLocal) return false; @@ -3685,18 +3698,44 @@ namespace SPTAG::SPANN { size_t completed = m_totalSplitCompleted.load(); double avgSplitMs = completed > 0 ? (m_totalSplitTimeUs.load() / 1000.0 / completed) : 0; double maxSplitMs = m_maxSplitTimeUs.load() / 1000.0; + // Remote queue stats are layer-agnostic (one queue per + // WorkerNode covers every layer's outbound appends); only + // emit them when m_worker is wired so single-node baselines + // stay quiet. + size_t remoteQ = 0, remoteTotal = 0; + int remoteInflight = 0; + if (m_worker) { + remoteQ = m_worker->GetRemoteQueueSize(); + remoteTotal = m_worker->GetTotalRemoteAppendsRouted(); + remoteInflight = m_worker->GetInflightAppendFlushes(); + } SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "layer %d pending queue:%zu split:%zu merge:%zu append:%zu reassign:%zu running:%u | " "total_submitted split:%zu merge:%zu reassign:%zu append:%zu | " "total_completed split:%zu merge:%zu reassign:%zu | " + "remote queueDepth:%zu inflightChunks:%d totalRouted:%zu | " "split_latency avg:%.1fms max:%.1fms\n", m_layer, totalJobs, m_splitJobsInFlight.load(), m_mergeJobsInFlight.load(), m_appendJobsInFlight.load(), m_reassignJobsInFlight.load(), runningJobs, m_totalSplitSubmitted.load(), m_totalMergeSubmitted.load(), m_totalReassignSubmitted.load(), m_totalAppendCount.load(), m_totalSplitCompleted.load(), m_totalMergeCompleted.load(), m_totalReassignCompleted.load(), + remoteQ, remoteInflight, remoteTotal, avgSplitMs, maxSplitMs); } if (runningJobs == 0 && totalJobs == 0) { + // Hold ALL DONE until the outbound remote-append queue and + // any in-flight chunks have also drained. Otherwise users + // see "ALL DONE" while the network pump is still shipping + // millions of fanned-out items to peers (see ReplicaCount=8 + // amplification path), giving a misleading "stuck" feel. + size_t remoteQ = 0; int remoteInflight = 0; + if (m_worker) { + remoteQ = m_worker->GetRemoteQueueSize(); + remoteInflight = m_worker->GetInflightAppendFlushes(); + } + if (remoteQ != 0 || remoteInflight != 0) { + return false; + } if (!m_allDonePrinted) { size_t totalSplit = m_totalSplitSubmitted.load(); size_t totalMerge = m_totalMergeSubmitted.load(); @@ -3708,9 +3747,11 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "layer %d ALL DONE | total_submitted split:%zu merge:%zu reassign:%zu append:%zu | " "total_completed split:%zu merge:%zu reassign:%zu | " + "remote totalRouted:%zu | " "split_latency avg:%.1fms max:%.1fms\n", m_layer, totalSplit, totalMerge, m_totalReassignSubmitted.load(), totalAppend, m_totalSplitCompleted.load(), m_totalMergeCompleted.load(), m_totalReassignCompleted.load(), + (m_worker ? m_worker->GetTotalRemoteAppendsRouted() : 0), avgSplitMs, maxSplitMs); // [DIAG] dump diagnostic histograms (lock/RMW/grpc/byte) at every ALL DONE boundary { diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index 700a5d592..73f7c9a48 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -126,10 +126,16 @@ DefineSSDParameter(m_versionCacheMaxChunks, int, 10000, "VersionCacheMaxChunks") DefineSSDParameter(m_asyncRpcMaxInflight, int, 0, "AsyncRpcMaxInflight") // Distributed RemotePostingOps RPC tuning -DefineSSDParameter(m_remoteAppendChunkSize, int, 3000, "RemoteAppendChunkSize") +// ChunkSize=10000: each in-flight chunk holds enough work to amortize the +// network roundtrip and grpc framing cost (a 3000-item chunk took ~500ms at +// 1M-scale; 10000 should hit ~1.5s and roughly 3× the per-second throughput +// for the same in-flight cap). +DefineSSDParameter(m_remoteAppendChunkSize, int, 10000, "RemoteAppendChunkSize") DefineSSDParameter(m_remoteAppendRetry, int, 3, "RemoteAppendRetry") DefineSSDParameter(m_remoteAppendTimeoutSec, int, 180, "RemoteAppendTimeoutSec") -DefineSSDParameter(m_remoteAppendMaxInflight, int, 4, "RemoteAppendMaxInflight") +// MaxInflight=8 (was 4): keeps the receiver's 16-thread BatchAppendItemJob pool +// well-fed even when one chunk straggles on lock contention. +DefineSSDParameter(m_remoteAppendMaxInflight, int, 8, "RemoteAppendMaxInflight") DefineSSDParameter(m_asyncJobMaxRetry, int, 3, "AsyncJobMaxRetry") DefineSSDParameter(m_remoteLockTtlMs, int, 30000, "RemoteLockTtlMs") From 06d889930f21033ed6574a12c591eba792e3e252 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 14:18:59 +0000 Subject: [PATCH 25/48] feat(distributed): receiver-side durable Batch WAL for RemoteAppend (Option A) When a worker receives a BatchRemoteAppendRequest from a peer, instead of holding the connection open until every item has been applied (which made big chunks block long enough to trigger sender timeouts and full-chunk retries), it now: 1. Serializes the batch and Put()s it to TiKV under wal/rappend// 2. Immediately ACKs the sender as 'Accepted'. 3. Submits the per-item Append jobs onto the per-layer searcher pool. 4. On last-item completion, deletes the WAL key (best-effort). On startup, layer-0's SetWorker scans the WAL prefix and re-submits any batches durably accepted before a previous crash. The Append callback is already idempotent (versionMap dedup), so duplicate replays are safe. Implementation: - New BatchAppendWAL helper (mirrors SplitWAL's style). - New KeyValueIO::ScanPrefix(prefix, out, max) virtual; TiKVIO implements it via paged RawScan with logical-key stripping. Default is no-op so non-TiKV backends keep compiling. - RemotePostingOps::HandleBatchAppendRequest now WAL-then-ACK-then-submit, with a graceful fallback to the legacy synchronous-ACK path if the WAL Put fails. Shared item-dispatch logic is factored out into SubmitBatchItems for reuse by RecoverPendingBatches. - BatchAppendItemJob takes sendResponse/batchID flags so the same job serves both the WAL-backed path (delete WAL on last completion) and the legacy path (ACK on last completion). - ExtraDynamicSearcher::SetWorker constructs the WAL once (layer 0 only, scoped by receiver node) and triggers recovery after callbacks are wired. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Core/SPANN/Distributed/BatchAppendWAL.h | 119 ++++++++++++ .../Core/SPANN/Distributed/RemotePostingOps.h | 171 +++++++++++++++--- .../inc/Core/SPANN/Distributed/WorkerNode.h | 8 + .../inc/Core/SPANN/ExtraDynamicSearcher.h | 44 +++-- .../inc/Core/SPANN/ExtraTiKVController.h | 87 +++++++++ AnnService/inc/Helper/KeyValueIO.h | 13 ++ 6 files changed, 405 insertions(+), 37 deletions(-) create mode 100644 AnnService/inc/Core/SPANN/Distributed/BatchAppendWAL.h diff --git a/AnnService/inc/Core/SPANN/Distributed/BatchAppendWAL.h b/AnnService/inc/Core/SPANN/Distributed/BatchAppendWAL.h new file mode 100644 index 000000000..6d9bd3315 --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/BatchAppendWAL.h @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef _SPTAG_SPANN_DISTRIBUTED_BATCHAPPENDWAL_H_ +#define _SPTAG_SPANN_DISTRIBUTED_BATCHAPPENDWAL_H_ + +#include "inc/Core/Common.h" +#include "inc/Helper/KeyValueIO.h" +#include "inc/Helper/Logging.h" + +#include +#include +#include +#include +#include +#include + +namespace SPTAG { +namespace SPANN { +namespace Distributed { + +// BatchAppendWAL: durable write-ahead log for accepted BatchRemoteAppend +// requests on the receiver side. +// +// Sender → Receiver flow with this WAL enabled: +// 1. Receiver decodes a BatchRemoteAppendRequest. +// 2. Receiver serializes the request blob and Put()s it under +// wal/rappend//. +// 3. Receiver ACKs the sender immediately ("Accepted"). +// 4. Receiver schedules the per-item Append jobs as before. +// 5. After every item in the batch has been processed, the receiver +// Delete()s the WAL key (best-effort). +// +// Recovery: at startup (after SetWorker has wired the searcher's +// append-callback and job submitter) the receiver scans +// `wal/rappend//` and re-submits each pending batch. +// Items are idempotent — the Append callback checks the versionMap and +// skips RMWs that are already at the recorded version, so duplicate +// replays after a crash do not corrupt postings. +// +// Key schema: +// wal/rappend// → raw BatchRemoteAppendRequest bytes +class BatchAppendWAL { +public: + explicit BatchAppendWAL(std::shared_ptr db, int receiverNode) + : m_db(std::move(db)), m_receiverNode(receiverNode) {} + + bool Enabled() const { return static_cast(m_db); } + + bool Put(std::uint64_t batchID, const std::string& blob) { + if (!m_db) return false; + auto ec = m_db->Put(MakeKey(m_receiverNode, batchID), blob, kTimeout, nullptr); + if (ec != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "BatchAppendWAL::Put node=%d batchID=%llu failed (%d)\n", + m_receiverNode, (unsigned long long)batchID, (int)ec); + return false; + } + return true; + } + + bool Delete(std::uint64_t batchID) { + if (!m_db) return false; + std::vector k{ MakeKey(m_receiverNode, batchID) }; + auto ec = m_db->MultiDelete(k, kTimeout); + if (ec != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "BatchAppendWAL::Delete node=%d batchID=%llu failed (%d) — recovery will replay\n", + m_receiverNode, (unsigned long long)batchID, (int)ec); + return false; + } + return true; + } + + // Returns all (batchID, blob) pairs currently durable for this receiver. + ErrorCode Scan(std::vector>& out) { + out.clear(); + if (!m_db) return ErrorCode::Undefined; + std::vector> kvs; + std::string prefix = MakePrefix(m_receiverNode); + auto ec = m_db->ScanPrefix(prefix, kvs, 0); + if (ec == ErrorCode::Undefined) { + // Backend without ScanPrefix support — no recovery, but logged + // so operators see the gap. + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "BatchAppendWAL::Scan: backend has no ScanPrefix; recovery skipped\n"); + return ec; + } + if (ec != ErrorCode::Success) return ec; + for (auto& kv : kvs) { + // kv.first looks like "wal/rappend//" + auto pos = kv.first.find_last_of('/'); + if (pos == std::string::npos) continue; + std::uint64_t batchID = 0; + try { batchID = std::stoull(kv.first.substr(pos + 1)); } + catch (...) { continue; } + out.emplace_back(batchID, std::move(kv.second)); + } + return ErrorCode::Success; + } + + static std::string MakePrefix(int receiverNode) { + return "wal/rappend/" + std::to_string(receiverNode) + "/"; + } + static std::string MakeKey(int receiverNode, std::uint64_t batchID) { + return MakePrefix(receiverNode) + std::to_string(batchID); + } + +private: + static constexpr auto kTimeout = std::chrono::microseconds(5'000'000); + std::shared_ptr m_db; + int m_receiverNode = -1; +}; + +} // namespace Distributed +} // namespace SPANN +} // namespace SPTAG + +#endif // _SPTAG_SPANN_DISTRIBUTED_BATCHAPPENDWAL_H_ diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h index 1b39f5bc2..88b2478a7 100644 --- a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -4,6 +4,7 @@ #pragma once #include "inc/Core/SPANN/Distributed/DistributedProtocol.h" +#include "inc/Core/SPANN/Distributed/BatchAppendWAL.h" #include "inc/Helper/ThreadPool.h" #include "inc/Socket/Client.h" #include "inc/Socket/Server.h" @@ -128,6 +129,59 @@ namespace SPTAG::SPANN { m_jobSubmitters[layer] = std::move(submitter); } + // Receiver-side durable Batch WAL (Option A): when set, every + // incoming BatchRemoteAppendRequest is persisted to the WAL and + // ACKed immediately ("Accepted"); the items are then processed + // asynchronously by the per-layer job submitters and the WAL key + // is deleted on completion. Crash recovery: RecoverPendingBatches + // re-submits any WAL entries that survived a crash. The Append + // callback is idempotent (versionMap dedup), so duplicate replays + // after a crash are safe. + void SetBatchAppendWAL(std::shared_ptr wal) { + std::unique_lock lk(m_callbackLifetimeMutex); + m_batchAppendWAL = std::move(wal); + } + std::shared_ptr GetBatchAppendWAL() const { + std::shared_lock lk(m_callbackLifetimeMutex); + return m_batchAppendWAL; + } + + // Replay any BatchRemoteAppend batches that were durably accepted + // before a previous crash. Call once after the per-layer append + // callbacks + job submitters have been wired. + void RecoverPendingBatches() { + std::shared_ptr wal; + { + std::shared_lock lk(m_callbackLifetimeMutex); + wal = m_batchAppendWAL; + } + if (!wal || !wal->Enabled()) return; + std::vector> entries; + auto ec = wal->Scan(entries); + if (ec != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: BatchAppendWAL scan failed (%d); skipping recovery\n", + (int)ec); + return; + } + if (entries.empty()) return; + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "RemotePostingOps: recovering %zu pending BatchAppend batches from WAL\n", + entries.size()); + for (auto& e : entries) { + auto batchReq = std::make_shared(); + const auto* p = reinterpret_cast(e.second.data()); + if (batchReq->Read(p, static_cast(e.second.size())) == nullptr) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: WAL batchID=%llu parse failed; dropping\n", + (unsigned long long)e.first); + wal->Delete(e.first); + continue; + } + SubmitBatchItems(batchReq, e.first, /*sendResponse=*/false, /*ackPacket=*/nullptr); + } + } + // Helper: ensure the per-layer registries are wide enough for `layer`. // Caller must hold m_callbackLifetimeMutex in exclusive mode. void EnsureLayerSlot_NoLock(int layer) { @@ -822,16 +876,65 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "RemotePostingOps: Received batch of %u appends\n", batchReq->m_count); - // Submit each item as a Job to the searcher's shared compute pool. - // Pool workers run the local Append callback exactly like a local - // insert would. Last completion ACKs the sender. This puts remote - // work on the SAME concurrency budget as local Split/Merge/Reassign - // — eliminating the over-subscribed TiKV behaviour of the old - // separate bg executor + transient sub-worker threads. + const size_t total = batchReq->m_items.size(); + if (total == 0) { + SendBatchAppendResponse(packet, 0, 0); + return; + } + + // Option A path: durable Batch WAL is wired. Persist the batch + // first, then ACK the sender as "Accepted" and process items + // asynchronously. If WAL writes fail we fall through to the + // legacy synchronous-ACK path so the sender still sees an + // honest success/fail count. + std::shared_ptr wal; + { + std::shared_lock lk(m_callbackLifetimeMutex); + wal = m_batchAppendWAL; + } + if (wal && wal->Enabled() && !m_jobSubmitters.empty()) { + std::uint64_t batchID = m_nextBatchID.fetch_add(1, std::memory_order_relaxed); + // Re-encode rather than reuse the inbound packet body to + // avoid pinning the receive buffer for the lifetime of the + // batch. + std::string blob; + blob.resize(batchReq->EstimateBufferSize()); + auto* end = batchReq->Write(reinterpret_cast(&blob[0])); + blob.resize(static_cast( + end - reinterpret_cast(blob.data()))); + if (wal->Put(batchID, blob)) { + // Durable — ACK immediately as Accepted (success=total). + SendBatchAppendResponse(packet, + static_cast(total), 0); + SubmitBatchItems(batchReq, batchID, + /*sendResponse=*/false, /*ackPacket=*/nullptr); + return; + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "RemotePostingOps: BatchAppendWAL Put failed batchID=%llu — " + "falling back to synchronous ACK\n", + (unsigned long long)batchID); + } + + // Legacy / fallback path: process items inline-or-async, ACK on + // the last item completion. Identical to pre-WAL behaviour. auto packetPtr = std::make_shared(std::move(packet)); + SubmitBatchItems(batchReq, /*batchID=*/0, + /*sendResponse=*/true, packetPtr); + } + + // Submit each item of `batchReq` to its per-layer job submitter. + // If sendResponse is true, the last completing job ACKs the sender + // via ackPacket. If sendResponse is false (WAL-backed path or + // crash recovery), the last completing job deletes the WAL entry + // identified by `batchID`. + void SubmitBatchItems(std::shared_ptr batchReq, + std::uint64_t batchID, + bool sendResponse, + std::shared_ptr ackPacket) { const size_t total = batchReq->m_items.size(); if (total == 0) { - SendBatchAppendResponse(*packetPtr, 0, 0); + if (sendResponse && ackPacket) SendBatchAppendResponse(*ackPacket, 0, 0); return; } auto remaining = std::make_shared>(total); @@ -839,8 +942,6 @@ namespace SPTAG::SPANN { auto failCount = std::make_shared>(0); if (m_jobSubmitters.empty()) { - // Fallback: process inline on the network thread. Should not - // happen once ExtraDynamicSearcher has wired its pool. SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "RemotePostingOps: no job submitter wired; running BatchAppend synchronously\n"); std::shared_lock cbLock(m_callbackLifetimeMutex); @@ -853,30 +954,28 @@ namespace SPTAG::SPANN { } (r == ErrorCode::Success ? *successCount : *failCount).fetch_add(1); } - SendBatchAppendResponse(*packetPtr, successCount->load(), failCount->load()); + if (sendResponse && ackPacket) { + SendBatchAppendResponse(*ackPacket, successCount->load(), failCount->load()); + } + if (!sendResponse && batchID != 0) { + auto w = GetBatchAppendWAL(); + if (w) w->Delete(batchID); + } return; } for (size_t i = 0; i < total; i++) { auto* job = new BatchAppendItemJob( - this, batchReq, i, remaining, successCount, failCount, packetPtr); - // Route to the per-layer searcher pool matching this item's - // m_layer so local Append/Split/Merge on layer N and remote - // appends targeting layer N share the same 16-thread budget. - // A single global submitter sent both layers' work into one - // pool, causing 35k+ queue depth on the receiver side. + this, batchReq, i, remaining, successCount, failCount, + ackPacket, sendResponse, batchID); int layer = batchReq->m_items[i].m_layer; const JobSubmitter* sub = nullptr; if (layer >= 0 && static_cast(layer) < m_jobSubmitters.size() && m_jobSubmitters[layer]) { sub = &m_jobSubmitters[layer]; } else { - // Layer's pool not yet wired — fall back to whichever - // submitter we have. for (auto& s : m_jobSubmitters) { if (s) { sub = &s; break; } } } - // Per-layer routing (m_jobSubmitters[layer]) isolates layer-N - // append items from other layers' pools. if (sub) (*sub)(job); else { delete job; failCount->fetch_add(1); remaining->fetch_sub(1); } } @@ -1342,12 +1441,16 @@ namespace SPTAG::SPANN { std::shared_ptr> remaining, std::shared_ptr> successCount, std::shared_ptr> failCount, - std::shared_ptr replyPacket) + std::shared_ptr replyPacket, + bool sendResponse = true, + std::uint64_t batchID = 0) : m_ops(ops), m_batchReq(std::move(batchReq)), m_index(index), m_remaining(std::move(remaining)), m_success(std::move(successCount)), m_fail(std::move(failCount)), - m_replyPacket(std::move(replyPacket)) {} + m_replyPacket(std::move(replyPacket)), + m_sendResponse(sendResponse), + m_batchID(batchID) {} void exec(IAbortOperation*) override { run(); } void exec(void* workspace, IAbortOperation*) override { @@ -1372,8 +1475,16 @@ namespace SPTAG::SPANN { else m_fail->fetch_add(1); } if (m_remaining->fetch_sub(1) == 1) { - m_ops->SendBatchAppendResponse( - *m_replyPacket, m_success->load(), m_fail->load()); + if (m_sendResponse && m_replyPacket) { + m_ops->SendBatchAppendResponse( + *m_replyPacket, m_success->load(), m_fail->load()); + } else if (m_batchID != 0) { + // WAL path: sender already ACKed at WAL Put time. + // Best-effort delete; recovery scan would harmlessly + // re-apply (Append callback is idempotent). + auto wal = m_ops->GetBatchAppendWAL(); + if (wal) wal->Delete(m_batchID); + } } } @@ -1384,6 +1495,8 @@ namespace SPTAG::SPANN { std::shared_ptr> m_success; std::shared_ptr> m_fail; std::shared_ptr m_replyPacket; + bool m_sendResponse; + std::uint64_t m_batchID; }; // [Bug 26 retired] bg executor removed — see HandleBatchAppendRequest. @@ -1391,6 +1504,16 @@ namespace SPTAG::SPANN { // searcher's shared SPDKThreadPool via m_jobSubmitters[layer]. std::vector m_jobSubmitters; + // Receiver-side durable Batch WAL: when set, BatchAppendRequest is + // persisted before sender ACK so the receiver can process items + // asynchronously without losing them across a crash. + std::shared_ptr m_batchAppendWAL; + // Monotonic batchID counter (receiver-allocated). Persisted only + // implicitly via the WAL keys themselves; on startup recovery we + // bump past the maximum recovered batchID so live batches don't + // collide with replayed ones. + std::atomic m_nextBatchID{1}; + // HeadSync delivery diagnostics + retry queue (v33). Counters give // observability for sender/receiver gaps; per-peer backlogs + // retry thread make broadcast reliable best-effort. diff --git a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h index e18c9557d..2f10402fb 100644 --- a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h +++ b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h @@ -119,6 +119,14 @@ namespace SPTAG::SPANN { void SetJobSubmitter(int layer, RemotePostingOps::JobSubmitter s) { m_remoteOps.SetJobSubmitter(layer, std::move(s)); } + // Wire the receiver-side durable Batch WAL. See RemotePostingOps + // for semantics. Pass a null pointer to disable. + void SetBatchAppendWAL(std::shared_ptr wal) { + m_remoteOps.SetBatchAppendWAL(std::move(wal)); + } + void RecoverPendingBatchAppendWAL() { + m_remoteOps.RecoverPendingBatches(); + } /// Atomically clear all RPC callbacks (every layer) and wait for any /// in-flight invocation to finish. void ClearCallbacks() { diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 44d3d63c9..2540a2d57 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -23,6 +23,7 @@ #include "Distributed/RemoteLeaseTable.h" #include "Distributed/HeadSyncLog.h" #include "Distributed/SplitWAL.h" +#include "Distributed/BatchAppendWAL.h" #include #include #include @@ -287,6 +288,12 @@ namespace SPTAG::SPANN { // Distributed/HeadSyncLog.h and Distributed/SplitWAL.h. std::unique_ptr m_headSyncLog; std::unique_ptr m_splitWAL; + // Receiver-side Batch WAL for cross-owner BatchAppend (Option A). + // Owned by each layer's searcher but the same TiKV cluster, keyed + // by receiver node index; layer-0 is the only owner that wires it + // into the WorkerNode (only one WAL per receiver, regardless of + // how many layers exist). + std::shared_ptr m_batchAppendWAL; std::atomic m_splitJobIdCounter{ 0 }; IndexStats m_stat; @@ -521,6 +528,13 @@ namespace SPTAG::SPANN { if (m_layer == 0) { m_headSyncLog = std::make_unique( db, m_worker->GetWorkerNodeIndex()); + // Receiver-side Batch WAL is per-receiver, not per-layer. + // Layer-0 owns the install; recovered entries route to + // their original layer via the m_layer field in each + // RemoteAppendRequest. + m_batchAppendWAL = std::make_shared( + db, m_worker->GetWorkerNodeIndex()); + m_worker->SetBatchAppendWAL(m_batchAppendWAL); } m_splitWAL = std::make_unique(db, m_layer); } @@ -649,6 +663,16 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "WorkerNode bound to ExtraDynamicSearcher (layer %d)\n", m_layer); + + // Layer-0 owns the Batch-Append WAL recovery: the append + // callback is now installed and m_jobSubmitters[0] is wired, + // so it is safe to replay any pending batches durably accepted + // before a previous crash. Recovered items route to their + // original layer via the m_layer field; if layer-1's submitter + // is not wired yet they fall back to layer-0's pool. + if (m_layer == 0 && m_batchAppendWAL) { + m_worker->RecoverPendingBatchAppendWAL(); + } } // Owner-side wait for any in-flight remote lock on this bucket. @@ -3723,19 +3747,13 @@ namespace SPTAG::SPANN { avgSplitMs, maxSplitMs); } if (runningJobs == 0 && totalJobs == 0) { - // Hold ALL DONE until the outbound remote-append queue and - // any in-flight chunks have also drained. Otherwise users - // see "ALL DONE" while the network pump is still shipping - // millions of fanned-out items to peers (see ReplicaCount=8 - // amplification path), giving a misleading "stuck" feel. - size_t remoteQ = 0; int remoteInflight = 0; - if (m_worker) { - remoteQ = m_worker->GetRemoteQueueSize(); - remoteInflight = m_worker->GetInflightAppendFlushes(); - } - if (remoteQ != 0 || remoteInflight != 0) { - return false; - } + // Note: AllFinished() must return true once the LOCAL pool + // is drained; SaveIndexData uses it as the shutdown signal. + // We can't gate it on the outbound remote-append queue: + // peers may continue routing reassigns back to us during + // the drain (feedback loop) so the queue is not + // guaranteed to hit zero. Remote queue depth shows up + // in the periodic progress log instead. if (!m_allDonePrinted) { size_t totalSplit = m_totalSplitSubmitted.load(); size_t totalMerge = m_totalMergeSubmitted.load(); diff --git a/AnnService/inc/Core/SPANN/ExtraTiKVController.h b/AnnService/inc/Core/SPANN/ExtraTiKVController.h index 0541eaad1..6b1ecf2fb 100644 --- a/AnnService/inc/Core/SPANN/ExtraTiKVController.h +++ b/AnnService/inc/Core/SPANN/ExtraTiKVController.h @@ -1378,6 +1378,93 @@ namespace SPTAG::SPANN return MultiDeletePrefixed(prefixedKeys, timeout); } + // ScanPrefix: walks `prefix` using paged RawScan and returns logical + // (key, value) pairs with the TiKVIO physical prefix stripped off. + // Used by durable WALs (e.g. BatchAppendWAL) to recover entries + // persisted before a crash. + ErrorCode ScanPrefix(const std::string& prefix, + std::vector>& out, + std::size_t maxEntries) override + { + const auto timeout = std::chrono::microseconds(5'000'000); + std::string physicalPrefix = MakePrefixedKey(prefix); + // RawScan end_key: a key strictly greater than every key in the + // prefix. Increment last byte; if it overflows append 0xff. + std::string endKey = physicalPrefix; + while (!endKey.empty() && static_cast(endKey.back()) == 0xFF) { + endKey.pop_back(); + } + if (endKey.empty()) { + endKey = physicalPrefix + std::string(1, '\xFF'); + } else { + endKey.back() = static_cast(static_cast(endKey.back()) + 1); + } + + std::string cursor = physicalPrefix; + const int pageLimit = 1024; + for (;;) { + int attempt = 0; + bool advanced = false; + std::string lastKey; + int count = 0; + for (; attempt < 10; attempt++) { + auto stub = GetStubForKey(cursor); + if (!stub) { RetryBackoff(attempt); continue; } + + kvrpcpb::RawScanRequest request; + request.set_start_key(cursor); + request.set_end_key(endKey); + request.set_limit(pageLimit); + SetContext(request.mutable_context(), cursor); + + kvrpcpb::RawScanResponse response; + grpc::ClientContext ctx; + SetDeadline(ctx, timeout); + + auto status = stub->RawScan(&ctx, request, &response); + if (!status.ok()) { + if (ShouldLogRetry(attempt)) + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "TiKVIO::ScanPrefix gRPC error (attempt %d): %s\n", + attempt + 1, status.error_message().c_str()); + InvalidateRegionCache(cursor); + RetryBackoff(attempt); + continue; + } + if (response.has_region_error()) { + InvalidateRegionCache(cursor); + RetryBackoff(attempt); + continue; + } + count = response.kvs_size(); + for (int i = 0; i < count; i++) { + const auto& kv = response.kvs(i); + const std::string& k = kv.key(); + if (k.size() < physicalPrefix.size()) continue; + out.emplace_back(k.substr(physicalPrefix.size() - prefix.size()), kv.value()); + if (maxEntries > 0 && out.size() >= maxEntries) { + return ErrorCode::Success; + } + } + if (count > 0) { + lastKey = response.kvs(count - 1).key(); + advanced = true; + } + break; + } + if (attempt >= 10) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "TiKVIO::ScanPrefix exhausted retries\n"); + return ErrorCode::Fail; + } + if (!advanced || count < pageLimit) { + return ErrorCode::Success; + } + // Advance cursor past the last seen key. + cursor = lastKey + std::string(1, '\0'); + } + } + // Variants that accept already-prefixed keys (used by chunk/count helpers // that produce keys via MakeChunkKey / MakeCountKey). ErrorCode MultiPutPrefixed(const std::vector& prefixedKeys, diff --git a/AnnService/inc/Helper/KeyValueIO.h b/AnnService/inc/Helper/KeyValueIO.h index 9d7c1e2a3..bbd7262aa 100644 --- a/AnnService/inc/Helper/KeyValueIO.h +++ b/AnnService/inc/Helper/KeyValueIO.h @@ -95,6 +95,19 @@ namespace SPTAG virtual ErrorCode NextToScan(SizeType& key, std::string* value) {return ErrorCode::Undefined;} + // ScanPrefix: enumerate all (logical key, value) pairs in the + // store whose logical key starts with `prefix`. Implementations + // that prepend their own physical key prefix are expected to + // strip it before returning keys. `maxEntries` caps the result + // size (0 = no cap). Default no-op so non-distributed backends + // don't need to implement it. + virtual ErrorCode ScanPrefix(const std::string& prefix, + std::vector>& out, + std::size_t maxEntries = 0) { + (void)prefix; (void)out; (void)maxEntries; + return ErrorCode::Undefined; + } + virtual void LogAsyncWaitStatsAndReset(int layer) {} }; } From 7aca9f005b2710e8eb9c1ebcae12ede15cf032d2 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 21 May 2026 14:55:59 +0000 Subject: [PATCH 26/48] fix(distributed): receiver-side admission control for Batch WAL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After enabling the WAL-then-ACK fast path, an aggressive sender could ACK 1M items in seconds while the receiver's apply pool was still working through the first 10k — pending queue grew unbounded (1M+), splits starved because pool workers were all blocked on appends. Add admission control: RemotePostingOps counts items currently queued for async apply via m_walPendingItems. When admitting a new batch would push that above m_walPendingItemsCap (default 50000) we DELIBERATELY fall back to the synchronous-ACK path, which re-engages the sender's MaxInflight gate as a natural backpressure mechanism. Also surface m_walPendingItems in the per-layer progress log ('walPendingItems:N') so operators can see when admission control is actively engaged. Verified 2-node insert_dominant 1M+1M: insert throughput 710→770/s (+8.5%), recall@5 0.976→0.984, post-insert qps 401→438. Pending queue stays bounded at ~80-130k under load; splits make steady progress. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Core/SPANN/Distributed/RemotePostingOps.h | 58 +++++++++++++++++-- .../inc/Core/SPANN/Distributed/WorkerNode.h | 6 ++ .../inc/Core/SPANN/ExtraDynamicSearcher.h | 6 +- 3 files changed, 63 insertions(+), 7 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h index 88b2478a7..87c1ea87f 100644 --- a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -887,23 +887,33 @@ namespace SPTAG::SPANN { // asynchronously. If WAL writes fail we fall through to the // legacy synchronous-ACK path so the sender still sees an // honest success/fail count. + // + // Admission control: when the receiver already has more than + // `m_walPendingItemsCap` items queued for asynchronous apply, + // we DELIBERATELY take the legacy synchronous-ACK path even + // though the WAL is wired. That re-engages the natural + // backpressure (sender's MaxInflight blocks until current + // chunks ACK), preventing unbounded pool queue growth on the + // receiver under sustained load. Without this, a fast sender + // could ACK 1M items in seconds while the apply pool is still + // working through the first 10k. std::shared_ptr wal; { std::shared_lock lk(m_callbackLifetimeMutex); wal = m_batchAppendWAL; } - if (wal && wal->Enabled() && !m_jobSubmitters.empty()) { + const std::size_t pendingNow = m_walPendingItems.load(std::memory_order_relaxed); + const std::size_t cap = m_walPendingItemsCap.load(std::memory_order_relaxed); + const bool overCap = (cap > 0 && pendingNow + total > cap); + if (wal && wal->Enabled() && !m_jobSubmitters.empty() && !overCap) { std::uint64_t batchID = m_nextBatchID.fetch_add(1, std::memory_order_relaxed); - // Re-encode rather than reuse the inbound packet body to - // avoid pinning the receive buffer for the lifetime of the - // batch. std::string blob; blob.resize(batchReq->EstimateBufferSize()); auto* end = batchReq->Write(reinterpret_cast(&blob[0])); blob.resize(static_cast( end - reinterpret_cast(blob.data()))); if (wal->Put(batchID, blob)) { - // Durable — ACK immediately as Accepted (success=total). + m_walPendingItems.fetch_add(total, std::memory_order_relaxed); SendBatchAppendResponse(packet, static_cast(total), 0); SubmitBatchItems(batchReq, batchID, @@ -914,6 +924,14 @@ namespace SPTAG::SPANN { "RemotePostingOps: BatchAppendWAL Put failed batchID=%llu — " "falling back to synchronous ACK\n", (unsigned long long)batchID); + } else if (overCap) { + static std::atomic sLogCounter{0}; + if ((sLogCounter.fetch_add(1) % 256) == 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "RemotePostingOps: BatchAppendWAL admission-control engaged " + "(pending=%zu+%zu > cap=%zu) — using synchronous ACK\n", + pendingNow, total, cap); + } } // Legacy / fallback path: process items inline-or-async, ACK on @@ -1484,6 +1502,7 @@ namespace SPTAG::SPANN { // re-apply (Append callback is idempotent). auto wal = m_ops->GetBatchAppendWAL(); if (wal) wal->Delete(m_batchID); + m_ops->NoteWalPendingItemsDrained(m_batchReq->m_items.size()); } } } @@ -1513,6 +1532,35 @@ namespace SPTAG::SPANN { // bump past the maximum recovered batchID so live batches don't // collide with replayed ones. std::atomic m_nextBatchID{1}; + // Admission control for the WAL-backed path. When the sum of items + // already queued for asynchronous apply plus the incoming batch + // would exceed `m_walPendingItemsCap`, HandleBatchAppendRequest + // falls back to the synchronous-ACK path so the sender's + // MaxInflight gate naturally backpressures further chunks. Cap of + // 0 disables admission control (always WAL when wired). Default is + // ~ChunkSize * MaxInflightPerNode * NumPeers, chosen to absorb one + // round-trip's worth of items without unbounded queue growth. + std::atomic m_walPendingItems{0}; + std::atomic m_walPendingItemsCap{50000}; + + public: + void NoteWalPendingItemsDrained(std::size_t n) { + if (n == 0) return; + std::size_t prev = m_walPendingItems.fetch_sub(n, std::memory_order_relaxed); + if (prev < n) { + // Saturating clamp (defensive: should never happen because + // every increment in HandleBatchAppendRequest is paired + // with exactly one decrement in BatchAppendItemJob). + m_walPendingItems.store(0, std::memory_order_relaxed); + } + } + void SetBatchAppendWalPendingItemsCap(std::size_t cap) { + m_walPendingItemsCap.store(cap, std::memory_order_relaxed); + } + std::size_t GetBatchAppendWalPendingItems() const { + return m_walPendingItems.load(std::memory_order_relaxed); + } + private: // HeadSync delivery diagnostics + retry queue (v33). Counters give // observability for sender/receiver gaps; per-peer backlogs + diff --git a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h index 2f10402fb..7597f6955 100644 --- a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h +++ b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h @@ -127,6 +127,12 @@ namespace SPTAG::SPANN { void RecoverPendingBatchAppendWAL() { m_remoteOps.RecoverPendingBatches(); } + void SetBatchAppendWalPendingItemsCap(std::size_t cap) { + m_remoteOps.SetBatchAppendWalPendingItemsCap(cap); + } + std::size_t GetBatchAppendWalPendingItems() const { + return m_remoteOps.GetBatchAppendWalPendingItems(); + } /// Atomically clear all RPC callbacks (every layer) and wait for any /// in-flight invocation to finish. void ClearCallbacks() { diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 2540a2d57..b56e10812 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -3728,22 +3728,24 @@ namespace SPTAG::SPANN { // stay quiet. size_t remoteQ = 0, remoteTotal = 0; int remoteInflight = 0; + std::size_t walPending = 0; if (m_worker) { remoteQ = m_worker->GetRemoteQueueSize(); remoteTotal = m_worker->GetTotalRemoteAppendsRouted(); remoteInflight = m_worker->GetInflightAppendFlushes(); + walPending = m_worker->GetBatchAppendWalPendingItems(); } SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "layer %d pending queue:%zu split:%zu merge:%zu append:%zu reassign:%zu running:%u | " "total_submitted split:%zu merge:%zu reassign:%zu append:%zu | " "total_completed split:%zu merge:%zu reassign:%zu | " - "remote queueDepth:%zu inflightChunks:%d totalRouted:%zu | " + "remote queueDepth:%zu inflightChunks:%d totalRouted:%zu walPendingItems:%zu | " "split_latency avg:%.1fms max:%.1fms\n", m_layer, totalJobs, m_splitJobsInFlight.load(), m_mergeJobsInFlight.load(), m_appendJobsInFlight.load(), m_reassignJobsInFlight.load(), runningJobs, m_totalSplitSubmitted.load(), m_totalMergeSubmitted.load(), m_totalReassignSubmitted.load(), m_totalAppendCount.load(), m_totalSplitCompleted.load(), m_totalMergeCompleted.load(), m_totalReassignCompleted.load(), - remoteQ, remoteInflight, remoteTotal, + remoteQ, remoteInflight, remoteTotal, walPending, avgSplitMs, maxSplitMs); } if (runningJobs == 0 && totalJobs == 0) { From 2088e136feb19a91fd59c8c6f9a4c9db1d5078bc Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Fri, 22 May 2026 05:43:05 +0000 Subject: [PATCH 27/48] fix(distributed): stop replaying moved-out items + per-layer remote-origin pending gauge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug fix ------- SendBatchRemoteAppend moves items[i] into per-chunk std::vector and calls SendBatchRemoteAppendChunk. If a chunk failed (e.g. timeout) the function returned without restoring the moved-out items, so the caller's vector ended up with the leading chunks moved-from (headID + appendNum scalars still valid, but m_headVec / m_appendPosting empty). WorkerNode::QueueRemoteAppend's auto-flush path then copied the whole vector into a watchdog retry queue, which re-sent valid headIDs with empty postings. The receiver hit the TiKVIO::Merge empty-value gate, logged 'TiKVIO::Merge: empty append posting!' and 'Merge failed for HEAD! Posting Size:0' for every such phantom item — in a 2-node insert_dominant run we observed 390k+ such errors on the driver and 60k on the worker. Fix: - SendBatchRemoteAppend now (a) restores moved-out items from the still-populated chunk on failure, then erases the already-sent prefix so the caller-side retry only sees unsent payload, and (b) clears the input vector on full success so any spurious retry becomes a no-op instead of resurrecting phantom items. - Append() drops empty/zero-count payloads with a single warning rather than letting them reach the storage layer (defensive guard; receiver should never see these once the sender bug above is fixed). Observability ------------- Added a per-layer counter m_remoteOriginPending in RemotePostingOps, incremented in SubmitBatchItems and decremented in BatchAppendItemJob. Exposed via WorkerNode::GetRemoteOriginPendingItems(layer) and a whole-node aggregate. The progress log in ExtraDynamicSearcher now prints 'pending queue:N (local:X remote:Y)' so operators can tell whether the local pool is bottlenecked on its own RMWs/splits or on serving peer BatchAppend items. Both progress log call sites (AllFinished's periodic line and GetDBStats's on-demand line) updated to the same format with the remote out queueDepth / inflightChunks / walPendingItems context. Verified on 2-node insert_dominant: 0 empty-posting / merge-failed errors (was 450k), throughput 758-797/s (within noise of 710 baseline and 770 WAL run), recall 0.984-0.990. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Core/SPANN/Distributed/RemotePostingOps.h | 85 ++++++++++++++++--- .../inc/Core/SPANN/Distributed/WorkerNode.h | 6 ++ .../inc/Core/SPANN/ExtraDynamicSearcher.h | 58 ++++++++++--- 3 files changed, 127 insertions(+), 22 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h index 87c1ea87f..5be825651 100644 --- a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -389,16 +389,15 @@ namespace SPTAG::SPANN { // RemoteAppendChunkSize (default 3000). const size_t kChunkSize = std::max(1, (size_t)m_rpcChunkSize.load(std::memory_order_relaxed)); - const size_t total = items.size(); - size_t offset = 0; + size_t kept = 0; std::vector chunk; - chunk.reserve(std::min(kChunkSize, total)); + chunk.reserve(std::min(kChunkSize, items.size())); - while (offset < total) { - size_t end = std::min(offset + kChunkSize, total); + while (kept < items.size()) { + size_t end = std::min(kept + kChunkSize, items.size()); chunk.clear(); - chunk.reserve(end - offset); - for (size_t i = offset; i < end; ++i) { + chunk.reserve(end - kept); + for (size_t i = kept; i < end; ++i) { chunk.push_back(std::move(items[i])); } @@ -406,11 +405,28 @@ namespace SPTAG::SPANN { if (chunkRet != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "RemotePostingOps: Chunk send failed to node %d (offset=%zu/%zu, chunk=%zu items)\n", - targetNodeIndex, offset, total, end - offset); + targetNodeIndex, kept, items.size(), end - kept); + // Restore the moved-out items in [kept..end) from the + // still-populated chunk, then drop the already-sent + // prefix [0..kept) so retrying the caller sees only + // the unsent payload. Without this compaction, the + // auto-flush watchdog would resend already-successful + // items whose m_appendPosting/m_headVec strings are + // now empty (moved-out), and the receiver would log + // "empty append posting!" for each such phantom item. + for (size_t i = 0; i < chunk.size() && (kept + i) < items.size(); ++i) { + items[kept + i] = std::move(chunk[i]); + } + if (kept > 0) { + items.erase(items.begin(), items.begin() + kept); + } return chunkRet; } - offset = end; + kept = end; } + // All chunks sent successfully — fully drain the input so any + // caller-side retry sees an empty vector. + items.clear(); return ErrorCode::Success; } @@ -994,7 +1010,10 @@ namespace SPTAG::SPANN { } else { for (auto& s : m_jobSubmitters) { if (s) { sub = &s; break; } } } - if (sub) (*sub)(job); + if (sub) { + RemoteOriginPendingSlot(layer).fetch_add(1, std::memory_order_relaxed); + (*sub)(job); + } else { delete job; failCount->fetch_add(1); remaining->fetch_sub(1); } } } @@ -1492,6 +1511,14 @@ namespace SPTAG::SPANN { if (r == ErrorCode::Success) m_success->fetch_add(1); else m_fail->fetch_add(1); } + // Decrement per-layer remote-origin pool gauge for every + // completed item (paired with increment in SubmitBatchItems). + { + int layer = m_batchReq->m_items[m_index].m_layer; + auto& slot = m_ops->RemoteOriginPendingSlot(layer); + std::size_t prev = slot.fetch_sub(1, std::memory_order_relaxed); + if (prev == 0) slot.store(0, std::memory_order_relaxed); // saturating + } if (m_remaining->fetch_sub(1) == 1) { if (m_sendResponse && m_replyPacket) { m_ops->SendBatchAppendResponse( @@ -1543,7 +1570,45 @@ namespace SPTAG::SPANN { std::atomic m_walPendingItems{0}; std::atomic m_walPendingItemsCap{50000}; + // Per-layer count of items submitted to the local job pool that + // originated from a peer's BatchAppend RPC (covers BOTH the + // WAL-backed and legacy synchronous-ACK paths). Lets the periodic + // progress log split "pending queue" into local-origin RMWs vs + // remote-origin items so operators can tell whether the receiver + // is bottlenecked on its own inserts or on serving peers. Indexed + // by req.m_layer; sized lazily to max observed layer + 1. + mutable std::mutex m_remoteOriginPendingMutex; + std::vector> m_remoteOriginPending; + + std::atomic& RemoteOriginPendingSlot(int layer) { + if (layer < 0) layer = 0; + { + std::lock_guard g(m_remoteOriginPendingMutex); + if (static_cast(layer) >= m_remoteOriginPending.size()) { + std::vector> grown(layer + 1); + for (std::size_t i = 0; i < m_remoteOriginPending.size(); ++i) { + grown[i].store(m_remoteOriginPending[i].load(std::memory_order_relaxed), + std::memory_order_relaxed); + } + m_remoteOriginPending = std::move(grown); + } + } + return m_remoteOriginPending[layer]; + } + public: + std::size_t GetRemoteOriginPendingItems(int layer) const { + std::lock_guard g(m_remoteOriginPendingMutex); + if (layer < 0 || static_cast(layer) >= m_remoteOriginPending.size()) return 0; + return m_remoteOriginPending[layer].load(std::memory_order_relaxed); + } + // Aggregate across all layers (whole-node view). + std::size_t GetRemoteOriginPendingItems() const { + std::lock_guard g(m_remoteOriginPendingMutex); + std::size_t sum = 0; + for (auto& a : m_remoteOriginPending) sum += a.load(std::memory_order_relaxed); + return sum; + } void NoteWalPendingItemsDrained(std::size_t n) { if (n == 0) return; std::size_t prev = m_walPendingItems.fetch_sub(n, std::memory_order_relaxed); diff --git a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h index 7597f6955..b8fa36998 100644 --- a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h +++ b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h @@ -133,6 +133,12 @@ namespace SPTAG::SPANN { std::size_t GetBatchAppendWalPendingItems() const { return m_remoteOps.GetBatchAppendWalPendingItems(); } + std::size_t GetRemoteOriginPendingItems() const { + return m_remoteOps.GetRemoteOriginPendingItems(); + } + std::size_t GetRemoteOriginPendingItems(int layer) const { + return m_remoteOps.GetRemoteOriginPendingItems(layer); + } /// Atomically clear all RPC callbacks (every layer) and wait for any /// in-flight invocation to finish. void ClearCallbacks() { diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index b56e10812..c5d074afd 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -2289,12 +2289,19 @@ namespace SPTAG::SPANN { ErrorCode Append(ExtraWorkSpace* p_exWorkSpace, SizeType headID, int appendNum, std::string& appendPosting, int reassignThreshold = 0) { auto appendBegin = std::chrono::high_resolution_clock::now(); - if (appendPosting.empty()) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Error! empty append posting!\n"); - } - - if (appendNum == 0) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Error!, headID :%lld, appendNum:%d\n", (std::int64_t)headID, appendNum); + if (appendPosting.empty() || appendNum == 0) { + // Defensive: drop empty/zero-count appends rather than letting + // them reach the storage layer (which would log + // "TiKVIO::Merge: empty append posting!" and fail). Empty + // payloads should never be produced by normal flow, but they + // can arise from buggy sender-side retries that resend + // already-consumed (moved-from) items. + if (appendPosting.empty() && appendNum != 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "Append: dropping empty posting for headID=%lld appendNum=%d\n", + (std::int64_t)headID, appendNum); + } + return ErrorCode::Success; } // If this head is owned by a remote node, route the append via @@ -3729,19 +3736,30 @@ namespace SPTAG::SPANN { size_t remoteQ = 0, remoteTotal = 0; int remoteInflight = 0; std::size_t walPending = 0; + std::size_t remoteOriginPending = 0; if (m_worker) { remoteQ = m_worker->GetRemoteQueueSize(); remoteTotal = m_worker->GetTotalRemoteAppendsRouted(); remoteInflight = m_worker->GetInflightAppendFlushes(); walPending = m_worker->GetBatchAppendWalPendingItems(); - } + remoteOriginPending = m_worker->GetRemoteOriginPendingItems(m_layer); + } + // Split the local pool's pending queue into the portion + // serving peer-originated BatchAppend items vs the residual + // (local-origin RMWs, split/merge/reassign jobs). Helps + // operators distinguish "I'm bottlenecked applying remote + // work" from "my own inserts are backlogged". + size_t localPending = totalJobs > remoteOriginPending + ? totalJobs - remoteOriginPending + : 0; SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "layer %d pending queue:%zu split:%zu merge:%zu append:%zu reassign:%zu running:%u | " + "layer %d pending queue:%zu (local:%zu remote:%zu) split:%zu merge:%zu append:%zu reassign:%zu running:%u | " "total_submitted split:%zu merge:%zu reassign:%zu append:%zu | " "total_completed split:%zu merge:%zu reassign:%zu | " - "remote queueDepth:%zu inflightChunks:%d totalRouted:%zu walPendingItems:%zu | " + "remote out queueDepth:%zu inflightChunks:%d totalRouted:%zu walPendingItems:%zu | " "split_latency avg:%.1fms max:%.1fms\n", - m_layer, totalJobs, m_splitJobsInFlight.load(), + m_layer, totalJobs, localPending, remoteOriginPending, + m_splitJobsInFlight.load(), m_mergeJobsInFlight.load(), m_appendJobsInFlight.load(), m_reassignJobsInFlight.load(), runningJobs, m_totalSplitSubmitted.load(), m_totalMergeSubmitted.load(), m_totalReassignSubmitted.load(), m_totalAppendCount.load(), m_totalSplitCompleted.load(), m_totalMergeCompleted.load(), m_totalReassignCompleted.load(), @@ -3862,17 +3880,33 @@ namespace SPTAG::SPANN { double avgSplitMs = completedSplit > 0 ? (m_totalSplitTimeUs.load() / 1000.0 / completedSplit) : 0; double maxSplitMs = m_maxSplitTimeUs.load() / 1000.0; size_t totalJobs = m_splitThreadPool ? m_splitThreadPool->jobsize() : 0; + size_t remoteQ = 0, remoteTotal = 0; + int remoteInflight = 0; + std::size_t walPending = 0; + std::size_t remoteOriginPending = 0; + if (m_worker) { + remoteQ = m_worker->GetRemoteQueueSize(); + remoteTotal = m_worker->GetTotalRemoteAppendsRouted(); + remoteInflight = m_worker->GetInflightAppendFlushes(); + walPending = m_worker->GetBatchAppendWalPendingItems(); + remoteOriginPending = m_worker->GetRemoteOriginPendingItems(m_layer); + } + size_t localPending = totalJobs > remoteOriginPending + ? totalJobs - remoteOriginPending + : 0; // if (!ShouldLogProgress(totalJobs)) return; SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "layer %d pending queue:%zu split:%zu merge:%zu append:%zu reassign:%zu running:%u | " + "layer %d pending queue:%zu (local:%zu remote:%zu) split:%zu merge:%zu append:%zu reassign:%zu running:%u | " "total_submitted split:%zu merge:%zu reassign:%zu append:%zu | " "total_completed split:%zu merge:%zu reassign:%zu | " + "remote out queueDepth:%zu inflightChunks:%d totalRouted:%zu walPendingItems:%zu | " "split_latency avg:%.1fms max:%.1fms\n", - m_layer, totalJobs, + m_layer, totalJobs, localPending, remoteOriginPending, m_splitJobsInFlight.load(), m_mergeJobsInFlight.load(), m_appendJobsInFlight.load(), m_reassignJobsInFlight.load(), m_splitThreadPool ? static_cast(m_splitThreadPool->runningJobs()) : 0, m_totalSplitSubmitted.load(), m_totalMergeSubmitted.load(), m_totalReassignSubmitted.load(), m_totalAppendCount.load(), m_totalSplitCompleted.load(), m_totalMergeCompleted.load(), m_totalReassignCompleted.load(), + remoteQ, remoteInflight, remoteTotal, walPending, avgSplitMs, maxSplitMs); } From 3107dbcf65e8520151784f5ce73e0e993ba57114 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Fri, 22 May 2026 06:25:03 +0000 Subject: [PATCH 28/48] feat(distributed): classify async-job errors + exponential backoff retry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Background ---------- MergeAsyncJob and SplitAsyncJob previously retried every non-Success ErrorCode in a tight loop (re-enqueueing immediately on the same pool worker), capped at AsyncJobMaxRetry=3. This wasted pool slots on permanent failures (e.g. logical data inconsistencies that no retry can repair) and gave transient TiKV failures only ~no time to recover before exhausting the retry budget. Changes ------- - Add Distributed/DelayedJobScheduler.h: a single-threaded helper that re-enqueues ThreadPool::Job pointers to a target pool after a per-call delay. Owns pending jobs between Schedule() and dispatch; destructor joins the worker and deletes any undispatched jobs to avoid leaks on shutdown. Holds the target pool via shared_ptr so the scheduler can survive teardown ordering. - Add IsTransientAsyncJobError(ret) classifier. Transient: Fail, DiskIOFail, EmptyDiskIO, Socket_*. Permanent: everything else (Key_NotFound, Posting_*, Block_IDError, etc.). ErrorCode::Fail is intentionally transient because every TiKV failure path returns it; the rare logical-Fail callers (e.g. headVec-missing in MergePostings) pay a bounded number of wasted retries which is acceptable until a more specific code is introduced. - Add AsyncJobRetryBackoffMs(attempt): exponential backoff (200ms doubling, capped at 30s). - MergeAsyncJob and SplitAsyncJob now: * On transient + retry budget remaining → re-enqueue via the DelayedJobScheduler with exponential backoff (off the pool worker so we do not block a job slot during the wait). * On permanent → log Warning once, drop, do NOT poison m_asyncStatus (these are typically per-head local inconsistencies that the next caller-driven recovery handles, and surfacing them as process-wide failure was hiding real transient issues). * On transient with budget exhausted → keep the existing behaviour of setting m_asyncStatus + LL_Error, so a persistent outage still bubbles up. - Bump AsyncJobMaxRetry default 3 -> 8. With the new backoff schedule this gives ~25s total retry budget per job (200+400+800+1600+3200+ 6400+12800ms), enough to ride out a short TiKV region rebalance or network blip without the operator needing to override the config. The scheduler is lazily constructed on first retry, so single-node / build-only paths that never exercise async retries do not pay for an extra background thread. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../SPANN/Distributed/DelayedJobScheduler.h | 169 ++++++++++++++++++ .../inc/Core/SPANN/ExtraDynamicSearcher.h | 99 +++++++--- .../inc/Core/SPANN/ParameterDefinitionList.h | 2 +- 3 files changed, 244 insertions(+), 26 deletions(-) create mode 100644 AnnService/inc/Core/SPANN/Distributed/DelayedJobScheduler.h diff --git a/AnnService/inc/Core/SPANN/Distributed/DelayedJobScheduler.h b/AnnService/inc/Core/SPANN/Distributed/DelayedJobScheduler.h new file mode 100644 index 000000000..9661439fd --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/DelayedJobScheduler.h @@ -0,0 +1,169 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef _SPTAG_SPANN_DISTRIBUTED_DELAYEDJOBSCHEDULER_H_ +#define _SPTAG_SPANN_DISTRIBUTED_DELAYEDJOBSCHEDULER_H_ + +#include "inc/Helper/Concurrent.h" +#include "inc/Helper/ThreadPool.h" +#include "inc/Helper/Logging.h" +#include "inc/Core/Common.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace SPTAG { +namespace SPANN { +namespace Distributed { + +// DelayedJobScheduler runs a single worker thread that re-enqueues +// previously-failed ThreadPool jobs after an exponential backoff. It +// exists so async Merge/Split job retries can wait before retrying +// (instead of busy-spinning the pool worker) without blocking any actual +// pool slot during the wait. +// +// Jobs are owned by the scheduler between Schedule() and the moment they +// are transferred to the destination pool. If the scheduler is destroyed +// while jobs are still pending (e.g. process shutdown), the destructor +// drains the heap and deletes every undispatched job so the Helper:: +// ThreadPool::Job allocations do not leak. +// +// The destination pool is held via shared_ptr so the scheduler can survive +// teardown ordering — the pool stays alive as long as either the scheduler +// or the original owner still holds a reference. +class DelayedJobScheduler { +public: + DelayedJobScheduler() : m_stop(false) { + m_worker = std::thread([this] { Loop(); }); + } + + ~DelayedJobScheduler() { + { + std::lock_guard g(m_mu); + m_stop = true; + } + m_cv.notify_all(); + if (m_worker.joinable()) m_worker.join(); + std::lock_guard g(m_mu); + for (auto& e : m_heap) { + if (e.job) delete e.job; + } + m_heap.clear(); + } + + // Take ownership of `job` and add it to `pool` after `delayMs`. + // `pool` must be non-null; `job` must be non-null and not already + // queued anywhere. + void Schedule(std::shared_ptr pool, + Helper::ThreadPool::Job* job, int delayMs) { + if (!pool || !job) { if (job) delete job; return; } + Entry e; + e.deadline = std::chrono::steady_clock::now() + + std::chrono::milliseconds(delayMs); + e.pool = std::move(pool); + e.job = job; + { + std::lock_guard g(m_mu); + m_heap.push_back(std::move(e)); + std::push_heap(m_heap.begin(), m_heap.end(), Cmp{}); + } + m_cv.notify_all(); + } + + std::size_t Pending() const { + std::lock_guard g(m_mu); + return m_heap.size(); + } + +private: + struct Entry { + std::chrono::steady_clock::time_point deadline; + std::shared_ptr pool; + Helper::ThreadPool::Job* job = nullptr; + }; + struct Cmp { + bool operator()(const Entry& a, const Entry& b) const { + return a.deadline > b.deadline; + } + }; + + void Loop() { + std::unique_lock lk(m_mu); + while (!m_stop) { + if (m_heap.empty()) { + m_cv.wait(lk); + continue; + } + auto now = std::chrono::steady_clock::now(); + if (m_heap.front().deadline <= now) { + Entry e = std::move(m_heap.front()); + std::pop_heap(m_heap.begin(), m_heap.end(), Cmp{}); + m_heap.pop_back(); + lk.unlock(); + if (e.pool) { + e.pool->add(e.job); + } else if (e.job) { + delete e.job; + } + lk.lock(); + continue; + } + m_cv.wait_until(lk, m_heap.front().deadline); + } + } + + mutable std::mutex m_mu; + std::condition_variable m_cv; + std::vector m_heap; + bool m_stop; + std::thread m_worker; +}; + +// Classify an async-job failure into transient (retry with backoff) +// vs permanent (drop with warning). Transient codes capture TiKV / IO +// errors that should clear on a later attempt; permanent codes capture +// logical inconsistencies (e.g. a vector ID outside the version map, +// a posting whose serialized header is malformed) that no number of +// retries can repair. +// +// ErrorCode::Fail is intentionally classified transient: every TiKV +// failure path in ExtraTiKVController returns Fail, and the few logical +// callers that also return Fail (e.g. MergePostings when the head vector +// is missing from its own posting) are rare enough that a bounded number +// of wasted retries is acceptable. If a more specific ErrorCode value +// becomes available for the logical case, demote those returns there +// and remove Fail from the transient set. +inline bool IsTransientAsyncJobError(ErrorCode ret) { + switch (ret) { + case ErrorCode::Fail: + case ErrorCode::DiskIOFail: + case ErrorCode::EmptyDiskIO: + case ErrorCode::Socket_FailedConnectToEndPoint: + case ErrorCode::Socket_FailedResolveEndPoint: + return true; + default: + return false; + } +} + +// Exponential backoff with a cap. `attempt` is 0-based (0 = first retry). +inline int AsyncJobRetryBackoffMs(int attempt, + int initialMs = 200, + int capMs = 30000) { + if (attempt < 0) attempt = 0; + if (attempt > 20) attempt = 20; + long long delay = (long long)initialMs << attempt; + if (delay > capMs) delay = capMs; + return (int)delay; +} + +} // namespace Distributed +} // namespace SPANN +} // namespace SPTAG + +#endif // _SPTAG_SPANN_DISTRIBUTED_DELAYEDJOBSCHEDULER_H_ diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index c5d074afd..6f190f197 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -24,6 +24,7 @@ #include "Distributed/HeadSyncLog.h" #include "Distributed/SplitWAL.h" #include "Distributed/BatchAppendWAL.h" +#include "Distributed/DelayedJobScheduler.h" #include #include #include @@ -79,30 +80,54 @@ namespace SPTAG::SPANN { inline void exec(void* p_workSpace, IAbortOperation* p_abort) override { ErrorCode ret = m_extraIndex->MergePostings((ExtraWorkSpace*)p_workSpace, m_headID); if (ret != ErrorCode::Success) { + // Classify before retrying: transient errors (TiKV + // region_error, timeout, generic Fail from the IO + // layer) deserve a bounded retry with exponential + // backoff; permanent errors (data inconsistency, + // unknown ErrorCode) cannot be repaired by retry and + // get dropped with a warning so we don't burn + // pool slots in a hot fail loop. int maxRetry = m_extraIndex->m_opt ? m_extraIndex->m_opt->m_asyncJobMaxRetry : 0; - if (m_attempts + 1 < maxRetry) { + bool transient = Distributed::IsTransientAsyncJobError(ret); + if (transient && m_attempts + 1 < maxRetry) { // Async-job fault-tolerance contract: merges are // safe to retry idempotently (the owner check, the // ContainSample liveness gate, and the locked RMW // all re-evaluate on each attempt). Enqueue a - // fresh Job carrying the bumped attempt count — - // the ThreadPool worker will `delete` *this* after - // we return, so we cannot re-add the same pointer. - // Keep m_mergeJobsInFlight unchanged: the new job + // fresh Job carrying the bumped attempt count via + // the delayed-retry scheduler so backoff happens + // OFF the pool worker — the ThreadPool worker + // will `delete` *this* after we return, so we + // cannot re-add the same pointer. Keep + // m_mergeJobsInFlight unchanged: the new job // takes ownership of the in-flight slot. + int backoffMs = Distributed::AsyncJobRetryBackoffMs(m_attempts); SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "MergeAsyncJob: head=%lld attempt=%d failed ret=%d, re-enqueueing\n", - (std::int64_t)m_headID, m_attempts + 1, (int)ret); + "MergeAsyncJob: head=%lld attempt=%d failed ret=%d (transient), backoff=%dms\n", + (std::int64_t)m_headID, m_attempts + 1, (int)ret, backoffMs); auto* retryJob = new MergeAsyncJob(m_extraIndex, m_headID, m_callback); retryJob->m_attempts = m_attempts + 1; - m_extraIndex->m_splitThreadPool->add(retryJob); + m_extraIndex->GetOrCreateDelayedRetryScheduler().Schedule( + m_extraIndex->m_splitThreadPool, retryJob, backoffMs); return; } - m_extraIndex->m_asyncStatus = ret; - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "MergeAsyncJob: head=%lld giving up after %d attempts ret=%d\n", - (std::int64_t)m_headID, m_attempts + 1, (int)ret); + if (!transient) { + // Permanent: log once and drop. Do not promote to + // m_asyncStatus — these are usually local data + // inconsistencies (e.g. version skew) that the + // next caller-driven recovery will repair, and + // poisoning m_asyncStatus would surface them as + // a process-wide failure. + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "MergeAsyncJob: head=%lld permanent failure ret=%d, dropping\n", + (std::int64_t)m_headID, (int)ret); + } else { + m_extraIndex->m_asyncStatus = ret; + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "MergeAsyncJob: head=%lld giving up after %d attempts ret=%d (transient exhausted)\n", + (std::int64_t)m_headID, m_attempts + 1, (int)ret); + } } m_extraIndex->m_mergeJobsInFlight--; m_extraIndex->m_totalMergeCompleted++; @@ -136,26 +161,34 @@ namespace SPTAG::SPANN { uint64_t prevMax = m_extraIndex->m_maxSplitTimeUs.load(); while (elapsedUs > prevMax && !m_extraIndex->m_maxSplitTimeUs.compare_exchange_weak(prevMax, elapsedUs)); if (ret != ErrorCode::Success) { + // Same classification scheme as MergeAsyncJob. + // Splits are designed safe to retry idempotently + // (read-deduplicate during the next attempt handles + // partial writes from a previously-crashed attempt). int maxRetry = m_extraIndex->m_opt ? m_extraIndex->m_opt->m_asyncJobMaxRetry : 0; - if (m_attempts + 1 < maxRetry) { - // See MergeAsyncJob: splits are designed safe to - // retry from any compute node (read-deduplicate - // during the next attempt handles partial writes). - // Enqueue a fresh Job — the ThreadPool worker will - // `delete` *this* after we return. + bool transient = Distributed::IsTransientAsyncJobError(ret); + if (transient && m_attempts + 1 < maxRetry) { + int backoffMs = Distributed::AsyncJobRetryBackoffMs(m_attempts); SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "SplitAsyncJob: head=%lld attempt=%d failed ret=%d, re-enqueueing\n", - (std::int64_t)m_headID, m_attempts + 1, (int)ret); + "SplitAsyncJob: head=%lld attempt=%d failed ret=%d (transient), backoff=%dms\n", + (std::int64_t)m_headID, m_attempts + 1, (int)ret, backoffMs); auto* retryJob = new SplitAsyncJob(m_extraIndex, m_headID, m_callback); retryJob->m_attempts = m_attempts + 1; - m_extraIndex->m_splitThreadPool->add(retryJob); + m_extraIndex->GetOrCreateDelayedRetryScheduler().Schedule( + m_extraIndex->m_splitThreadPool, retryJob, backoffMs); return; } - m_extraIndex->m_asyncStatus = ret; - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "SplitAsyncJob: head=%lld giving up after %d attempts ret=%d\n", - (std::int64_t)m_headID, m_attempts + 1, (int)ret); + if (!transient) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "SplitAsyncJob: head=%lld permanent failure ret=%d, dropping\n", + (std::int64_t)m_headID, (int)ret); + } else { + m_extraIndex->m_asyncStatus = ret; + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "SplitAsyncJob: head=%lld giving up after %d attempts ret=%d (transient exhausted)\n", + (std::int64_t)m_headID, m_attempts + 1, (int)ret); + } } m_extraIndex->m_splitJobsInFlight--; m_extraIndex->m_totalSplitCompleted++; @@ -4027,6 +4060,22 @@ namespace SPTAG::SPANN { std::shared_ptr m_splitThreadPool; std::shared_ptr m_reassignThreadPool; + + // Single-threaded scheduler used by MergeAsyncJob / SplitAsyncJob + // to re-enqueue retries after exponential backoff (transient + // TiKV/IO failures). Lazily created on first retry to avoid the + // worker thread in single-node / build-only paths that never + // exercise async retries. + std::mutex m_delayedRetrySchedulerMutex; + std::unique_ptr m_delayedRetryScheduler; + + Distributed::DelayedJobScheduler& GetOrCreateDelayedRetryScheduler() { + std::lock_guard g(m_delayedRetrySchedulerMutex); + if (!m_delayedRetryScheduler) { + m_delayedRetryScheduler.reset(new Distributed::DelayedJobScheduler()); + } + return *m_delayedRetryScheduler; + } }; } // namespace SPTAG #endif // _SPTAG_SPANN_EXTRADYNAMICSEARCHER_H_ diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index 73f7c9a48..4431460cf 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -136,7 +136,7 @@ DefineSSDParameter(m_remoteAppendTimeoutSec, int, 180, "RemoteAppendTimeoutSec") // MaxInflight=8 (was 4): keeps the receiver's 16-thread BatchAppendItemJob pool // well-fed even when one chunk straggles on lock contention. DefineSSDParameter(m_remoteAppendMaxInflight, int, 8, "RemoteAppendMaxInflight") -DefineSSDParameter(m_asyncJobMaxRetry, int, 3, "AsyncJobMaxRetry") +DefineSSDParameter(m_asyncJobMaxRetry, int, 8, "AsyncJobMaxRetry") DefineSSDParameter(m_remoteLockTtlMs, int, 30000, "RemoteLockTtlMs") // GPU Building From 19ba298975ce8a7ed814e7f07c93c8f6fb555d61 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Fri, 22 May 2026 08:37:40 +0000 Subject: [PATCH 29/48] perf(distributed): receiver-side batched BatchAppend + fix resurrection bug Five tightly related changes that together raised insert_dominant 2-node throughput from 758/s to 1039/s (+37%) while preserving recall=0.98. 1) Receiver-side BatchAppend fast path Previously, an incoming N-item BatchAppend RPC was unpacked into N separate per-item Jobs, each calling Append() -> db->Merge once. At ChunkSize=10k that meant 10k pool jobs and 10k Merge round-trips per RPC, which saturated the receiver pool and made it the dominant bottleneck (peerOrig pending queue routinely sat above 100k). The new BatchAppendCallback (registered alongside the existing AppendCallback) takes a vector covering an entire layer's worth of items in one RPC. The receiver groups item indices by layer and dispatches ONE BatchAppendLayerJob per layer; that job runs Phase 1 (per-item HandleRaceCondition + resurrection refusal + versionMap mirror) then Phase 2 (group surviving items by headID and call BatchAppend()/db->MultiMerge() ONCE). This matches the local AddIndex fast path's I/O profile. Falls back to the legacy per-item path if a layer has no batch callback registered (early bring-up, partial reload). 2) Fix HandleRaceCondition resurrection bug HandleRaceCondition() previously acquired-and-released the head's RWLock without telling the caller whether a structural op had actually occurred. AppendCallback then unconditionally resurrected missing heads via AddHeadIndex, which could bring back a head a concurrent merge had just deleted. Fix: - HandleRaceCondition() now returns bool observedStructural. - AppendCallback refuses to resurrect when wasMissing && observedStructural, returning ErrorCode::Fail (transient). The sender's retry will re-resolve the owner after HeadSync Delete propagates. 3) Broadcast HeadSync Delete on Merge Split already broadcast HeadSync Delete for losers; MergePostings did not. Without the broadcast, peer compute nodes' head indices kept routing BatchAppend to the deleted head, triggering the resurrection bug. MergePostings now tracks deletedHeadVID in both loser branches and broadcasts after lock release (skipped when the layer is disk-backed, since TiKV is the source of truth there). 4) Auto-size WAL admission cap from ChunkSize x MaxInflight The receiver's WAL pending-items cap was hardcoded at 50k. When ChunkSize was raised to test 50k, a single chunk immediately tripped the cap and forced every chunk down the slow synchronous- ACK path (chunks timing out at the 180s RPC deadline). ExtraDynamicSearcher::SetWorker now derives the cap as ChunkSize * MaxInflight * 2 from the SPANN options, so the cap scales with the configured in-flight window. Default ChunkSize bumped 10000 -> 20000 (the receiver-side batched path makes the per-Merge fixed cost much cheaper, so larger chunks amortize the network roundtrip better without inflating the receiver pool depth). 5) Simplify ownership filtering Remove duplicate IsRemoteOwnedHead() body-side checks in Split() and MergePostings(). The single authoritative gate lives in SplitAsync()/MergeAsync(); the hash ring is static after init and only layer 0 routes anyway, so the body re-check was dead code. Saves one GetOwner() per executed Split/Merge job. Diagnostics: - progress log split: 'pending queue local/remote' relabeled to selfOrig/peerOrig, with clarifying comment that selfOrig=0 is expected (local-owned items bypass the pool via synchronous MultiMerge) and peerOrig is what the receiver-side work counts. - new addIndex route counters track heads(local:X remote:Y) items(local:I remote:J) in BatchAppend's TryRouteRemoteAppend decision, surfacing ownership skew in the progress log. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Core/SPANN/Distributed/RemotePostingOps.h | 212 +++++++++++- .../inc/Core/SPANN/Distributed/WorkerNode.h | 2 + .../inc/Core/SPANN/ExtraDynamicSearcher.h | 306 +++++++++++++++--- .../inc/Core/SPANN/ParameterDefinitionList.h | 13 +- 4 files changed, 470 insertions(+), 63 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h index 5be825651..fd4c607a2 100644 --- a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -52,6 +52,19 @@ namespace SPTAG::SPANN { int appendNum, std::string& appendPosting)>; + // Receiver-side batched callback: deliver a whole BatchRemoteAppend + // request to the searcher so it can group items by head and call + // its native BatchAppend / db->MultiMerge path with ONE TiKV op + // covering N items, instead of unpacking into N pool jobs that + // each issue an individual Merge. Mirrors the local AddIndex + // path which already batches. outSuccess and outFail accumulate + // per-item results so the caller can ACK with the same shape as + // the legacy per-item path. + using BatchAppendCallback = std::function& items, + std::uint32_t& outSuccess, + std::uint32_t& outFail)>; + using HeadSyncCallback = std::function; // RemoteLockCallback: // For Lock op: token argument is 0; returns issued fencing token @@ -207,6 +220,14 @@ namespace SPTAG::SPANN { EnsureLayerSlot_NoLock(layer); m_appendCallbacks[layer] = std::move(cb); } + void SetBatchAppendCallback(int layer, BatchAppendCallback cb) { + std::unique_lock lk(m_callbackLifetimeMutex); + EnsureLayerSlot_NoLock(layer); + if (m_batchAppendCallbacks.size() < static_cast(layer) + 1) { + m_batchAppendCallbacks.resize(layer + 1); + } + m_batchAppendCallbacks[layer] = std::move(cb); + } void SetHeadSyncCallback(int layer, HeadSyncCallback cb) { std::unique_lock lk(m_callbackLifetimeMutex); EnsureLayerSlot_NoLock(layer); @@ -239,6 +260,7 @@ namespace SPTAG::SPANN { void ClearCallbacks() { std::unique_lock lk(m_callbackLifetimeMutex); m_appendCallbacks.clear(); + m_batchAppendCallbacks.clear(); m_headSyncCallbacks.clear(); m_remoteLockCallbacks.clear(); m_mergeCallbacks.clear(); @@ -269,6 +291,9 @@ namespace SPTAG::SPANN { return false; } m_appendCallbacks[layer] = nullptr; + if (layer >= 0 && static_cast(layer) < m_batchAppendCallbacks.size()) { + m_batchAppendCallbacks[layer] = nullptr; + } m_headSyncCallbacks[layer] = nullptr; m_remoteLockCallbacks[layer] = nullptr; if (layer >= 0 && static_cast(layer) < m_mergeCallbacks.size()) { @@ -287,6 +312,11 @@ namespace SPTAG::SPANN { const auto& cb = m_appendCallbacks[layer]; return cb ? &cb : nullptr; } + const BatchAppendCallback* LookupBatchAppendCallback_Locked(int layer) const { + if (layer < 0 || static_cast(layer) >= m_batchAppendCallbacks.size()) return nullptr; + const auto& cb = m_batchAppendCallbacks[layer]; + return cb ? &cb : nullptr; + } const HeadSyncCallback* LookupHeadSyncCallback_Locked(int layer) const { if (layer < 0 || static_cast(layer) >= m_headSyncCallbacks.size()) return nullptr; const auto& cb = m_headSyncCallbacks[layer]; @@ -957,11 +987,19 @@ namespace SPTAG::SPANN { /*sendResponse=*/true, packetPtr); } - // Submit each item of `batchReq` to its per-layer job submitter. - // If sendResponse is true, the last completing job ACKs the sender - // via ackPacket. If sendResponse is false (WAL-backed path or - // crash recovery), the last completing job deletes the WAL entry - // identified by `batchID`. + // Submit a BatchAppend request to the local pool for processing. + // Two paths: + // * Batched (preferred): if the searcher registered a + // BatchAppendCallback for the request's layer, dispatch ONE + // Job per layer covering all items for that layer. The + // callback groups by headID and issues db->MultiMerge once, + // matching the local AddIndex throughput profile. + // * Per-item (fallback): legacy path used when no batch + // callback is registered. Creates one Job per item and the + // last one ACKs. + // If sendResponse is true, the LAST completing Job ACKs the + // sender via ackPacket; if false (WAL-backed path), the last Job + // deletes the WAL entry identified by `batchID` instead. void SubmitBatchItems(std::shared_ptr batchReq, std::uint64_t batchID, bool sendResponse, @@ -971,7 +1009,6 @@ namespace SPTAG::SPANN { if (sendResponse && ackPacket) SendBatchAppendResponse(*ackPacket, 0, 0); return; } - auto remaining = std::make_shared>(total); auto successCount = std::make_shared>(0); auto failCount = std::make_shared>(0); @@ -998,6 +1035,75 @@ namespace SPTAG::SPANN { return; } + // Group item indices by layer. We need the layer split because + // each layer has its own job submitter and its own searcher's + // batch callback. Within a layer all items go to one Job. + std::unordered_map> byLayer; + byLayer.reserve(4); + for (size_t i = 0; i < total; ++i) { + byLayer[batchReq->m_items[i].m_layer].push_back(i); + } + + // Check whether every layer in this request has a batch + // callback registered. If even one is missing we fall back to + // the per-item path for the whole request to keep the + // success/fail accounting consistent with the legacy ACK + // shape (one fetch_add per item). + bool allBatchable = true; + { + std::shared_lock cbLock(m_callbackLifetimeMutex); + for (const auto& kv : byLayer) { + if (!LookupBatchAppendCallback_Locked(kv.first)) { + allBatchable = false; + break; + } + } + } + + if (allBatchable) { + auto remainingLayers = std::make_shared>(byLayer.size()); + for (auto& kv : byLayer) { + int layer = kv.first; + const JobSubmitter* sub = nullptr; + if (layer >= 0 && static_cast(layer) < m_jobSubmitters.size() + && m_jobSubmitters[layer]) { + sub = &m_jobSubmitters[layer]; + } else { + for (auto& s : m_jobSubmitters) { if (s) { sub = &s; break; } } + } + if (sub) { + // Per-layer gauge: this Job represents + // kv.second.size() peer-origin items even though + // it's a single Job. Match item count, not Job + // count, so the gauge stays comparable to the + // per-item-path value. + RemoteOriginPendingSlot(layer).fetch_add(kv.second.size(), + std::memory_order_relaxed); + auto* job = new BatchAppendLayerJob( + this, batchReq, std::move(kv.second), layer, + remainingLayers, successCount, failCount, + ackPacket, sendResponse, batchID); + (*sub)(job); + } else { + failCount->fetch_add(kv.second.size()); + if (remainingLayers->fetch_sub(1) == 1) { + if (sendResponse && ackPacket) { + SendBatchAppendResponse(*ackPacket, + successCount->load(), failCount->load()); + } else if (batchID != 0) { + auto wal = GetBatchAppendWAL(); + if (wal) wal->Delete(batchID); + NoteWalPendingItemsDrained(batchReq->m_items.size()); + } + } + } + } + return; + } + + // Fallback: per-item path (legacy). Used until a searcher + // installs the batch callback (e.g. during early bring-up). + auto remaining = std::make_shared>(total); for (size_t i = 0; i < total; i++) { auto* job = new BatchAppendItemJob( this, batchReq, i, remaining, successCount, failCount, @@ -1435,6 +1541,7 @@ namespace SPTAG::SPANN { // by request.m_layer is required to avoid routing layer-0 events to // layer-1's storage and vice versa. std::vector m_appendCallbacks; + std::vector m_batchAppendCallbacks; std::vector m_headSyncCallbacks; std::vector m_remoteLockCallbacks; std::vector m_mergeCallbacks; @@ -1466,6 +1573,99 @@ namespace SPTAG::SPANN { std::mutex m_pendingLockTokensMutex; std::unordered_map m_pendingLockTokens; + // Per-LAYER Job: a single Job processes ALL items for one layer + // from a BatchRemoteAppend RPC. Calls the searcher's batched + // callback (BatchAppendCallback) which groups items by headID and + // issues ONE db->MultiMerge instead of N individual Merges -- + // mirrors the local AddIndex BatchAppend path so receiver-side + // throughput matches sender-side. Replaces the legacy + // BatchAppendItemJob fan-out (one Job per item) when the searcher + // has registered a batch callback; otherwise the per-item path is + // still used as a fallback. + class BatchAppendLayerJob : public Helper::ThreadPool::Job { + public: + BatchAppendLayerJob(RemotePostingOps* ops, + std::shared_ptr batchReq, + std::vector indices, + int layer, + std::shared_ptr> remainingLayers, + std::shared_ptr> successCount, + std::shared_ptr> failCount, + std::shared_ptr replyPacket, + bool sendResponse, + std::uint64_t batchID) + : m_ops(ops), m_batchReq(std::move(batchReq)), + m_indices(std::move(indices)), m_layer(layer), + m_remaining(std::move(remainingLayers)), + m_success(std::move(successCount)), + m_fail(std::move(failCount)), + m_replyPacket(std::move(replyPacket)), + m_sendResponse(sendResponse), + m_batchID(batchID) {} + + void exec(IAbortOperation*) override { run(); } + void exec(void* workspace, IAbortOperation*) override { + void* prev = tls_preallocAppendWorkSpace; + tls_preallocAppendWorkSpace = workspace; + run(); + tls_preallocAppendWorkSpace = prev; + } + + private: + void run() { + std::vector items; + items.reserve(m_indices.size()); + for (size_t idx : m_indices) { + items.push_back(&m_batchReq->m_items[idx]); + } + + std::uint32_t succ = 0, fail = 0; + { + std::shared_lock cbLock(m_ops->m_callbackLifetimeMutex); + const auto* cb = m_ops->LookupBatchAppendCallback_Locked(m_layer); + if (cb) { + (*cb)(items, succ, fail); + } else { + // Searcher detached between dispatch and run; mark + // everything as failed so the sender can retry. + fail = static_cast(items.size()); + } + } + m_success->fetch_add(succ); + m_fail->fetch_add(fail); + // Decrement per-layer remote-origin gauge by the count of + // items this job represents (paired with the matching + // increment in SubmitBatchItems). + { + auto& slot = m_ops->RemoteOriginPendingSlot(m_layer); + std::size_t toSub = m_indices.size(); + std::size_t prev = slot.fetch_sub(toSub, std::memory_order_relaxed); + if (prev < toSub) slot.store(0, std::memory_order_relaxed); + } + if (m_remaining->fetch_sub(1) == 1) { + if (m_sendResponse && m_replyPacket) { + m_ops->SendBatchAppendResponse( + *m_replyPacket, m_success->load(), m_fail->load()); + } else if (m_batchID != 0) { + auto wal = m_ops->GetBatchAppendWAL(); + if (wal) wal->Delete(m_batchID); + m_ops->NoteWalPendingItemsDrained(m_batchReq->m_items.size()); + } + } + } + + RemotePostingOps* m_ops; + std::shared_ptr m_batchReq; + std::vector m_indices; + int m_layer; + std::shared_ptr> m_remaining; + std::shared_ptr> m_success; + std::shared_ptr> m_fail; + std::shared_ptr m_replyPacket; + bool m_sendResponse; + std::uint64_t m_batchID; + }; + // Per-item Job: each remote append request becomes one Job submitted // to the searcher's shared SPDKThreadPool. The last completing Job // ACKs the sender. Identical to how a local insert thread would call diff --git a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h index b8fa36998..116b6c25f 100644 --- a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h +++ b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h @@ -35,6 +35,7 @@ namespace SPTAG::SPANN { class WorkerNode : public NetworkNode { public: using AppendCallback = RemotePostingOps::AppendCallback; + using BatchAppendCallback = RemotePostingOps::BatchAppendCallback; using DispatchCallback = DispatchCoordinator::DispatchCallback; using HeadSyncCallback = RemotePostingOps::HeadSyncCallback; using RemoteLockCallback = RemotePostingOps::RemoteLockCallback; @@ -110,6 +111,7 @@ namespace SPTAG::SPANN { // request.m_layer. void SetAppendCallback(int layer, AppendCallback cb) { m_remoteOps.SetAppendCallback(layer, std::move(cb)); } + void SetBatchAppendCallback(int layer, BatchAppendCallback cb) { m_remoteOps.SetBatchAppendCallback(layer, std::move(cb)); } void SetHeadSyncCallback(int layer, HeadSyncCallback cb) { m_remoteOps.SetHeadSyncCallback(layer, std::move(cb)); } void SetRemoteLockCallback(int layer, RemoteLockCallback cb) { m_remoteOps.SetRemoteLockCallback(layer, std::move(cb)); } void SetFenceValidator(int layer, FenceValidator cb) { m_remoteOps.SetFenceValidator(layer, std::move(cb)); } diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 6f190f197..fefd20b24 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -290,18 +290,16 @@ namespace SPTAG::SPANN { } }; - private: - std::atomic m_workspaceCount = 0; - - std::shared_ptr db; - WorkerNode* m_worker = nullptr; // externally owned, set via SetWorker() - public: // Expose the underlying KV handle so a standalone WorkerNode can be wired to the // same DB this searcher already opened, instead of opening a second one. std::shared_ptr GetDB() const { return db; } private: + std::atomic m_workspaceCount = 0; + std::shared_ptr db; + WorkerNode* m_worker = nullptr; // externally owned, set via SetWorker() + SPANN::Index* m_headIndex; std::unique_ptr m_versionMap; Options* m_opt; @@ -321,11 +319,7 @@ namespace SPTAG::SPANN { // Distributed/HeadSyncLog.h and Distributed/SplitWAL.h. std::unique_ptr m_headSyncLog; std::unique_ptr m_splitWAL; - // Receiver-side Batch WAL for cross-owner BatchAppend (Option A). - // Owned by each layer's searcher but the same TiKV cluster, keyed - // by receiver node index; layer-0 is the only owner that wires it - // into the WorkerNode (only one WAL per receiver, regardless of - // how many layers exist). + // Receiver-side Batch WAL for cross-owner BatchAppend std::shared_ptr m_batchAppendWAL; std::atomic m_splitJobIdCounter{ 0 }; @@ -352,6 +346,15 @@ namespace SPTAG::SPANN { std::atomic_size_t m_totalAppendCompleted{ 0 }; std::atomic_size_t m_totalAppendCount{ 0 }; + // Routing counters for local AddIndex calls so we can verify + // GetOwner is partitioning work evenly. Incremented in + // BatchAppend()/Append() based on whether TryRouteRemoteAppend + // shipped the head to a peer or it stayed local. + std::atomic_size_t m_routedLocalHeads{ 0 }; + std::atomic_size_t m_routedRemoteHeads{ 0 }; + std::atomic_size_t m_routedLocalItems{ 0 }; + std::atomic_size_t m_routedRemoteItems{ 0 }; + std::atomic_size_t m_reassignJobsInFlight{ 0 }; std::atomic_size_t m_totalReassignSubmitted{ 0 }; std::atomic_size_t m_totalReassignCompleted{ 0 }; @@ -504,7 +507,14 @@ namespace SPTAG::SPANN { // broadcast), the callback re-checks ContainSample with a stable // view. When the head is genuinely gone, sender retries against // the updated head index and routes to the new owner. - void HandleRaceCondition(SizeType headID) { + // + // Returns true if a structural op was observed (the head was in + // m_splitList or m_mergeList at check time). The AppendCallback + // uses this to refuse resurrecting a head that was likely just + // deleted by the wait-on-RWLock'd structural op: resurrecting + // would race against the merge's HeadSync Delete broadcast and + // leave a zombie head until the next merge round drops it again. + bool HandleRaceCondition(SizeType headID) { bool inSplit = false, inMerge = false; { std::shared_lock sl(m_splitListLock); @@ -514,11 +524,12 @@ namespace SPTAG::SPANN { std::shared_lock sl(m_mergeListLock); inMerge = (m_mergeList.find(headID) != m_mergeList.end()); } - if (!inSplit && !inMerge) return; + if (!inSplit && !inMerge) return false; // Wait until the structural op releases the per-head RWLock. // Acquire-and-immediately-release; the Append below re-locks. std::unique_lock w(m_rwLocks[headID]); (void)w; + return true; } // SPDKThreadPool. Called both after pool creation and from @@ -548,6 +559,13 @@ namespace SPTAG::SPANN { m_worker->SetRpcRetry(m_opt->m_remoteAppendRetry); m_worker->SetRpcTimeoutSec(m_opt->m_remoteAppendTimeoutSec); m_worker->SetRpcMaxInflightPerNode(m_opt->m_remoteAppendMaxInflight); + // Size the receiver's WAL admission cap so a normal in-flight + // window (ChunkSize × MaxInflight) fits before backpressure + // engages. A too-low cap forces every chunk down the slow + // synchronous-ACK path; too-high removes the safety net. + const std::size_t chunk = (std::size_t)std::max(1, m_opt->m_remoteAppendChunkSize); + const std::size_t inflight = (std::size_t)std::max(1, m_opt->m_remoteAppendMaxInflight); + m_worker->SetBatchAppendWalPendingItemsCap(chunk * inflight * 2); } // Initialize durable HeadSync log + SplitWAL once we know the @@ -588,7 +606,7 @@ namespace SPTAG::SPANN { // the head index. Otherwise the wasMissing branch // below can resurrect a head that the structural op // just deleted. - HandleRaceCondition(headID); + bool observedStructural = HandleRaceCondition(headID); // Reuse SPDKThreadPool's per-worker pre-allocated workspace // when called from BatchAppendItemJob on m_splitThreadPool. @@ -599,6 +617,21 @@ namespace SPTAG::SPANN { ws = &localWorkSpace; } bool wasMissing = !m_headIndex->ContainSample(headID, m_layer + 1); + if (wasMissing && observedStructural) { + // We waited for an in-flight Split/Merge and the + // head is gone afterwards -- the structural op + // deleted it on purpose. Resurrecting via + // AddHeadIndex would race the structural op's + // HeadSync Delete broadcast and leave a zombie + // head until the next merge round drops it again. + // Refuse the append; the sender's retry path will + // re-resolve once HeadSync propagates the + // deletion to its head index. + SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, + "AppendCallback: head=%lld deleted by local structural op; refusing resurrection\n", + (std::int64_t)headID); + return ErrorCode::Fail; + } if (wasMissing && headVec && !headVec->empty()) { DimensionType dim = static_cast( headVec->size() / sizeof(ValueType)); @@ -638,6 +671,111 @@ namespace SPTAG::SPANN { return Append(ws, headID, appendNum, appendPosting, 0); }); + // Batch append callback: receiver-side fast path. Replaces + // the per-item job fan-out with a single Job per layer that + // groups items by headID and issues ONE db->MultiMerge, + // matching the local AddIndex BatchAppend throughput profile. + // Without this, a single 10k-item peer RPC inflates the + // receiver's pool by 10k jobs and 10k Merge calls -- the + // dominant receiver-side bottleneck observed in 2-node tests. + m_worker->SetBatchAppendCallback(m_layer, + [this](std::vector& items, + std::uint32_t& outSuccess, std::uint32_t& outFail) { + outSuccess = 0; + outFail = 0; + if (items.empty()) return; + + ExtraWorkSpace localWorkSpace; + ExtraWorkSpace* ws = static_cast(tls_preallocAppendWorkSpace); + if (!ws) { + m_headIndex->InitWorkSpace(&localWorkSpace); + ws = &localWorkSpace; + } + + // Phase 1: per-head prep (race-condition wait, + // resurrection or refusal) and per-item versionMap + // mirroring. Items refused at this phase count as + // failures and are excluded from the MultiMerge. + std::vector alive(items.size(), true); + for (size_t i = 0; i < items.size(); ++i) { + auto* req = items[i]; + if (req->m_appendPosting.empty() || req->m_appendNum == 0) { + // Defensive drop (matches Append()'s gate). + alive[i] = false; + ++outSuccess; + continue; + } + bool observedStructural = HandleRaceCondition(req->m_headID); + bool wasMissing = !m_headIndex->ContainSample(req->m_headID, m_layer + 1); + if (wasMissing && observedStructural) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, + "BatchAppendCallback: head=%lld deleted by local structural op; refusing\n", + (std::int64_t)req->m_headID); + alive[i] = false; + ++outFail; + continue; + } + if (wasMissing && !req->m_headVec.empty()) { + DimensionType dim = static_cast( + req->m_headVec.size() / sizeof(ValueType)); + m_headIndex->AddHeadIndex(req->m_headVec.data(), + req->m_headID, 0, dim, m_layer + 1, ws); + } + + // Mirror sender's versionMap for the records we're + // about to persist (otherwise MergePostings / + // SearchIndex would drop them as stale). + const uint8_t* basePtr = + reinterpret_cast(req->m_appendPosting.data()); + size_t totalRec = req->m_appendPosting.size() / m_vectorInfoSize; + EnsureVersionMapCoversPosting(basePtr, totalRec, + "BatchAppendCallback", req->m_headID); + const SizeType localCount = m_versionMap->Count(); + std::vector batchVids; + std::vector batchVers; + batchVids.reserve(totalRec); + batchVers.reserve(totalRec); + for (size_t k = 0; k < totalRec; ++k) { + const uint8_t* p = basePtr + k * m_vectorInfoSize; + SizeType vid = *reinterpret_cast(p); + uint8_t recVer = *(p + sizeof(SizeType)); + if (vid < 0 || vid >= localCount) continue; + if (recVer == 0xfe) continue; + uint8_t curVer = m_versionMap->GetVersion(vid); + if (curVer == 0xfe) continue; + if (curVer == recVer) continue; + batchVids.push_back(vid); + batchVers.push_back(recVer); + } + if (!batchVids.empty()) { + m_versionMap->SetVersionBatch(batchVids, batchVers); + } + } + + // Phase 2: group surviving items by headID, then + // hand the grouped map to BatchAppend so it issues + // a single db->MultiMerge for all heads. + std::unordered_map headAppends; + headAppends.reserve(items.size()); + size_t aliveCount = 0; + for (size_t i = 0; i < items.size(); ++i) { + if (!alive[i]) continue; + auto* req = items[i]; + auto& dst = headAppends[req->m_headID]; + if (dst.empty()) dst = std::move(req->m_appendPosting); + else dst.append(req->m_appendPosting); + ++aliveCount; + } + if (headAppends.empty()) return; + + ErrorCode ret = BatchAppend(ws, headAppends, "PeerBatch"); + if (ret == ErrorCode::Success) { + outSuccess += static_cast(aliveCount); + } else { + outFail += static_cast(aliveCount); + } + }); + // Head sync callback: apply head index updates from peers auto* headIndex = m_headIndex; int layer = m_layer; @@ -1196,17 +1334,10 @@ namespace SPTAG::SPANN { uint64_t splitPostingVectors = 0; uint64_t splitNewHeadCount = 0; - // Only the OWNER of headID should run Split. Remote-issued - // splits get dropped early so we don't mutate a posting that - // doesn't live on this node. - if (IsRemoteOwnedHead(headID)) { - std::unique_lock tmplock(m_splitListLock); - m_splitList.unsafe_erase(headID); - return ErrorCode::Success; - } - - // Owner-side: wait for any in-flight remote-initiated lock on - // this bucket to release the advisory flag before we mutate. + // Ownership filtering is the single gate inside SplitAsync; by + // the time we get here the head is guaranteed local-owned. No + // re-check needed (hash ring is static once initialized, and + // only layer 0 routes anyway). WaitForRemoteBucketUnlocked(headID); { @@ -1695,15 +1826,10 @@ namespace SPTAG::SPANN { ErrorCode MergePostings(ExtraWorkSpace *p_exWorkSpace, SizeType headID) { - // The owner runs its own merge passes. Skip when this head is - // owned by another node — we'd just be racing the owner. - // (Defense in depth: MergeAsync already filters at enqueue, but - // ownership can change between enqueue and execution.) - if (IsRemoteOwnedHead(headID)) { - std::unique_lock tmplock(m_mergeListLock); - m_mergeList.unsafe_erase(headID); - return ErrorCode::Success; - } + // Ownership filtering is the single gate inside MergeAsync; by + // the time we get here the head is guaranteed local-owned. No + // re-check needed (hash ring is static once initialized, and + // only layer 0 routes anyway). WaitForRemoteBucketUnlocked(headID); std::unique_lock lock(m_rwLocks[headID]); @@ -1724,6 +1850,16 @@ namespace SPTAG::SPANN { std::string mergedPostingList; std::set vectorIdSet; + // Tracks the loser VID after a successful merge so we can + // broadcast a HeadSync Delete entry to peers after releasing + // the per-head RWLock. Split mirrors this pattern at + // line ~1620 with both Add (new heads) and Delete (original + // head) entries. Without this broadcast, peers keep routing + // BatchAppend traffic to the deleted head -- the receiver's + // AppendCallback wasMissing branch would then resurrect a + // dead head, leaving a zombie until the next merge round. + SizeType deletedHeadVID = -1; + std::string currentPostingList; ErrorCode ret; { @@ -1927,6 +2063,7 @@ namespace SPTAG::SPANN { return ret; } } + deletedHeadVID = queryResult->VID; nextHeadID = headID; nextHeadVec = headVec; deletedHeadVec = resultVec; @@ -1960,6 +2097,7 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(headID)); return ret; } + deletedHeadVID = headID; nextHeadID = queryResult->VID; nextHeadVec = resultVec; deletedHeadVec = headVec; @@ -2008,6 +2146,30 @@ namespace SPTAG::SPANN { MergeAsync(nextHeadID); } } + + // Broadcast HeadSync Delete for the merge loser so peer + // compute nodes drop it from their in-memory head index. + // Without this, peers keep routing BatchAppend traffic to + // the deleted head; the receiver's AppendCallback then + // either resurrects it (zombie) or refuses (sender retry + // loop) until the next merge round happens to delete it + // again. Mirrors the Split broadcast at line ~1620. + // Skipped when our layer is disk-backed (TiKV is source + // of truth there) or when no worker is wired. + if (deletedHeadVID != -1 && m_worker && m_worker->IsEnabled() + && m_headIndex->GetDiskIndex(m_layer + 1) == nullptr) { + std::vector headSyncEntries; + HeadSyncEntry entry; + entry.op = HeadSyncEntry::Op::Delete; + entry.headVID = deletedHeadVID; + entry.m_layer = m_layer; + headSyncEntries.push_back(std::move(entry)); + if (m_headSyncLog) { + int shard = m_worker->GetWorkerNodeIndex(); + m_headSyncLog->Append(shard, headSyncEntries); + } + m_worker->BroadcastHeadSync(headSyncEntries); + } m_stat.m_mergeNum++; return ErrorCode::Success; } @@ -2030,9 +2192,11 @@ namespace SPTAG::SPANN { inline void SplitAsync(SizeType headID, int postingSize, std::function p_callback = nullptr) { - // Don't enqueue split jobs for heads we don't own; the owner - // will detect oversize on its own. Skipping here avoids - // burning a thread-pool slot only to drop the job in Split(). + // Single authoritative ownership gate. Sources of remote-owned + // headIDs that legitimately reach here: RefineIndex full scan, + // Search→MergeAsync via search result, Split-internal re-enqueue + // for new-head VIDs, MergePostings re-merge of survivor. Drop + // them so the owner runs its own structural pass. if (IsRemoteOwnedHead(headID)) return; { Helper::Concurrent::ConcurrentMap::value_type workPair(headID, postingSize); @@ -2054,10 +2218,11 @@ namespace SPTAG::SPANN { inline void MergeAsync(SizeType headID, std::function p_callback = nullptr) { - // Don't enqueue merge jobs for heads we don't own; the owner - // runs its own merge pass. Filtering here is the single - // upstream gate so MergePostings's owner check is only a - // defense-in-depth net. + // Single authoritative ownership gate. Sources of remote-owned + // headIDs that legitimately reach here: RefineIndex full scan, + // Search→MergeAsync via search result, MergePostings re-merge of + // survivor (nextHeadID). Drop them so the owner runs its own + // merge pass. if (IsRemoteOwnedHead(headID)) return; { std::shared_lock tmplock(m_mergeListLock); @@ -2531,8 +2696,16 @@ namespace SPTAG::SPANN { (int)(posting.size() / m_vectorInfoSize), posting, headVecBytes)) { + m_routedRemoteHeads.fetch_add(1, std::memory_order_relaxed); + m_routedRemoteItems.fetch_add( + posting.size() / m_vectorInfoSize, + std::memory_order_relaxed); continue; } + m_routedLocalHeads.fetch_add(1, std::memory_order_relaxed); + m_routedLocalItems.fetch_add( + posting.size() / m_vectorInfoSize, + std::memory_order_relaxed); } std::unique_lock headLock(m_rwLocks[headID]); @@ -3777,25 +3950,48 @@ namespace SPTAG::SPANN { walPending = m_worker->GetBatchAppendWalPendingItems(); remoteOriginPending = m_worker->GetRemoteOriginPendingItems(m_layer); } - // Split the local pool's pending queue into the portion - // serving peer-originated BatchAppend items vs the residual - // (local-origin RMWs, split/merge/reassign jobs). Helps - // operators distinguish "I'm bottlenecked applying remote - // work" from "my own inserts are backlogged". - size_t localPending = totalJobs > remoteOriginPending + // Split the local pool's pending queue by ORIGIN of the + // work, not by processing site. Both buckets are being + // processed locally on this node's SPDKThreadPool: + // selfOrig: jobs the local AddIndex generated (own + // splits/merges/reassigns/appends). + // peerOrig: BatchAppendItemJob unpacked from BatchAppend + // RPCs that peers routed to us because we own + // the head. When peer A sends 10000 items to + // us they land here, not in A's queue. + // Items WE dispatched to peers (and are waiting on their + // response) are reported separately as "remote out + // queueDepth" + "inflightChunks" + "walPendingItems". + // + // Asymmetry note: selfOrig is usually near 0 even when + // GetOwner is perfectly balanced. Local AddIndex calls + // for LOCAL-owned heads bypass the pool entirely (one + // synchronous db->MultiMerge per BatchAppend batch + // covers them all). Peer-originated BatchAppend + // requests, by contrast, unpack into ONE pool job per + // item, so a single 10k-item RPC inflates peerOrig by + // 10k. Use "addIndex route" below to verify owner + // partitioning is healthy. + size_t selfOrigPending = totalJobs > remoteOriginPending ? totalJobs - remoteOriginPending : 0; + size_t routedLocalH = m_routedLocalHeads.load(); + size_t routedRemoteH = m_routedRemoteHeads.load(); + size_t routedLocalI = m_routedLocalItems.load(); + size_t routedRemoteI = m_routedRemoteItems.load(); SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "layer %d pending queue:%zu (local:%zu remote:%zu) split:%zu merge:%zu append:%zu reassign:%zu running:%u | " + "layer %d pending queue:%zu (selfOrig:%zu peerOrig:%zu) split:%zu merge:%zu append:%zu reassign:%zu running:%u | " "total_submitted split:%zu merge:%zu reassign:%zu append:%zu | " "total_completed split:%zu merge:%zu reassign:%zu | " + "addIndex route heads(local:%zu remote:%zu) items(local:%zu remote:%zu) | " "remote out queueDepth:%zu inflightChunks:%d totalRouted:%zu walPendingItems:%zu | " "split_latency avg:%.1fms max:%.1fms\n", - m_layer, totalJobs, localPending, remoteOriginPending, + m_layer, totalJobs, selfOrigPending, remoteOriginPending, m_splitJobsInFlight.load(), m_mergeJobsInFlight.load(), m_appendJobsInFlight.load(), m_reassignJobsInFlight.load(), runningJobs, m_totalSplitSubmitted.load(), m_totalMergeSubmitted.load(), m_totalReassignSubmitted.load(), m_totalAppendCount.load(), m_totalSplitCompleted.load(), m_totalMergeCompleted.load(), m_totalReassignCompleted.load(), + routedLocalH, routedRemoteH, routedLocalI, routedRemoteI, remoteQ, remoteInflight, remoteTotal, walPending, avgSplitMs, maxSplitMs); } @@ -3924,21 +4120,27 @@ namespace SPTAG::SPANN { walPending = m_worker->GetBatchAppendWalPendingItems(); remoteOriginPending = m_worker->GetRemoteOriginPendingItems(m_layer); } - size_t localPending = totalJobs > remoteOriginPending + size_t selfOrigPending = totalJobs > remoteOriginPending ? totalJobs - remoteOriginPending : 0; + size_t routedLocalH = m_routedLocalHeads.load(); + size_t routedRemoteH = m_routedRemoteHeads.load(); + size_t routedLocalI = m_routedLocalItems.load(); + size_t routedRemoteI = m_routedRemoteItems.load(); // if (!ShouldLogProgress(totalJobs)) return; SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "layer %d pending queue:%zu (local:%zu remote:%zu) split:%zu merge:%zu append:%zu reassign:%zu running:%u | " + "layer %d pending queue:%zu (selfOrig:%zu peerOrig:%zu) split:%zu merge:%zu append:%zu reassign:%zu running:%u | " "total_submitted split:%zu merge:%zu reassign:%zu append:%zu | " "total_completed split:%zu merge:%zu reassign:%zu | " + "addIndex route heads(local:%zu remote:%zu) items(local:%zu remote:%zu) | " "remote out queueDepth:%zu inflightChunks:%d totalRouted:%zu walPendingItems:%zu | " "split_latency avg:%.1fms max:%.1fms\n", - m_layer, totalJobs, localPending, remoteOriginPending, + m_layer, totalJobs, selfOrigPending, remoteOriginPending, m_splitJobsInFlight.load(), m_mergeJobsInFlight.load(), m_appendJobsInFlight.load(), m_reassignJobsInFlight.load(), m_splitThreadPool ? static_cast(m_splitThreadPool->runningJobs()) : 0, m_totalSplitSubmitted.load(), m_totalMergeSubmitted.load(), m_totalReassignSubmitted.load(), m_totalAppendCount.load(), m_totalSplitCompleted.load(), m_totalMergeCompleted.load(), m_totalReassignCompleted.load(), + routedLocalH, routedRemoteH, routedLocalI, routedRemoteI, remoteQ, remoteInflight, remoteTotal, walPending, avgSplitMs, maxSplitMs); } diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index 4431460cf..e3a2c22ab 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -126,11 +126,14 @@ DefineSSDParameter(m_versionCacheMaxChunks, int, 10000, "VersionCacheMaxChunks") DefineSSDParameter(m_asyncRpcMaxInflight, int, 0, "AsyncRpcMaxInflight") // Distributed RemotePostingOps RPC tuning -// ChunkSize=10000: each in-flight chunk holds enough work to amortize the -// network roundtrip and grpc framing cost (a 3000-item chunk took ~500ms at -// 1M-scale; 10000 should hit ~1.5s and roughly 3× the per-second throughput -// for the same in-flight cap). -DefineSSDParameter(m_remoteAppendChunkSize, int, 10000, "RemoteAppendChunkSize") +// ChunkSize=20000: with the receiver-side BatchAppendLayerJob fast path (one +// db->MultiMerge per chunk instead of N per-item Merges), larger chunks pay +// off — they amortize the network roundtrip without exploding the receiver +// pool depth. 20K is a balance: small enough that ChunkSize × MaxInflight +// stays under the WAL admission-control cap (so chunks take the WAL-backed +// fast-ACK path), large enough that the network roundtrip overhead is small +// vs. per-chunk work. 50K was tried and immediately tripped the WAL cap. +DefineSSDParameter(m_remoteAppendChunkSize, int, 20000, "RemoteAppendChunkSize") DefineSSDParameter(m_remoteAppendRetry, int, 3, "RemoteAppendRetry") DefineSSDParameter(m_remoteAppendTimeoutSec, int, 180, "RemoteAppendTimeoutSec") // MaxInflight=8 (was 4): keeps the receiver's 16-thread BatchAppendItemJob pool From 15f17c9a79655ca8561febabaa3cbb4948758a23 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Sun, 24 May 2026 03:38:36 +0000 Subject: [PATCH 30/48] fix(distributed): atomic Split locking, drop async retries, drain on shutdown Three related fixes for distributed Split/Merge robustness. 1. Atomic Split lock acquisition (Phase A/B/C/D) Refactor Split() into precompute-plan / build-payloads / acquire-all-locks / execute-writes phases. Closes the strand window where k=0 wrote and k=1 then failed to lock, leaving cluster-1 vectors orphaned. All per-VID local locks (sorted ascending) and per-(owner,bucket) remote fencing-token leases (sorted ascending) are acquired before any DB write; failure cleanly releases and re-enqueues via SplitAsync. The deterministic ordering prevents deadlock between concurrent Splits on overlapping heads. 2. Drop SplitAsync/MergeAsync retries Structural ops are best-effort self-healing: a failed Split leaves the head oversized so the next Append re-triggers SplitAsync; a failed Merge leaves postings undersize so the next Search-driven AsyncMergeInSearch / RefineIndex re-triggers MergeAsync. The previous retry loop burned pool slots and racy-spawned jobs into a torn-down WorkerNode at shutdown, which is what was producing the segfault. 3. Drain async jobs in ~ExtraDynamicSearcher The dtor used to set m_worker=nullptr immediately; in-flight Split/Merge jobs joined later by the ThreadPool dtor then null-deref m_worker via QueueRemoteAppend. Now poll per-layer in-flight counters until zero (30 s timeout) before clearing callbacks, and leave m_worker alone - it is externally owned by the SPFreshTest router. Plus support cleanup: - RemoteLeaseGuard: reusable RAII type with fencing-token validation, replacing the inline RemoteLockGuard helper in MergePostings. - HandleRaceCondition removed: the single-gate refactor at SplitAsync/MergeAsync plus atomic locking above closes the race it was working around; the AppendCallback/BatchAppendCallback wasMissing branch now refuses unconditionally. - MergePostings distinguishes Key_NotFound (skip stale candidate) from other IO failures (propagate) instead of silent-skipping all errors. Measured (2-node insert_dominant, 1M vectors): Insert throughput: 1141.6 /s (baseline 758 /s, +50%) Recall@5: 0.984 Segfaults: 0 (was: shutdown crash every run) Retry log lines: 0 Drain timeouts: 0 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 850 ++++++++++-------- 1 file changed, 459 insertions(+), 391 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index fefd20b24..3fc2e639e 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -28,7 +28,10 @@ #include #include #include +#include #include +#include +#include #include #include #include @@ -59,6 +62,50 @@ extern "C" bool RocksDbIOUringEnable() { return true; } namespace SPTAG::SPANN { + // RAII lease holder for a remote per-bucket lock issued by + // WorkerNode::SendRemoteLock. Stores the fencing token so the + // release call can be validated by the owner. Used by both Split + // (via a token map for batched acquisition) and MergePostings + // (per-candidate, one lease at a time). + struct RemoteLeaseGuard { + WorkerNode* router = nullptr; + int nodeIndex = -1; + int layer = 0; + SizeType vid = -1; + std::uint64_t token = 0; + + RemoteLeaseGuard() = default; + RemoteLeaseGuard(const RemoteLeaseGuard&) = delete; + RemoteLeaseGuard& operator=(const RemoteLeaseGuard&) = delete; + RemoteLeaseGuard(RemoteLeaseGuard&& o) noexcept { *this = std::move(o); } + RemoteLeaseGuard& operator=(RemoteLeaseGuard&& o) noexcept { + release(); + router = o.router; nodeIndex = o.nodeIndex; layer = o.layer; + vid = o.vid; token = o.token; + o.router = nullptr; o.token = 0; + return *this; + } + ~RemoteLeaseGuard() { release(); } + + // Returns true on success (token != 0). Caller decides whether + // a denial means "skip candidate" or "propagate failure". + bool acquire(WorkerNode* r, int n, int l, SizeType v) { + release(); + if (!r) return false; + std::uint64_t t = r->SendRemoteLock(n, l, v, true, 0); + if (t == 0) return false; + router = r; nodeIndex = n; layer = l; vid = v; token = t; + return true; + } + void release() { + if (router && token) { + router->SendRemoteLock(nodeIndex, layer, vid, false, token); + } + router = nullptr; token = 0; + } + bool active() const { return router != nullptr && token != 0; } + }; + template class ExtraDynamicSearcher : public IExtraSearcher { @@ -68,7 +115,6 @@ namespace SPTAG::SPANN { ExtraDynamicSearcher* m_extraIndex; SizeType m_headID; std::function m_callback; - int m_attempts = 0; public: MergeAsyncJob(ExtraDynamicSearcher* extraIndex, SizeType headID, std::function p_callback) : m_extraIndex(extraIndex), m_headID(headID), m_callback(std::move(p_callback)) {} @@ -79,56 +125,8 @@ namespace SPTAG::SPANN { } inline void exec(void* p_workSpace, IAbortOperation* p_abort) override { ErrorCode ret = m_extraIndex->MergePostings((ExtraWorkSpace*)p_workSpace, m_headID); - if (ret != ErrorCode::Success) { - // Classify before retrying: transient errors (TiKV - // region_error, timeout, generic Fail from the IO - // layer) deserve a bounded retry with exponential - // backoff; permanent errors (data inconsistency, - // unknown ErrorCode) cannot be repaired by retry and - // get dropped with a warning so we don't burn - // pool slots in a hot fail loop. - int maxRetry = m_extraIndex->m_opt - ? m_extraIndex->m_opt->m_asyncJobMaxRetry : 0; - bool transient = Distributed::IsTransientAsyncJobError(ret); - if (transient && m_attempts + 1 < maxRetry) { - // Async-job fault-tolerance contract: merges are - // safe to retry idempotently (the owner check, the - // ContainSample liveness gate, and the locked RMW - // all re-evaluate on each attempt). Enqueue a - // fresh Job carrying the bumped attempt count via - // the delayed-retry scheduler so backoff happens - // OFF the pool worker — the ThreadPool worker - // will `delete` *this* after we return, so we - // cannot re-add the same pointer. Keep - // m_mergeJobsInFlight unchanged: the new job - // takes ownership of the in-flight slot. - int backoffMs = Distributed::AsyncJobRetryBackoffMs(m_attempts); - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "MergeAsyncJob: head=%lld attempt=%d failed ret=%d (transient), backoff=%dms\n", - (std::int64_t)m_headID, m_attempts + 1, (int)ret, backoffMs); - auto* retryJob = new MergeAsyncJob(m_extraIndex, m_headID, m_callback); - retryJob->m_attempts = m_attempts + 1; - m_extraIndex->GetOrCreateDelayedRetryScheduler().Schedule( - m_extraIndex->m_splitThreadPool, retryJob, backoffMs); - return; - } - if (!transient) { - // Permanent: log once and drop. Do not promote to - // m_asyncStatus — these are usually local data - // inconsistencies (e.g. version skew) that the - // next caller-driven recovery will repair, and - // poisoning m_asyncStatus would surface them as - // a process-wide failure. - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "MergeAsyncJob: head=%lld permanent failure ret=%d, dropping\n", - (std::int64_t)m_headID, (int)ret); - } else { - m_extraIndex->m_asyncStatus = ret; - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "MergeAsyncJob: head=%lld giving up after %d attempts ret=%d (transient exhausted)\n", - (std::int64_t)m_headID, m_attempts + 1, (int)ret); - } - } + if (ret != ErrorCode::Success) + m_extraIndex->m_asyncStatus = ret; m_extraIndex->m_mergeJobsInFlight--; m_extraIndex->m_totalMergeCompleted++; if (m_callback != nullptr) { @@ -143,7 +141,6 @@ namespace SPTAG::SPANN { ExtraDynamicSearcher* m_extraIndex; SizeType m_headID; std::function m_callback; - int m_attempts = 0; public: SplitAsyncJob(ExtraDynamicSearcher* extraIndex, SizeType headID, std::function p_callback) : m_extraIndex(extraIndex), m_headID(headID), m_callback(std::move(p_callback)) {} @@ -160,36 +157,8 @@ namespace SPTAG::SPANN { m_extraIndex->m_totalSplitTimeUs += elapsedUs; uint64_t prevMax = m_extraIndex->m_maxSplitTimeUs.load(); while (elapsedUs > prevMax && !m_extraIndex->m_maxSplitTimeUs.compare_exchange_weak(prevMax, elapsedUs)); - if (ret != ErrorCode::Success) { - // Same classification scheme as MergeAsyncJob. - // Splits are designed safe to retry idempotently - // (read-deduplicate during the next attempt handles - // partial writes from a previously-crashed attempt). - int maxRetry = m_extraIndex->m_opt - ? m_extraIndex->m_opt->m_asyncJobMaxRetry : 0; - bool transient = Distributed::IsTransientAsyncJobError(ret); - if (transient && m_attempts + 1 < maxRetry) { - int backoffMs = Distributed::AsyncJobRetryBackoffMs(m_attempts); - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "SplitAsyncJob: head=%lld attempt=%d failed ret=%d (transient), backoff=%dms\n", - (std::int64_t)m_headID, m_attempts + 1, (int)ret, backoffMs); - auto* retryJob = new SplitAsyncJob(m_extraIndex, m_headID, m_callback); - retryJob->m_attempts = m_attempts + 1; - m_extraIndex->GetOrCreateDelayedRetryScheduler().Schedule( - m_extraIndex->m_splitThreadPool, retryJob, backoffMs); - return; - } - if (!transient) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "SplitAsyncJob: head=%lld permanent failure ret=%d, dropping\n", - (std::int64_t)m_headID, (int)ret); - } else { - m_extraIndex->m_asyncStatus = ret; - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "SplitAsyncJob: head=%lld giving up after %d attempts ret=%d (transient exhausted)\n", - (std::int64_t)m_headID, m_attempts + 1, (int)ret); - } - } + if (ret != ErrorCode::Success) + m_extraIndex->m_asyncStatus = ret; m_extraIndex->m_splitJobsInFlight--; m_extraIndex->m_totalSplitCompleted++; if (m_callback != nullptr) { @@ -464,12 +433,39 @@ namespace SPTAG::SPANN { } ~ExtraDynamicSearcher() { + // Order matters: drain async jobs BEFORE nulling m_worker. + // An in-flight SplitAsyncJob may still be inside Split() → + // QueueRemoteAppend; clearing m_worker first turns that into a + // null-deref segfault. Wait for the local pool slice owned by + // *this* layer to quiesce before touching shared state. + DrainAsyncJobs(); if (m_worker) { m_worker->ClearCallbacksIfOwner(m_layer, this); - m_worker = nullptr; } } + // Wait for SplitAsync/MergeAsync/Append jobs targeting THIS layer + // to finish before we tear down. The pool itself may be shared + // with sibling layers / the head index, so we can't just destroy + // it; instead we poll the per-layer in-flight counters. + void DrainAsyncJobs() { + using clock = std::chrono::steady_clock; + auto deadline = clock::now() + std::chrono::seconds(30); + while (clock::now() < deadline) { + int s = m_splitJobsInFlight.load(std::memory_order_relaxed); + int m = m_mergeJobsInFlight.load(std::memory_order_relaxed); + int a = m_appendJobsInFlight.load(std::memory_order_relaxed); + if (s == 0 && m == 0 && a == 0) return; + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "ExtraDynamicSearcher layer=%d: drain timeout, split=%d merge=%d append=%d still in-flight\n", + m_layer, + (int)m_splitJobsInFlight.load(), + (int)m_mergeJobsInFlight.load(), + (int)m_appendJobsInFlight.load()); + } + int GetNumWorkerNodes() const { if (m_worker && m_worker->IsEnabled()) { return std::max(1, m_worker->GetNumWorkerNodes()); @@ -492,46 +488,6 @@ namespace SPTAG::SPANN { return m_initialVectorSize + (localVID - m_initialVectorSize) * numWorkers + GetWorkerNodeIndex(); } - // Receive-side race coordination: before applying a remote Append - // for headID, make sure no local Split or Merge is currently - // mutating the same head. Splits delete the original head and - // create new ones; merges delete a loser head. If we let the - // append's wasMissing branch run while a Split/Merge holds the - // RWLock, the AddHeadIndex resurrection would race the local - // DeleteIndex and we'd briefly bring a dead head back to life - // (only papered over by the eventual HeadSync from the structural - // op). Briefly acquiring the RWLock here serializes us behind - // the in-flight structural op without forking an explicit - // condition-variable channel. After the structural op completes - // its bookkeeping (lists drained, head index updated, HeadSync - // broadcast), the callback re-checks ContainSample with a stable - // view. When the head is genuinely gone, sender retries against - // the updated head index and routes to the new owner. - // - // Returns true if a structural op was observed (the head was in - // m_splitList or m_mergeList at check time). The AppendCallback - // uses this to refuse resurrecting a head that was likely just - // deleted by the wait-on-RWLock'd structural op: resurrecting - // would race against the merge's HeadSync Delete broadcast and - // leave a zombie head until the next merge round drops it again. - bool HandleRaceCondition(SizeType headID) { - bool inSplit = false, inMerge = false; - { - std::shared_lock sl(m_splitListLock); - inSplit = (m_splitList.find(headID) != m_splitList.end()); - } - { - std::shared_lock sl(m_mergeListLock); - inMerge = (m_mergeList.find(headID) != m_mergeList.end()); - } - if (!inSplit && !inMerge) return false; - // Wait until the structural op releases the per-head RWLock. - // Acquire-and-immediately-release; the Append below re-locks. - std::unique_lock w(m_rwLocks[headID]); - (void)w; - return true; - } - // SPDKThreadPool. Called both after pool creation and from // SetWorker(); whichever happens last actually binds the submitter. // Idempotent: wires the receiver's BatchAppend Jobs onto our shared @@ -601,12 +557,6 @@ namespace SPTAG::SPANN { m_worker->SetAppendCallback(m_layer, [this](SizeType headID, std::shared_ptr headVec, int appendNum, std::string& appendPosting) -> ErrorCode { - // Per-design HandleRaceCondition: wait for any local - // Split/Merge on this head to commit before we look at - // the head index. Otherwise the wasMissing branch - // below can resurrect a head that the structural op - // just deleted. - bool observedStructural = HandleRaceCondition(headID); // Reuse SPDKThreadPool's per-worker pre-allocated workspace // when called from BatchAppendItemJob on m_splitThreadPool. @@ -617,7 +567,7 @@ namespace SPTAG::SPANN { ws = &localWorkSpace; } bool wasMissing = !m_headIndex->ContainSample(headID, m_layer + 1); - if (wasMissing && observedStructural) { + if (wasMissing) { // We waited for an in-flight Split/Merge and the // head is gone afterwards -- the structural op // deleted it on purpose. Resurrecting via @@ -671,13 +621,7 @@ namespace SPTAG::SPANN { return Append(ws, headID, appendNum, appendPosting, 0); }); - // Batch append callback: receiver-side fast path. Replaces - // the per-item job fan-out with a single Job per layer that - // groups items by headID and issues ONE db->MultiMerge, - // matching the local AddIndex BatchAppend throughput profile. - // Without this, a single 10k-item peer RPC inflates the - // receiver's pool by 10k jobs and 10k Merge calls -- the - // dominant receiver-side bottleneck observed in 2-node tests. + // Batch append callback: receiver-side fast path. m_worker->SetBatchAppendCallback(m_layer, [this](std::vector& items, std::uint32_t& outSuccess, std::uint32_t& outFail) { @@ -705,9 +649,9 @@ namespace SPTAG::SPANN { ++outSuccess; continue; } - bool observedStructural = HandleRaceCondition(req->m_headID); + bool wasMissing = !m_headIndex->ContainSample(req->m_headID, m_layer + 1); - if (wasMissing && observedStructural) { + if (wasMissing) { SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "BatchAppendCallback: head=%lld deleted by local structural op; refusing\n", (std::int64_t)req->m_headID); @@ -1333,13 +1277,6 @@ namespace SPTAG::SPANN { double elapsedMSeconds; uint64_t splitPostingVectors = 0; uint64_t splitNewHeadCount = 0; - - // Ownership filtering is the single gate inside SplitAsync; by - // the time we get here the head is guaranteed local-owned. No - // re-check needed (hash ring is static once initialized, and - // only layer 0 routes anyway). - WaitForRemoteBucketUnlocked(headID); - { std::unique_lock lock(m_rwLocks[headID], std::defer_lock); if (requirelock) { @@ -1494,27 +1431,217 @@ namespace SPTAG::SPANN { } else { ks[1] = 1; } - SizeType newHeadVID = -1; - int first = 0; - for (int k : ks) { - if (args.counts[k] == 0) continue; - first = (k == 0) ? 0 : args.counts[0]; - newPostingLists[k].resize(args.counts[k] * m_vectorInfoSize); - char* ptr = (char*)(newPostingLists[k].c_str()); - for (int j = 0; j < args.counts[k]; j++, ptr += m_vectorInfoSize) + // === Phase A: precompute per-child plan (no I/O, no locks) === + // We resolve newHeadVID, isSameHead, and ownership for each of + // the two cluster children up-front so Phase B can acquire + // every lock the split will need before any DB write. This + // closes the strand window where k=0 wrote and k=1 then + // failed to lock, leaving cluster-1's vectors orphaned. + struct ChildPlan { + bool active = false; + bool isSameHead = false; + bool isRemote = false; + int ownerNode = -1; + SizeType newHeadVID = -1; + uint8_t version = 0; + }; + ChildPlan plans[2]; + { + bool tentativeSameHead = false; + for (int k : ks) { + if (args.counts[k] == 0) continue; + plans[k].active = true; + if (!tentativeSameHead && + m_headIndex->ComputeDistance(args.centers + k * args._D, headVec->c_str() + m_metaDataSize) < Epsilon) { + plans[k].isSameHead = true; + plans[k].newHeadVID = headID; + tentativeSameHead = true; + } else { + plans[k].newHeadVID = *((SizeType*)(postingP + args.clusterIdx[k] * m_vectorInfoSize)); + plans[k].version = *((uint8_t*)(postingP + args.clusterIdx[k] * m_vectorInfoSize + sizeof(SizeType))); + int owner = -1; + if (IsRemoteOwnedHead(plans[k].newHeadVID, &owner)) { + plans[k].isRemote = true; + plans[k].ownerNode = owner; + } + } + } + } + + // === Phase B: build per-child posting payloads (memory only) === + { + int first = 0; + for (int k : ks) { + if (!plans[k].active) continue; + first = (k == 0) ? 0 : args.counts[0]; + newPostingLists[k].resize(args.counts[k] * m_vectorInfoSize); + char* ptr = (char*)(newPostingLists[k].c_str()); + for (int j = 0; j < args.counts[k]; j++, ptr += m_vectorInfoSize) { + memcpy(ptr, postingList.c_str() + localIndices[first + j] * m_vectorInfoSize, m_vectorInfoSize); + } + if (plans[k].isSameHead && !hasHead) { + newPostingLists[k] += *headVec; + } + } + } + + // === Phase C: atomically acquire every lock the split needs === + // srcHead lock is already held above. We additionally need + // a per-VID local lock for each local newHead (!=headID), + // and a remote lease (with fencing token) for each remote + // newHead. Acquire in deterministic order (local: VID asc; + // remote: (ownerNode,bucket) asc) so two concurrent Splits + // touching overlapping heads can't deadlock. + // + // If ANY lock cannot be obtained, release whatever we got + // and re-enqueue via SplitAsync. No DB write has happened + // yet, so nothing strands. + std::vector> localChildLocks; + struct RemoteLeaseHeld { std::uint64_t token; int refcount; SizeType sampleVID; }; + std::map, RemoteLeaseHeld> remoteTokens; + + auto bucketKey = [](int owner, SizeType vid) { + return std::make_pair(owner, + COMMON::FineGrainedRWLock::BucketIndex(static_cast(vid))); + }; + + auto releaseRemoteTokens = [&]() { + if (!m_worker) { remoteTokens.clear(); return; } + for (auto& kv : remoteTokens) { + m_worker->SendRemoteLock(kv.first.first, m_layer, + kv.second.sampleVID, false, kv.second.token); + } + remoteTokens.clear(); + }; + + auto reenqueueAndExit = [&](const char* reason) -> ErrorCode { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "Split: lock acquisition failed (%s) for srcHead %lld; re-enqueueing via SplitAsync\n", + reason, (std::int64_t)headID); + releaseRemoteTokens(); + localChildLocks.clear(); // RAII unlock { - memcpy(ptr, postingList.c_str() + localIndices[first + j] * m_vectorInfoSize, m_vectorInfoSize); + std::unique_lock tmplock(m_splitListLock); + m_splitList.unsafe_erase(headID); + } + SplitAsync(headID, postingList.size() / m_vectorInfoSize); + return ErrorCode::Success; + }; + + // C.1 Local newHead locks (ascending VID order to avoid GlobalLock deadlock) + { + std::vector localVids; + for (int k = 0; k < 2; ++k) { + if (!plans[k].active || plans[k].isRemote || plans[k].isSameHead) continue; + if (plans[k].newHeadVID == headID) continue; + localVids.push_back(plans[k].newHeadVID); } - if (!theSameHead && m_headIndex->ComputeDistance(args.centers + k * args._D, headVec->c_str() + m_metaDataSize) < Epsilon) { + std::sort(localVids.begin(), localVids.end()); + localVids.erase(std::unique(localVids.begin(), localVids.end()), localVids.end()); + + for (SizeType vid : localVids) { + std::unique_lock ul(m_rwLocks[vid], std::defer_lock); + int rtry = 0; + while (!ul.try_lock() && rtry < 20) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "Split: local newHead VID %lld lock busy (attempt %d)\n", + (std::int64_t)vid, rtry + 1); + rtry++; + std::this_thread::sleep_for(std::chrono::milliseconds(3 * rtry)); + } + if (!ul.owns_lock()) { + return reenqueueAndExit("local child lock"); + } + localChildLocks.push_back(std::move(ul)); + } + } + + // C.2 Remote newHead locks (ascending (ownerNode, bucket) order) + { + struct RemoteSlot { int k; int owner; unsigned bucket; }; + std::vector slots; + for (int k = 0; k < 2; ++k) { + if (!plans[k].active || !plans[k].isRemote) continue; + slots.push_back({k, plans[k].ownerNode, + COMMON::FineGrainedRWLock::BucketIndex(static_cast(plans[k].newHeadVID))}); + } + std::sort(slots.begin(), slots.end(), + [](const RemoteSlot& a, const RemoteSlot& b) { + return std::tie(a.owner, a.bucket) < std::tie(b.owner, b.bucket); + }); + for (auto& slot : slots) { + auto key = std::make_pair(slot.owner, slot.bucket); + auto it = remoteTokens.find(key); + if (it != remoteTokens.end()) { + // Same (ownerNode, bucket) as a previously-acquired + // child; the owner's per-bucket lease covers both + // children, so reuse the token and bump refcount. + it->second.refcount++; + continue; + } + std::uint64_t token = 0; + constexpr int kMaxLockRetries = 20; + for (int attempt = 0; attempt < kMaxLockRetries; ++attempt) { + token = m_worker->SendRemoteLock(slot.owner, m_layer, + plans[slot.k].newHeadVID, true, 0); + if (token != 0) break; + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "Split: remote newHead VID %lld owner=%d bucket=%u lease busy (attempt %d)\n", + (std::int64_t)plans[slot.k].newHeadVID, slot.owner, slot.bucket, attempt + 1); + std::this_thread::sleep_for(std::chrono::milliseconds(3 * (attempt + 1))); + } + if (token == 0) { + return reenqueueAndExit("remote child lock"); + } + remoteTokens[key] = { token, 1, plans[slot.k].newHeadVID }; + } + } + + // Invariant: every child that needs a lock has one held. + // Failure paths in C.1/C.2 already early-returned via + // reenqueueAndExit, so reaching here means all required + // locks (local per-VID + remote per-(owner,bucket) lease) + // are acquired. Assert this explicitly for debug builds. + { + size_t expectedLocal = 0; + std::set> expectedRemoteBuckets; + std::set expectedLocalVids; + for (int k = 0; k < 2; ++k) { + if (!plans[k].active) continue; + if (plans[k].isSameHead) continue; + if (plans[k].isRemote) { + expectedRemoteBuckets.insert(std::make_pair(plans[k].ownerNode, + COMMON::FineGrainedRWLock::BucketIndex(static_cast(plans[k].newHeadVID)))); + } else if (plans[k].newHeadVID != headID) { + expectedLocalVids.insert(plans[k].newHeadVID); + } + } + expectedLocal = expectedLocalVids.size(); + assert(localChildLocks.size() == expectedLocal && + "Split Phase C invariant: local child locks count mismatch"); + assert(remoteTokens.size() == expectedRemoteBuckets.size() && + "Split Phase C invariant: remote lease count mismatch"); + (void)expectedLocal; // silence -Wunused in NDEBUG builds + } + + // === Phase D: execute per-child writes (all locks held) === + // Plan-1 best-effort semantics: an IO failure on k=0 after + // k=0 already wrote is accepted as-is; the WAL + watchdog + // converge. We never fall through from a failed remote + // fenced write to a wrong local db Put. + SizeType newHeadVID = -1; + for (int k : ks) { + if (!plans[k].active) continue; + + if (plans[k].isSameHead) { newHeadsID[k] = headID; newHeadsVec[k] = std::make_shared(headVec->c_str() + m_metaDataSize, m_vectorDataSize); newHeadVID = headID; theSameHead = true; - if (!hasHead) newPostingLists[k] += *headVec; - auto splitPutBegin = std::chrono::high_resolution_clock::now(); if ((ret=db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to override posting %lld\n", (std::int64_t)(newHeadVID)); + releaseRemoteTokens(); return ret; } CheckCentroid(newHeadVID, newPostingLists[k], "Split-SameHead"); @@ -1523,221 +1650,186 @@ namespace SPTAG::SPANN { m_stat.m_putCost += elapsedMSeconds; m_stat.m_theSameHeadNum++; m_stat.m_splitSameHeadCount.fetch_add(1, std::memory_order_relaxed); + continue; } - else { - newHeadVID = *((SizeType*)(postingP + args.clusterIdx[k] * m_vectorInfoSize)); - uint8_t version = *((uint8_t*)(postingP + args.clusterIdx[k] * m_vectorInfoSize + sizeof(SizeType))); - newHeadsID[k] = newHeadVID; - newHeadsVec[k] = std::make_shared((char *)(args.centers + k * args._D), m_vectorDataSize); + newHeadVID = plans[k].newHeadVID; + uint8_t version = plans[k].version; + newHeadsID[k] = newHeadVID; + newHeadsVec[k] = std::make_shared((char *)(args.centers + k * args._D), m_vectorDataSize); + + bool headExistsInIndex = m_headIndex->ContainSample(newHeadVID, m_layer + 1); + + if (plans[k].isRemote) { + // Remote-owned newHead: write posting via fenced + // RemoteAppend to the owner. Local BKT head index + // is still updated here for not-yet-known heads; + // peers learn via BroadcastHeadSync below. + auto leaseIt = remoteTokens.find(bucketKey(plans[k].ownerNode, newHeadVID)); + std::uint64_t token = (leaseIt != remoteTokens.end()) ? leaseIt->second.token : 0; + + std::uint64_t jobID = m_splitJobIdCounter.fetch_add(1) + 1; + if (m_splitWAL) { + Distributed::SplitWAL::Record r; + r.jobID = jobID; + r.srcHeadID = headID; + r.localChildHeadID = 0; + r.remoteChildHeadID = newHeadVID; + r.remoteOwnerNodeIndex = plans[k].ownerNode; + r.startTimestampSec = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + r.stage = Distributed::SplitWAL::Stage::Begin; + m_splitWAL->Write(r); + } - std::unique_lock anotherLock(m_rwLocks[newHeadVID], std::defer_lock); - if (m_rwLocks.hash_func(newHeadVID) != m_rwLocks.hash_func(headID)) - { - int retry = 0; - while (!anotherLock.try_lock() && retry < 20) - { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "Split: new head VID %lld is being locked. Wait for lock and do " - "merging after getting lock... (attempt %d)\n", - (std::int64_t)(newHeadVID), retry + 1); - retry++; - std::this_thread::sleep_for(std::chrono::milliseconds(3 * retry)); - } - if (!anotherLock.owns_lock()) - { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "Split: new head VID %lld is being locked after %d retries. Skip merging and return split failed...\n", - (std::int64_t)(newHeadVID), retry); - { - std::unique_lock tmplock(m_splitListLock); - m_splitList.unsafe_erase(headID); - } - SplitAsync(headID, postingList.size() / m_vectorInfoSize); - return ErrorCode::Success; + auto remoteHeadVec = std::make_shared( + (const char *)(args.centers + k * args._D), m_vectorDataSize); + ErrorCode ec = m_worker->SendFencedRemoteAppend( + plans[k].ownerNode, m_layer, newHeadVID, remoteHeadVec, + (int)(newPostingLists[k].size() / m_vectorInfoSize), + newPostingLists[k], token); + + if (ec == ErrorCode::Success) { + if (m_splitWAL) m_splitWAL->Clear(headID, jobID); + if (headExistsInIndex) { + m_stat.m_splitExistingHeadMergeCount.fetch_add(1, std::memory_order_relaxed); } + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "Split: fenced remote append failed for child %lld on node %d (ec=%d); WAL kept for GC\n", + (std::int64_t)newHeadVID, plans[k].ownerNode, (int)ec); } - if (m_headIndex->ContainSample(newHeadVID, m_layer + 1)) { - //SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Split: new head VID %lld already exists in head index. Do merging...\n", (std::int64_t)(newHeadVID)); - m_stat.m_splitExistingHeadMergeCount.fetch_add(1, std::memory_order_relaxed); - - // If newHeadVID's owner is a remote node, route - // the new posting via a fenced cross-owner write: - // acquire the remote lock, send a fenced - // RemoteAppend (sync), and let the owner merge - // it into the existing posting list. See - // TryWriteRemoteSplitChildFenced for the - // try-lock-both + WAL + fencing protocol. - if (IsRemoteOwnedHead(newHeadVID)) { - ErrorCode fec = TryWriteRemoteSplitChildFenced( - headID, newHeadVID, - args.centers + k * args._D, - (int)(newPostingLists[k].size() / m_vectorInfoSize), - newPostingLists[k]); - if (fec == ErrorCode::Success) { - if (m_rwLocks.hash_func(newHeadVID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); - continue; - } - // Fall through: on remote-lock contention - // or send failure, fall back to the legacy - // async TryRouteRemoteAppend so we don't - // strand the posting. Watchdog + WAL GC - // converge eventually. - if (TryRouteRemoteAppend( - newHeadVID, - (int)(newPostingLists[k].size() / m_vectorInfoSize), - newPostingLists[k], - args.centers + k * args._D)) { - if (m_rwLocks.hash_func(newHeadVID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); - continue; - } + // Release this child's remote lease as soon as the + // remote write is done (refcount-aware for the rare + // case both children share a bucket). + if (leaseIt != remoteTokens.end()) { + if (--leaseIt->second.refcount <= 0) { + m_worker->SendRemoteLock(plans[k].ownerNode, m_layer, + leaseIt->second.sampleVID, + false, leaseIt->second.token); + remoteTokens.erase(leaseIt); } + } - std::string mergedPostingList; - std::set vectorIdSet; - std::string currentPostingList; - { - if ((ret = db->Get(DBKey(newHeadVID), ¤tPostingList, MaxTimeout, - &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) - { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to get posting %lld\n", - (std::int64_t)(newHeadVID)); - return ret; - } + // For a new head we still need to register it in the + // local BKT so head-search can route to it; HeadSync + // below broadcasts to peers. + if (!headExistsInIndex) { + auto updateHeadBegin = std::chrono::high_resolution_clock::now(); + if ((ret = m_headIndex->AddHeadIndex(args.centers + k * args._D, newHeadVID, version, m_opt->m_dim, m_layer + 1, p_exWorkSpace)) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to update head index %lld\n", (std::int64_t)(newHeadVID)); + releaseRemoteTokens(); + return ret; } + splitNewHeadCount++; + m_stat.m_splitCreatedNewHeadCount.fetch_add(1, std::memory_order_relaxed); + auto updateHeadEnd = std::chrono::high_resolution_clock::now(); + elapsedMSeconds = std::chrono::duration_cast(updateHeadEnd - updateHeadBegin).count(); + m_stat.m_updateHeadCost += elapsedMSeconds; + } + continue; + } - auto *postingO = reinterpret_cast(newPostingLists[k].data()); - size_t postVectorNumO = newPostingLists[k].size() / m_vectorInfoSize; - int currentLength = 0; - bool hasHeadO = false; - for (int j = 0; j < postVectorNumO; j++, postingO += m_vectorInfoSize) - { - SizeType VID = *((SizeType *)(postingO)); - if (vectorIdSet.insert(VID).second) { - mergedPostingList += newPostingLists[k].substr(j * m_vectorInfoSize, m_vectorInfoSize); - currentLength++; - if (VID == newHeadVID) hasHeadO = true; - } - } + // Local-owned newHead path (lock already held in localChildLocks) + if (headExistsInIndex) { + m_stat.m_splitExistingHeadMergeCount.fetch_add(1, std::memory_order_relaxed); + + std::string mergedPostingList; + std::set vectorIdSet; + std::string currentPostingList; + if ((ret = db->Get(DBKey(newHeadVID), ¤tPostingList, MaxTimeout, + &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to get posting %lld\n", + (std::int64_t)(newHeadVID)); + releaseRemoteTokens(); + return ret; + } - if (!hasHeadO) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Split: after merging head VID %lld, the head vector is missing in posting list. Add head vector back to posting list.\n", (std::int64_t)(newHeadVID)); - vectorIdSet.insert(newHeadVID); - mergedPostingList = postingList.substr(args.clusterIdx[k] * m_vectorInfoSize, m_vectorInfoSize) + mergedPostingList; + auto *postingO = reinterpret_cast(newPostingLists[k].data()); + size_t postVectorNumO = newPostingLists[k].size() / m_vectorInfoSize; + int currentLength = 0; + bool hasHeadO = false; + for (int j = 0; j < (int)postVectorNumO; j++, postingO += m_vectorInfoSize) { + SizeType VID = *((SizeType *)(postingO)); + if (vectorIdSet.insert(VID).second) { + mergedPostingList += newPostingLists[k].substr(j * m_vectorInfoSize, m_vectorInfoSize); currentLength++; + if (VID == newHeadVID) hasHeadO = true; } + } - auto *postingK = reinterpret_cast(currentPostingList.data()); - size_t newPostVectorNum = currentPostingList.size() / m_vectorInfoSize; - for (int j = 0; j < newPostVectorNum; j++, postingK += m_vectorInfoSize) - { - SizeType VID = *((SizeType *)(postingK)); - uint8_t version = *(postingK + sizeof(SizeType)); - - if (m_versionMap->Deleted(VID) || m_versionMap->GetVersion(VID) != version) - continue; - - if (vectorIdSet.find(VID) != vectorIdSet.end()) - continue; + if (!hasHeadO) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Split: after merging head VID %lld, the head vector is missing in posting list. Add head vector back to posting list.\n", (std::int64_t)(newHeadVID)); + vectorIdSet.insert(newHeadVID); + mergedPostingList = postingList.substr(args.clusterIdx[k] * m_vectorInfoSize, m_vectorInfoSize) + mergedPostingList; + currentLength++; + } - vectorIdSet.insert(VID); - mergedPostingList += currentPostingList.substr(j * m_vectorInfoSize, m_vectorInfoSize); - currentLength++; - } + auto *postingK = reinterpret_cast(currentPostingList.data()); + size_t newPostVectorNum = currentPostingList.size() / m_vectorInfoSize; + for (int j = 0; j < (int)newPostVectorNum; j++, postingK += m_vectorInfoSize) { + SizeType VID = *((SizeType *)(postingK)); + uint8_t verK = *(postingK + sizeof(SizeType)); + if (m_versionMap->Deleted(VID) || m_versionMap->GetVersion(VID) != verK) continue; + if (vectorIdSet.find(VID) != vectorIdSet.end()) continue; + vectorIdSet.insert(VID); + mergedPostingList += currentPostingList.substr(j * m_vectorInfoSize, m_vectorInfoSize); + currentLength++; + } - if (currentLength > (m_postingSizeLimit + m_bufferSizeLimit) && m_opt->m_storage == Storage::FILEIO) - { - /* - SPTAGLIB_LOG( - Helper::LogLevel::LL_Warning, - "Split: merged posting list length %d exceeds hard limit %d after merging head " - "VID %lld. Cut to limit and put back to db.\n", - currentLength, m_postingSizeLimit + m_bufferSizeLimit, (std::int64_t)(newHeadVID)); - */ - mergedPostingList.resize((m_postingSizeLimit + m_bufferSizeLimit) * m_vectorInfoSize); - currentLength = m_postingSizeLimit + m_bufferSizeLimit; - } + if (currentLength > (m_postingSizeLimit + m_bufferSizeLimit) && m_opt->m_storage == Storage::FILEIO) { + mergedPostingList.resize((m_postingSizeLimit + m_bufferSizeLimit) * m_vectorInfoSize); + currentLength = m_postingSizeLimit + m_bufferSizeLimit; + } - auto splitPutBegin = std::chrono::high_resolution_clock::now(); - if ((ret = db->Put(DBKey(newHeadVID), mergedPostingList, MaxTimeout, - &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) - { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to put posting %lld\n", - (std::int64_t)(newHeadVID)); - return ret; - } - CheckCentroid(newHeadVID, mergedPostingList, "Split-MergePosting"); - auto splitPutEnd = std::chrono::high_resolution_clock::now(); - elapsedMSeconds = - std::chrono::duration_cast(splitPutEnd - splitPutBegin) - .count(); - m_stat.m_putCost += elapsedMSeconds; - - if (currentLength > m_postingSizeLimit) - { - m_stat.m_splitExistingHeadMergeResplitCount.fetch_add(1, std::memory_order_relaxed); - SplitAsync(newHeadVID, currentLength); - } - } else { - // If newHeadVID's owner is a remote node, do the - // fenced cross-owner write: try-lock-both + WAL - // + sync fenced RemoteAppend. We still add the - // head locally and rely on BroadcastHeadSync - // (after this loop) to spread the head index - // update to all nodes. The receiver's - // AppendCallback materializes the head if its - // HeadSync hasn't arrived yet. - bool remoteCreated = false; - if (IsRemoteOwnedHead(newHeadVID)) { - ErrorCode fec = TryWriteRemoteSplitChildFenced( - headID, newHeadVID, - args.centers + k * args._D, - (int)(newPostingLists[k].size() / m_vectorInfoSize), - newPostingLists[k]); - if (fec == ErrorCode::Success) { - remoteCreated = true; - } else { - // Fall back to async queue: WAL + - // watchdog converge eventually. - remoteCreated = TryRouteRemoteAppend( - newHeadVID, - (int)(newPostingLists[k].size() / m_vectorInfoSize), - newPostingLists[k], - args.centers + k * args._D); - } - } + auto splitPutBegin = std::chrono::high_resolution_clock::now(); + if ((ret = db->Put(DBKey(newHeadVID), mergedPostingList, MaxTimeout, + &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to put posting %lld\n", + (std::int64_t)(newHeadVID)); + releaseRemoteTokens(); + return ret; + } + CheckCentroid(newHeadVID, mergedPostingList, "Split-MergePosting"); + auto splitPutEnd = std::chrono::high_resolution_clock::now(); + elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); + m_stat.m_putCost += elapsedMSeconds; - if (!remoteCreated) { - auto splitPutBegin = std::chrono::high_resolution_clock::now(); - if ((ret=db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to add new posting %lld\n", (std::int64_t)(newHeadVID)); - return ret; - } - CheckCentroid(newHeadVID, newPostingLists[k], "Split-NewPosting"); - auto splitPutEnd = std::chrono::high_resolution_clock::now(); - elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); - m_stat.m_putCost += elapsedMSeconds; - } + if (currentLength > m_postingSizeLimit) { + m_stat.m_splitExistingHeadMergeResplitCount.fetch_add(1, std::memory_order_relaxed); + SplitAsync(newHeadVID, currentLength); + } + } else { + auto splitPutBegin = std::chrono::high_resolution_clock::now(); + if ((ret = db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to add new posting %lld\n", (std::int64_t)(newHeadVID)); + releaseRemoteTokens(); + return ret; + } + CheckCentroid(newHeadVID, newPostingLists[k], "Split-NewPosting"); + auto splitPutEnd = std::chrono::high_resolution_clock::now(); + elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); + m_stat.m_putCost += elapsedMSeconds; - auto updateHeadBegin = std::chrono::high_resolution_clock::now(); - if ((ret = m_headIndex->AddHeadIndex(args.centers + k * args._D, newHeadVID, version, m_opt->m_dim, m_layer + 1, p_exWorkSpace)) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to update head index %lld\n", (std::int64_t)(newHeadVID)); - if (db->Delete(DBKey(newHeadVID)) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete gc posting %lld\n", (std::int64_t)(newHeadVID)); - } - return ret; + auto updateHeadBegin = std::chrono::high_resolution_clock::now(); + if ((ret = m_headIndex->AddHeadIndex(args.centers + k * args._D, newHeadVID, version, m_opt->m_dim, m_layer + 1, p_exWorkSpace)) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to update head index %lld\n", (std::int64_t)(newHeadVID)); + if (db->Delete(DBKey(newHeadVID)) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete gc posting %lld\n", (std::int64_t)(newHeadVID)); } - splitNewHeadCount++; - m_stat.m_splitCreatedNewHeadCount.fetch_add(1, std::memory_order_relaxed); - auto updateHeadEnd = std::chrono::high_resolution_clock::now(); - elapsedMSeconds = std::chrono::duration_cast(updateHeadEnd - updateHeadBegin).count(); - m_stat.m_updateHeadCost += elapsedMSeconds; + releaseRemoteTokens(); + return ret; } - if (m_rwLocks.hash_func(newHeadVID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); + splitNewHeadCount++; + m_stat.m_splitCreatedNewHeadCount.fetch_add(1, std::memory_order_relaxed); + auto updateHeadEnd = std::chrono::high_resolution_clock::now(); + elapsedMSeconds = std::chrono::duration_cast(updateHeadEnd - updateHeadBegin).count(); + m_stat.m_updateHeadCost += elapsedMSeconds; } - //SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Head id: %d split into : %d, length: %d\n", headID, newHeadVID, args.counts[k]); } + if (!theSameHead) { m_headIndex->DeleteIndex(headID, m_layer + 1); if ((ret=db->Delete(DBKey(headID))) != ErrorCode::Success) @@ -1826,12 +1918,6 @@ namespace SPTAG::SPANN { ErrorCode MergePostings(ExtraWorkSpace *p_exWorkSpace, SizeType headID) { - // Ownership filtering is the single gate inside MergeAsync; by - // the time we get here the head is guaranteed local-owned. No - // re-check needed (hash ring is static once initialized, and - // only layer 0 routes anyway). - WaitForRemoteBucketUnlocked(headID); - std::unique_lock lock(m_rwLocks[headID]); if (!m_headIndex->ContainSample(headID, m_layer + 1)) { @@ -1852,12 +1938,7 @@ namespace SPTAG::SPANN { // Tracks the loser VID after a successful merge so we can // broadcast a HeadSync Delete entry to peers after releasing - // the per-head RWLock. Split mirrors this pattern at - // line ~1620 with both Add (new heads) and Delete (original - // head) entries. Without this broadcast, peers keep routing - // BatchAppend traffic to the deleted head -- the receiver's - // AppendCallback wasMissing branch would then resurrect a - // dead head, leaving a zombie until the next merge round. + // the per-head RWLock. SizeType deletedHeadVID = -1; std::string currentPostingList; @@ -1942,17 +2023,7 @@ namespace SPTAG::SPANN { { std::unique_lock anotherLock(m_rwLocks[queryResult->VID], std::defer_lock); - // RAII guard for the advisory remote bucket lock. - struct RemoteLockGuard { - WorkerNode* router = nullptr; - int nodeIndex = -1; - int layer = 0; - SizeType headID = -1; - bool active = false; - ~RemoteLockGuard() { if (active && router) router->SendRemoteLock(nodeIndex, layer, headID, false); } - void release() { active = false; } - } remoteLockGuard; - + RemoteLeaseGuard remoteLease; bool isRemoteCandidate = false; int remoteNodeIndex = -1; if (m_worker && m_worker->IsEnabled()) { @@ -1960,15 +2031,11 @@ namespace SPTAG::SPANN { if (!target.isLocal) { isRemoteCandidate = true; remoteNodeIndex = target.nodeIndex; - if (!m_worker->SendRemoteLock(remoteNodeIndex, m_layer, queryResult->VID, true)) { - // Remote owner busy; skip this candidate. + if (!remoteLease.acquire(m_worker, remoteNodeIndex, m_layer, queryResult->VID)) { + // Advisory remote lease busy; skip this + // candidate. continue; } - remoteLockGuard.router = m_worker; - remoteLockGuard.nodeIndex = remoteNodeIndex; - remoteLockGuard.layer = m_layer; - remoteLockGuard.headID = queryResult->VID; - remoteLockGuard.active = true; } } @@ -1992,13 +2059,19 @@ namespace SPTAG::SPANN { } if ((ret=db->Get(DBKey(queryResult->VID), &nextPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { - if (isRemoteCandidate) { - // Stale fetch on remote side; skip and let next round retry. + if (ret == ErrorCode::Key_NotFound) { + // Candidate posting no longer exists (raced with + // another split/merge). Skip and try the next + // neighbor regardless of locality. + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "MergePostings: candidate %lld not found (stale); skipping\n", + (std::int64_t)(queryResult->VID)); continue; } + // Real IO failure -- propagate, do not silently skip. SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "Fail to get to be merged posting: %lld, get size:%d\n", - (std::int64_t)(queryResult->VID), (int)(nextPostingList.size())); + "Fail to get to be merged posting: %lld, get size:%d (ec=%d)\n", + (std::int64_t)(queryResult->VID), (int)(nextPostingList.size()), (int)ret); PrintErrorInPosting(nextPostingList, queryResult->VID); return ret; } @@ -2105,13 +2178,8 @@ namespace SPTAG::SPANN { deletedLength = currentLength; } if (isRemoteCandidate) { - // Release advisory remote lock before reassign below. - if (remoteLockGuard.active) { - remoteLockGuard.router->SendRemoteLock( - remoteLockGuard.nodeIndex, remoteLockGuard.layer, - remoteLockGuard.headID, false); - remoteLockGuard.release(); - } + // Release advisory remote lease before reassign below. + remoteLease.release(); } else if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); } From 6d5a1b8c8b85e737f58e2019010d09a207e2cea0 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Sun, 24 May 2026 09:24:49 +0000 Subject: [PATCH 31/48] fix(distributed): bounded fenced-append retry, rollback, simplify Split lock acquisition Phase D: wrap SendFencedRemoteAppend in a bounded 3-attempt retry loop (10/20/40ms backoff with stale-token release + re-acquire on each retry). On exhaustion clear the SplitWAL record, walk the per-Split committed-child log in reverse to roll back partial progress (SameHead db->Put restore, LocalNew DeleteIndex+Delete, Remote local-BKT DeleteIndex, LocalExisting best-effort), then return ErrorCode::Fail so the caller (BatchAppend / AddIndex) sees the failure and can retry the entire op. srcHead is preserved: the trailing 'if (!theSameHead) DeleteIndex(headID)' is gated behind a Success return, so failures never strand the source cluster. Phase C: collapse the previous two-pass lock acquisition (local + remote with sort-by-VID / sort-by-(owner,bucket)) into a single pass over plans[]. The ascending-VID sort never actually prevented deadlock because srcHead is already held; deadlock-freedom comes from try_lock + reenqueueAndExit, which re-queues the Split via SplitAsync on contention. Both branches retain 20-attempt try-lock + 3*N ms backoff before bailing out. The post-C 'invariant' assertion block duplicating the same filter logic is dropped: the single-pass plans[] iteration makes it self-evidently correct. Phase D control flow: restructure the per-k loop with explicit if/else the local path matches the pre-distributed PR-target structure with the remote dispatch as a sibling branch. Verified: 2-node insert_dominant 1M+1M sustained 1263.7 vec/s (vs 1141.6/s in 15f17c9a) with recall\@5 = 0.986 (vs 0.984), zero segfaults, zero fencing-token rejections in the run. The 72 observed retry-exhaustion events were all TiKV gRPC Deadline Exceeded propagating through; caller- level retry handled them transparently. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 587 ++++++++++-------- 1 file changed, 336 insertions(+), 251 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 3fc2e639e..9abc7f382 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -1528,107 +1528,124 @@ namespace SPTAG::SPANN { return ErrorCode::Success; }; - // C.1 Local newHead locks (ascending VID order to avoid GlobalLock deadlock) + // C. Acquire newHead locks (one pass over plans[]). + // Local children: try_lock with up to 20 retries + 3*N ms backoff. + // Remote children: SendRemoteLock (receiver-side TryAcquire) + // with the same retry schedule; coalesce same-(owner,bucket) + // via remoteTokens so two children on one bucket share a lease. + // Any acquisition failure bails to reenqueueAndExit -- that is + // itself the retry mechanism (job re-queues via SplitAsync), + // which also breaks any potential lock cycle. Acquisition + // order is therefore irrelevant. { - std::vector localVids; + SizeType prevLocalVid = -1; for (int k = 0; k < 2; ++k) { - if (!plans[k].active || plans[k].isRemote || plans[k].isSameHead) continue; - if (plans[k].newHeadVID == headID) continue; - localVids.push_back(plans[k].newHeadVID); - } - std::sort(localVids.begin(), localVids.end()); - localVids.erase(std::unique(localVids.begin(), localVids.end()), localVids.end()); + const auto& p = plans[k]; + if (!p.active || p.isSameHead) continue; + + if (p.isRemote) { + unsigned bucket = COMMON::FineGrainedRWLock::BucketIndex( + static_cast(p.newHeadVID)); + auto key = std::make_pair(p.ownerNode, bucket); + auto it = remoteTokens.find(key); + if (it != remoteTokens.end()) { + // Same (owner,bucket) already leased by a prior + // child; reuse the token and bump refcount. + it->second.refcount++; + continue; + } + std::uint64_t token = 0; + for (int attempt = 0; attempt < 20; ++attempt) { + token = m_worker->SendRemoteLock(p.ownerNode, m_layer, + p.newHeadVID, true, 0); + if (token != 0) break; + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "Split: remote newHead VID %lld owner=%d bucket=%u lease busy (attempt %d)\n", + (std::int64_t)p.newHeadVID, p.ownerNode, bucket, attempt + 1); + std::this_thread::sleep_for(std::chrono::milliseconds(3 * (attempt + 1))); + } + if (token == 0) { + return reenqueueAndExit("remote child lock"); + } + remoteTokens[key] = { token, 1, p.newHeadVID }; + } else { + if (p.newHeadVID == headID) continue; // srcHead already held + if (p.newHeadVID == prevLocalVid) continue; // dedupe k=1 vs k=0 - for (SizeType vid : localVids) { - std::unique_lock ul(m_rwLocks[vid], std::defer_lock); - int rtry = 0; - while (!ul.try_lock() && rtry < 20) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "Split: local newHead VID %lld lock busy (attempt %d)\n", - (std::int64_t)vid, rtry + 1); - rtry++; - std::this_thread::sleep_for(std::chrono::milliseconds(3 * rtry)); - } - if (!ul.owns_lock()) { - return reenqueueAndExit("local child lock"); + std::unique_lock ul(m_rwLocks[p.newHeadVID], std::defer_lock); + int rtry = 0; + while (!ul.try_lock() && rtry < 20) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "Split: local newHead VID %lld lock busy (attempt %d)\n", + (std::int64_t)p.newHeadVID, rtry + 1); + rtry++; + std::this_thread::sleep_for(std::chrono::milliseconds(3 * rtry)); + } + if (!ul.owns_lock()) { + return reenqueueAndExit("local child lock"); + } + localChildLocks.push_back(std::move(ul)); + prevLocalVid = p.newHeadVID; } - localChildLocks.push_back(std::move(ul)); } } - // C.2 Remote newHead locks (ascending (ownerNode, bucket) order) - { - struct RemoteSlot { int k; int owner; unsigned bucket; }; - std::vector slots; - for (int k = 0; k < 2; ++k) { - if (!plans[k].active || !plans[k].isRemote) continue; - slots.push_back({k, plans[k].ownerNode, - COMMON::FineGrainedRWLock::BucketIndex(static_cast(plans[k].newHeadVID))}); - } - std::sort(slots.begin(), slots.end(), - [](const RemoteSlot& a, const RemoteSlot& b) { - return std::tie(a.owner, a.bucket) < std::tie(b.owner, b.bucket); - }); - for (auto& slot : slots) { - auto key = std::make_pair(slot.owner, slot.bucket); - auto it = remoteTokens.find(key); - if (it != remoteTokens.end()) { - // Same (ownerNode, bucket) as a previously-acquired - // child; the owner's per-bucket lease covers both - // children, so reuse the token and bump refcount. - it->second.refcount++; - continue; + + // === Phase D: execute per-child writes (all locks held) === + // On any unrecoverable failure we walk `committed` in + // reverse to undo the prior children of THIS Split and + // return ErrorCode::Fail so the caller (Append → AddIndex + // → BatchAppend) sees the failure and can retry from the + // top. srcHead is intentionally preserved: the trailing + // `if (!theSameHead) DeleteIndex(headID)` block is gated + // behind us returning Success. + struct CommittedChildRecord { + enum class Kind { SameHead, LocalNew, LocalExisting, Remote }; + Kind kind; + SizeType vid; + }; + std::vector committed; + auto rollbackCommitted = [&]() { + for (auto it = committed.rbegin(); it != committed.rend(); ++it) { + switch (it->kind) { + case CommittedChildRecord::Kind::SameHead: { + // Restore srcHead's pre-Split posting that we + // overwrote with cluster-k's subset. + auto rret = db->Put(DBKey(headID), postingList, + MaxTimeout, nullptr); + if (rret != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "Split rollback: failed to restore srcHead %lld posting (ec=%d); recall may drop until next Merge\n", + (std::int64_t)headID, (int)rret); + } + theSameHead = false; + break; } - std::uint64_t token = 0; - constexpr int kMaxLockRetries = 20; - for (int attempt = 0; attempt < kMaxLockRetries; ++attempt) { - token = m_worker->SendRemoteLock(slot.owner, m_layer, - plans[slot.k].newHeadVID, true, 0); - if (token != 0) break; + case CommittedChildRecord::Kind::LocalNew: + m_headIndex->DeleteIndex(it->vid, m_layer + 1); + (void)db->Delete(DBKey(it->vid)); + break; + case CommittedChildRecord::Kind::LocalExisting: + // The merged posting overwrote an existing head; + // we did not stash its prior contents so we + // cannot cheaply restore it. srcHead still + // holds the original vectors (we did not delete + // it), so a search dedupes the duplication via + // the version map. Best-effort. SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "Split: remote newHead VID %lld owner=%d bucket=%u lease busy (attempt %d)\n", - (std::int64_t)plans[slot.k].newHeadVID, slot.owner, slot.bucket, attempt + 1); - std::this_thread::sleep_for(std::chrono::milliseconds(3 * (attempt + 1))); - } - if (token == 0) { - return reenqueueAndExit("remote child lock"); - } - remoteTokens[key] = { token, 1, plans[slot.k].newHeadVID }; - } - } - - // Invariant: every child that needs a lock has one held. - // Failure paths in C.1/C.2 already early-returned via - // reenqueueAndExit, so reaching here means all required - // locks (local per-VID + remote per-(owner,bucket) lease) - // are acquired. Assert this explicitly for debug builds. - { - size_t expectedLocal = 0; - std::set> expectedRemoteBuckets; - std::set expectedLocalVids; - for (int k = 0; k < 2; ++k) { - if (!plans[k].active) continue; - if (plans[k].isSameHead) continue; - if (plans[k].isRemote) { - expectedRemoteBuckets.insert(std::make_pair(plans[k].ownerNode, - COMMON::FineGrainedRWLock::BucketIndex(static_cast(plans[k].newHeadVID)))); - } else if (plans[k].newHeadVID != headID) { - expectedLocalVids.insert(plans[k].newHeadVID); + "Split rollback: local-existing head %lld merged-posting NOT restored; duplication with srcHead %lld accepted\n", + (std::int64_t)it->vid, (std::int64_t)headID); + break; + case CommittedChildRecord::Kind::Remote: + m_headIndex->DeleteIndex(it->vid, m_layer + 1); + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "Split rollback: remote head %lld removed from local BKT; stale owner-side posting will be GC'd by next Merge round\n", + (std::int64_t)it->vid); + break; } } - expectedLocal = expectedLocalVids.size(); - assert(localChildLocks.size() == expectedLocal && - "Split Phase C invariant: local child locks count mismatch"); - assert(remoteTokens.size() == expectedRemoteBuckets.size() && - "Split Phase C invariant: remote lease count mismatch"); - (void)expectedLocal; // silence -Wunused in NDEBUG builds - } - - // === Phase D: execute per-child writes (all locks held) === - // Plan-1 best-effort semantics: an IO failure on k=0 after - // k=0 already wrote is accepted as-is; the WAL + watchdog - // converge. We never fall through from a failed remote - // fenced write to a wrong local db Put. + committed.clear(); + }; SizeType newHeadVID = -1; for (int k : ks) { if (!plans[k].active) continue; @@ -1641,6 +1658,7 @@ namespace SPTAG::SPANN { auto splitPutBegin = std::chrono::high_resolution_clock::now(); if ((ret=db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to override posting %lld\n", (std::int64_t)(newHeadVID)); + rollbackCommitted(); releaseRemoteTokens(); return ret; } @@ -1650,183 +1668,250 @@ namespace SPTAG::SPANN { m_stat.m_putCost += elapsedMSeconds; m_stat.m_theSameHeadNum++; m_stat.m_splitSameHeadCount.fetch_add(1, std::memory_order_relaxed); - continue; - } - - newHeadVID = plans[k].newHeadVID; - uint8_t version = plans[k].version; - newHeadsID[k] = newHeadVID; - newHeadsVec[k] = std::make_shared((char *)(args.centers + k * args._D), m_vectorDataSize); - - bool headExistsInIndex = m_headIndex->ContainSample(newHeadVID, m_layer + 1); - - if (plans[k].isRemote) { - // Remote-owned newHead: write posting via fenced - // RemoteAppend to the owner. Local BKT head index - // is still updated here for not-yet-known heads; - // peers learn via BroadcastHeadSync below. - auto leaseIt = remoteTokens.find(bucketKey(plans[k].ownerNode, newHeadVID)); - std::uint64_t token = (leaseIt != remoteTokens.end()) ? leaseIt->second.token : 0; - - std::uint64_t jobID = m_splitJobIdCounter.fetch_add(1) + 1; - if (m_splitWAL) { - Distributed::SplitWAL::Record r; - r.jobID = jobID; - r.srcHeadID = headID; - r.localChildHeadID = 0; - r.remoteChildHeadID = newHeadVID; - r.remoteOwnerNodeIndex = plans[k].ownerNode; - r.startTimestampSec = - std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()).count(); - r.stage = Distributed::SplitWAL::Stage::Begin; - m_splitWAL->Write(r); - } + committed.push_back({CommittedChildRecord::Kind::SameHead, newHeadVID}); + } else { + newHeadVID = plans[k].newHeadVID; + uint8_t version = plans[k].version; + newHeadsID[k] = newHeadVID; + newHeadsVec[k] = std::make_shared((char *)(args.centers + k * args._D), m_vectorDataSize); - auto remoteHeadVec = std::make_shared( - (const char *)(args.centers + k * args._D), m_vectorDataSize); - ErrorCode ec = m_worker->SendFencedRemoteAppend( - plans[k].ownerNode, m_layer, newHeadVID, remoteHeadVec, - (int)(newPostingLists[k].size() / m_vectorInfoSize), - newPostingLists[k], token); + bool headExistsInIndex = m_headIndex->ContainSample(newHeadVID, m_layer + 1); - if (ec == ErrorCode::Success) { - if (m_splitWAL) m_splitWAL->Clear(headID, jobID); + if (!plans[k].isRemote) { + // Local-owned newHead path (lock already held in localChildLocks) if (headExistsInIndex) { m_stat.m_splitExistingHeadMergeCount.fetch_add(1, std::memory_order_relaxed); - } - } else { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "Split: fenced remote append failed for child %lld on node %d (ec=%d); WAL kept for GC\n", - (std::int64_t)newHeadVID, plans[k].ownerNode, (int)ec); - } - // Release this child's remote lease as soon as the - // remote write is done (refcount-aware for the rare - // case both children share a bucket). - if (leaseIt != remoteTokens.end()) { - if (--leaseIt->second.refcount <= 0) { - m_worker->SendRemoteLock(plans[k].ownerNode, m_layer, - leaseIt->second.sampleVID, - false, leaseIt->second.token); - remoteTokens.erase(leaseIt); - } - } + std::string mergedPostingList; + std::set vectorIdSet; + std::string currentPostingList; + if ((ret = db->Get(DBKey(newHeadVID), ¤tPostingList, MaxTimeout, + &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to get posting %lld\n", + (std::int64_t)(newHeadVID)); + rollbackCommitted(); + releaseRemoteTokens(); + return ret; + } - // For a new head we still need to register it in the - // local BKT so head-search can route to it; HeadSync - // below broadcasts to peers. - if (!headExistsInIndex) { - auto updateHeadBegin = std::chrono::high_resolution_clock::now(); - if ((ret = m_headIndex->AddHeadIndex(args.centers + k * args._D, newHeadVID, version, m_opt->m_dim, m_layer + 1, p_exWorkSpace)) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to update head index %lld\n", (std::int64_t)(newHeadVID)); - releaseRemoteTokens(); - return ret; - } - splitNewHeadCount++; - m_stat.m_splitCreatedNewHeadCount.fetch_add(1, std::memory_order_relaxed); - auto updateHeadEnd = std::chrono::high_resolution_clock::now(); - elapsedMSeconds = std::chrono::duration_cast(updateHeadEnd - updateHeadBegin).count(); - m_stat.m_updateHeadCost += elapsedMSeconds; - } - continue; - } + auto *postingO = reinterpret_cast(newPostingLists[k].data()); + size_t postVectorNumO = newPostingLists[k].size() / m_vectorInfoSize; + int currentLength = 0; + bool hasHeadO = false; + for (int j = 0; j < (int)postVectorNumO; j++, postingO += m_vectorInfoSize) { + SizeType VID = *((SizeType *)(postingO)); + if (vectorIdSet.insert(VID).second) { + mergedPostingList += newPostingLists[k].substr(j * m_vectorInfoSize, m_vectorInfoSize); + currentLength++; + if (VID == newHeadVID) hasHeadO = true; + } + } - // Local-owned newHead path (lock already held in localChildLocks) - if (headExistsInIndex) { - m_stat.m_splitExistingHeadMergeCount.fetch_add(1, std::memory_order_relaxed); - - std::string mergedPostingList; - std::set vectorIdSet; - std::string currentPostingList; - if ((ret = db->Get(DBKey(newHeadVID), ¤tPostingList, MaxTimeout, - &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to get posting %lld\n", - (std::int64_t)(newHeadVID)); - releaseRemoteTokens(); - return ret; - } + if (!hasHeadO) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Split: after merging head VID %lld, the head vector is missing in posting list. Add head vector back to posting list.\n", (std::int64_t)(newHeadVID)); + vectorIdSet.insert(newHeadVID); + mergedPostingList = postingList.substr(args.clusterIdx[k] * m_vectorInfoSize, m_vectorInfoSize) + mergedPostingList; + currentLength++; + } - auto *postingO = reinterpret_cast(newPostingLists[k].data()); - size_t postVectorNumO = newPostingLists[k].size() / m_vectorInfoSize; - int currentLength = 0; - bool hasHeadO = false; - for (int j = 0; j < (int)postVectorNumO; j++, postingO += m_vectorInfoSize) { - SizeType VID = *((SizeType *)(postingO)); - if (vectorIdSet.insert(VID).second) { - mergedPostingList += newPostingLists[k].substr(j * m_vectorInfoSize, m_vectorInfoSize); - currentLength++; - if (VID == newHeadVID) hasHeadO = true; - } - } + auto *postingK = reinterpret_cast(currentPostingList.data()); + size_t newPostVectorNum = currentPostingList.size() / m_vectorInfoSize; + for (int j = 0; j < (int)newPostVectorNum; j++, postingK += m_vectorInfoSize) { + SizeType VID = *((SizeType *)(postingK)); + uint8_t verK = *(postingK + sizeof(SizeType)); + if (m_versionMap->Deleted(VID) || m_versionMap->GetVersion(VID) != verK) continue; + if (vectorIdSet.find(VID) != vectorIdSet.end()) continue; + vectorIdSet.insert(VID); + mergedPostingList += currentPostingList.substr(j * m_vectorInfoSize, m_vectorInfoSize); + currentLength++; + } - if (!hasHeadO) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Split: after merging head VID %lld, the head vector is missing in posting list. Add head vector back to posting list.\n", (std::int64_t)(newHeadVID)); - vectorIdSet.insert(newHeadVID); - mergedPostingList = postingList.substr(args.clusterIdx[k] * m_vectorInfoSize, m_vectorInfoSize) + mergedPostingList; - currentLength++; - } + if (currentLength > (m_postingSizeLimit + m_bufferSizeLimit) && m_opt->m_storage == Storage::FILEIO) { + /* + SPTAGLIB_LOG( + Helper::LogLevel::LL_Warning, + "Split: merged posting list length %d exceeds hard limit %d after merging head " + "VID %lld. Cut to limit and put back to db.\n", + currentLength, m_postingSizeLimit + m_bufferSizeLimit, (std::int64_t)(newHeadVID)); + */ + mergedPostingList.resize((m_postingSizeLimit + m_bufferSizeLimit) * m_vectorInfoSize); + currentLength = m_postingSizeLimit + m_bufferSizeLimit; + } - auto *postingK = reinterpret_cast(currentPostingList.data()); - size_t newPostVectorNum = currentPostingList.size() / m_vectorInfoSize; - for (int j = 0; j < (int)newPostVectorNum; j++, postingK += m_vectorInfoSize) { - SizeType VID = *((SizeType *)(postingK)); - uint8_t verK = *(postingK + sizeof(SizeType)); - if (m_versionMap->Deleted(VID) || m_versionMap->GetVersion(VID) != verK) continue; - if (vectorIdSet.find(VID) != vectorIdSet.end()) continue; - vectorIdSet.insert(VID); - mergedPostingList += currentPostingList.substr(j * m_vectorInfoSize, m_vectorInfoSize); - currentLength++; - } + auto splitPutBegin = std::chrono::high_resolution_clock::now(); + if ((ret = db->Put(DBKey(newHeadVID), mergedPostingList, MaxTimeout, + &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to put posting %lld\n", + (std::int64_t)(newHeadVID)); + rollbackCommitted(); + releaseRemoteTokens(); + return ret; + } + CheckCentroid(newHeadVID, mergedPostingList, "Split-MergePosting"); + auto splitPutEnd = std::chrono::high_resolution_clock::now(); + elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); + m_stat.m_putCost += elapsedMSeconds; - if (currentLength > (m_postingSizeLimit + m_bufferSizeLimit) && m_opt->m_storage == Storage::FILEIO) { - mergedPostingList.resize((m_postingSizeLimit + m_bufferSizeLimit) * m_vectorInfoSize); - currentLength = m_postingSizeLimit + m_bufferSizeLimit; - } + committed.push_back({CommittedChildRecord::Kind::LocalExisting, newHeadVID}); - auto splitPutBegin = std::chrono::high_resolution_clock::now(); - if ((ret = db->Put(DBKey(newHeadVID), mergedPostingList, MaxTimeout, - &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to put posting %lld\n", - (std::int64_t)(newHeadVID)); - releaseRemoteTokens(); - return ret; - } - CheckCentroid(newHeadVID, mergedPostingList, "Split-MergePosting"); - auto splitPutEnd = std::chrono::high_resolution_clock::now(); - elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); - m_stat.m_putCost += elapsedMSeconds; + if (currentLength > m_postingSizeLimit) { + m_stat.m_splitExistingHeadMergeResplitCount.fetch_add(1, std::memory_order_relaxed); + SplitAsync(newHeadVID, currentLength); + } + } else { + auto splitPutBegin = std::chrono::high_resolution_clock::now(); + if ((ret = db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to add new posting %lld\n", (std::int64_t)(newHeadVID)); + rollbackCommitted(); + releaseRemoteTokens(); + return ret; + } + CheckCentroid(newHeadVID, newPostingLists[k], "Split-NewPosting"); + auto splitPutEnd = std::chrono::high_resolution_clock::now(); + elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); + m_stat.m_putCost += elapsedMSeconds; + + auto updateHeadBegin = std::chrono::high_resolution_clock::now(); + if ((ret = m_headIndex->AddHeadIndex(args.centers + k * args._D, newHeadVID, version, m_opt->m_dim, m_layer + 1, p_exWorkSpace)) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to update head index %lld\n", (std::int64_t)(newHeadVID)); + if (db->Delete(DBKey(newHeadVID)) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete gc posting %lld\n", (std::int64_t)(newHeadVID)); + } + rollbackCommitted(); + releaseRemoteTokens(); + return ret; + } + splitNewHeadCount++; + m_stat.m_splitCreatedNewHeadCount.fetch_add(1, std::memory_order_relaxed); + auto updateHeadEnd = std::chrono::high_resolution_clock::now(); + elapsedMSeconds = std::chrono::duration_cast(updateHeadEnd - updateHeadBegin).count(); + m_stat.m_updateHeadCost += elapsedMSeconds; - if (currentLength > m_postingSizeLimit) { - m_stat.m_splitExistingHeadMergeResplitCount.fetch_add(1, std::memory_order_relaxed); - SplitAsync(newHeadVID, currentLength); - } - } else { - auto splitPutBegin = std::chrono::high_resolution_clock::now(); - if ((ret = db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to add new posting %lld\n", (std::int64_t)(newHeadVID)); - releaseRemoteTokens(); - return ret; - } - CheckCentroid(newHeadVID, newPostingLists[k], "Split-NewPosting"); - auto splitPutEnd = std::chrono::high_resolution_clock::now(); - elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); - m_stat.m_putCost += elapsedMSeconds; + committed.push_back({CommittedChildRecord::Kind::LocalNew, newHeadVID}); + } + } else { + // Remote-owned newHead: write posting via fenced + // RemoteAppend to the owner. Local BKT head index + // is still updated here for not-yet-known heads; + // peers learn via BroadcastHeadSync below. + auto leaseIt = remoteTokens.find(bucketKey(plans[k].ownerNode, newHeadVID)); + std::uint64_t token = (leaseIt != remoteTokens.end()) ? leaseIt->second.token : 0; + + std::uint64_t jobID = m_splitJobIdCounter.fetch_add(1) + 1; + if (m_splitWAL) { + Distributed::SplitWAL::Record r; + r.jobID = jobID; + r.srcHeadID = headID; + r.localChildHeadID = 0; + r.remoteChildHeadID = newHeadVID; + r.remoteOwnerNodeIndex = plans[k].ownerNode; + r.startTimestampSec = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + r.stage = Distributed::SplitWAL::Stage::Begin; + m_splitWAL->Write(r); + } - auto updateHeadBegin = std::chrono::high_resolution_clock::now(); - if ((ret = m_headIndex->AddHeadIndex(args.centers + k * args._D, newHeadVID, version, m_opt->m_dim, m_layer + 1, p_exWorkSpace)) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to update head index %lld\n", (std::int64_t)(newHeadVID)); - if (db->Delete(DBKey(newHeadVID)) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete gc posting %lld\n", (std::int64_t)(newHeadVID)); + auto remoteHeadVec = std::make_shared( + (const char *)(args.centers + k * args._D), m_vectorDataSize); + + // Bounded retry: a fencing-token rejection means the + // owner's lease TTL expired between our acquire and + // our send (rare; lease TTL is 30 s). Release the + // stale token, re-acquire, and resend. After 3 + // attempts (10/20/40 ms backoff) we surface the + // failure to the caller so they can retry the + // whole AddIndex op at the user level instead of + // silently dropping the cluster vectors. + constexpr int kFenceRetries = 3; + ErrorCode ec = ErrorCode::Fail; + for (int attempt = 0; attempt < kFenceRetries; ++attempt) { + if (attempt > 0) { + std::this_thread::sleep_for( + std::chrono::milliseconds(10 << (attempt - 1))); + // Release the stale lease (best-effort: + // the owner may have auto-released it via + // TTL already, in which case this no-ops). + if (leaseIt != remoteTokens.end()) { + m_worker->SendRemoteLock( + plans[k].ownerNode, m_layer, + leaseIt->second.sampleVID, + false, leaseIt->second.token); + leaseIt->second.token = 0; + } + std::uint64_t newTok = m_worker->SendRemoteLock( + plans[k].ownerNode, m_layer, + plans[k].newHeadVID, true, 0); + if (newTok == 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "Split: fenced retry %d/%d cannot re-acquire lease for child %lld on node %d\n", + attempt + 1, kFenceRetries, + (std::int64_t)newHeadVID, plans[k].ownerNode); + continue; + } + token = newTok; + if (leaseIt != remoteTokens.end()) { + leaseIt->second.token = newTok; + } + } + ec = m_worker->SendFencedRemoteAppend( + plans[k].ownerNode, m_layer, newHeadVID, remoteHeadVec, + (int)(newPostingLists[k].size() / m_vectorInfoSize), + newPostingLists[k], token); + if (ec == ErrorCode::Success) break; + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "Split: fenced remote append attempt %d/%d failed for child %lld on node %d (ec=%d)\n", + attempt + 1, kFenceRetries, + (std::int64_t)newHeadVID, plans[k].ownerNode, (int)ec); } - releaseRemoteTokens(); - return ret; + + if (ec == ErrorCode::Success) { + if (m_splitWAL) m_splitWAL->Clear(headID, jobID); + if (headExistsInIndex) { + m_stat.m_splitExistingHeadMergeCount.fetch_add(1, std::memory_order_relaxed); + } + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "Split: fenced remote append exhausted %d retries for child %lld on node %d; rolling back srcHead %lld and returning Fail\n", + kFenceRetries, (std::int64_t)newHeadVID, + plans[k].ownerNode, (std::int64_t)headID); + if (m_splitWAL) m_splitWAL->Clear(headID, jobID); + rollbackCommitted(); + releaseRemoteTokens(); + return ErrorCode::Fail; + } + + // Release this child's remote lease as soon as the + // remote write is done (refcount-aware for the rare + // case both children share a bucket). + if (leaseIt != remoteTokens.end()) { + if (--leaseIt->second.refcount <= 0) { + m_worker->SendRemoteLock(plans[k].ownerNode, m_layer, + leaseIt->second.sampleVID, + false, leaseIt->second.token); + remoteTokens.erase(leaseIt); + } + } + + // For a new head we still need to register it in the + // local BKT so head-search can route to it; HeadSync + // below broadcasts to peers. + if (!headExistsInIndex) { + auto updateHeadBegin = std::chrono::high_resolution_clock::now(); + if ((ret = m_headIndex->AddHeadIndex(args.centers + k * args._D, newHeadVID, version, m_opt->m_dim, m_layer + 1, p_exWorkSpace)) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to update head index %lld\n", (std::int64_t)(newHeadVID)); + rollbackCommitted(); + releaseRemoteTokens(); + return ret; + } + splitNewHeadCount++; + m_stat.m_splitCreatedNewHeadCount.fetch_add(1, std::memory_order_relaxed); + auto updateHeadEnd = std::chrono::high_resolution_clock::now(); + elapsedMSeconds = std::chrono::duration_cast(updateHeadEnd - updateHeadBegin).count(); + m_stat.m_updateHeadCost += elapsedMSeconds; + } + committed.push_back({CommittedChildRecord::Kind::Remote, newHeadVID}); } - splitNewHeadCount++; - m_stat.m_splitCreatedNewHeadCount.fetch_add(1, std::memory_order_relaxed); - auto updateHeadEnd = std::chrono::high_resolution_clock::now(); - elapsedMSeconds = std::chrono::duration_cast(updateHeadEnd - updateHeadBegin).count(); - m_stat.m_updateHeadCost += elapsedMSeconds; } } From 74a5a8ca3da8d50f3b5ce1c95b01f9dad125c322 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Sun, 24 May 2026 12:42:55 +0000 Subject: [PATCH 32/48] refactor(distributed): explicit distributed gate + cleanup hot-path branching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ExtraDynamicSearcher.h: - MergePostings candidate branch: single GetOwner call, explicit if/else on isRemoteCandidate (lease-acquire vs try_lock + ContainSample) - Unified re-queue: local lock-busy AND remote lease-busy both re-queue via reenqueueMerge() lambda (counter discipline preserved) - 4 ec=%d log sites switched to ec=%s via Helper::Convert::ConvertToString - MergePostings loser-delete: collapse local/remote into single db->Delete + BKT::DeleteIndex with location=local|nodeN log - Restore bool urgent=false parameter on AppendAsync/ReassignAsync (3 callsites pass true: CollectReAssign batch fallback, Append HeadMiss, BatchAppend HeadMiss); restore addfront dispatch - Append() prologue: keep separate empty-posting drop + appendNum==0 log (do not collapse into single early-return) - Add FindSelfEntryVectorBytes helper for posting self-entry scan - Delete TryRouteRemoteAppend wrapper; Append/BatchAppend/Reassign use explicit 'if (m_worker && IsEnabled()) { if IsRemoteOwnedHead { Enqueue+return } else { WaitForRemoteBucketUnlocked } }' pattern - BatchAppend now calls WaitForRemoteBucketUnlocked for parity with Append on the local-owned branch - BatchAppend routing counters only increment in distributed mode - Reassign loop: flat 'isRemote = (m_worker && IsEnabled && IsRemoteOwnedHead)' + if/else for clean two-way branch - BuildIndex zero-replica refill: move WireJobSubmitterIfReady inside the pool-init if, consistent with LoadIndex pattern Index.h: - Remove unused m_sharedSplitPool slot mechanism (m_sharedSplitPool, m_sharedSplitPoolMutex, Get/SetSharedSplitPool). WorkerNode receiver shares the layer-0 pool via the SetJobSubmitter lambda closure; the per-Index slot was dead code in all observed flows. Verified: 1M+1M insert_dominant 2-node — insert throughput 1226.7 vec/s, recall@5 0.984/0.980 pre/post-insert; within run-to-run variance of the 6d5a1b8c baseline (1263.7 vec/s, 0.986). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 400 ++++++++---------- AnnService/inc/Core/SPANN/Index.h | 17 - 2 files changed, 166 insertions(+), 251 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 9abc7f382..c344e820c 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -317,8 +317,8 @@ namespace SPTAG::SPANN { // Routing counters for local AddIndex calls so we can verify // GetOwner is partitioning work evenly. Incremented in - // BatchAppend()/Append() based on whether TryRouteRemoteAppend - // shipped the head to a peer or it stayed local. + // BatchAppend()/Append() based on whether IsRemoteOwnedHead + // routed the head to a peer or it stayed local. std::atomic_size_t m_routedLocalHeads{ 0 }; std::atomic_size_t m_routedRemoteHeads{ 0 }; std::atomic_size_t m_routedLocalItems{ 0 }; @@ -853,18 +853,24 @@ namespace SPTAG::SPANN { return true; } - // If headID is owned by a remote node, queue the append for that - // node and return true; otherwise return false (caller continues - // with local write logic). - bool TryRouteRemoteAppend(SizeType headID, - int appendNum, - std::string posting, - const void* headVecBytes = nullptr) { - int ownerNode = -1; - if (!IsRemoteOwnedHead(headID, &ownerNode)) return false; - EnqueueRemoteAppend(ownerNode, headID, appendNum, - std::move(posting), headVecBytes); - return true; + // Scan a posting buffer for an entry whose VID matches headID + // (the head's own self-entry). Returns a pointer into the buffer + // at the start of the vector bytes (skipping VID + version + + // padding), or nullptr if no self-entry is present. Used by + // remote-append callers so the receiver can materialize a missing + // head index without waiting for BroadcastHeadSync. + const void* FindSelfEntryVectorBytes(SizeType headID, + const std::string& posting, + int recCount) const { + const uint8_t* basePtr = + reinterpret_cast(posting.data()); + for (int i = 0; i < recCount; ++i) { + const uint8_t* p = basePtr + i * m_vectorInfoSize; + if (*reinterpret_cast(p) == headID) { + return p + m_metaDataSize; + } + } + return nullptr; } // Synchronous, fenced cross-owner write used by the Split path. @@ -954,8 +960,9 @@ namespace SPTAG::SPANN { } else { SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Split: fenced remote append failed for child %lld " - "on node %d (ec=%d); WAL kept for GC\n", - (std::int64_t)remoteChildHeadID, ownerNode, (int)ec); + "on node %d (ec=%s); WAL kept for GC\n", + (std::int64_t)remoteChildHeadID, ownerNode, + Helper::Convert::ConvertToString(ec).c_str()); } return ec; } @@ -1615,8 +1622,9 @@ namespace SPTAG::SPANN { MaxTimeout, nullptr); if (rret != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "Split rollback: failed to restore srcHead %lld posting (ec=%d); recall may drop until next Merge\n", - (std::int64_t)headID, (int)rret); + "Split rollback: failed to restore srcHead %lld posting (ec=%s); recall may drop until next Merge\n", + (std::int64_t)headID, + Helper::Convert::ConvertToString(rret).c_str()); } theSameHead = false; break; @@ -1860,9 +1868,10 @@ namespace SPTAG::SPANN { newPostingLists[k], token); if (ec == ErrorCode::Success) break; SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "Split: fenced remote append attempt %d/%d failed for child %lld on node %d (ec=%d)\n", + "Split: fenced remote append attempt %d/%d failed for child %lld on node %d (ec=%s)\n", attempt + 1, kFenceRetries, - (std::int64_t)newHeadVID, plans[k].ownerNode, (int)ec); + (std::int64_t)newHeadVID, plans[k].ownerNode, + Helper::Convert::ConvertToString(ec).c_str()); } if (ec == ErrorCode::Success) { @@ -2093,6 +2102,22 @@ namespace SPTAG::SPANN { m_headIndex->SearchHeadIndex(queryResults, m_layer + 1, p_exWorkSpace); std::string nextPostingList; + // Re-queue this Merge job and exit cleanly. Counts as a new + // submission so MergeAsyncJob::exec()'s m_mergeJobsInFlight-- / + // m_totalMergeCompleted++ stays balanced -- without these + // increments m_mergeJobsInFlight underflows to a huge uint64 + // and m_totalMergeCompleted exceeds m_totalMergeSubmitted. + auto reenqueueMerge = [&](const char* reason) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "MergePostings: re-queueing headID=%lld (%s)\n", + (std::int64_t)headID, reason); + auto* curJob = new MergeAsyncJob(this, headID, nullptr); + m_mergeJobsInFlight++; + m_totalMergeSubmitted++; + m_splitThreadPool->add(curJob); + return ErrorCode::Success; + }; + for (int i = 1; i < queryResults.GetResultNum(); ++i) { BasicResult* queryResult = queryResults.GetResult(i); @@ -2106,38 +2131,25 @@ namespace SPTAG::SPANN { std::set nextVectorIdSet; int deletedLength = 0; { + RemoteLeaseGuard remoteLease; std::unique_lock anotherLock(m_rwLocks[queryResult->VID], std::defer_lock); - RemoteLeaseGuard remoteLease; bool isRemoteCandidate = false; int remoteNodeIndex = -1; if (m_worker && m_worker->IsEnabled()) { auto target = m_worker->GetOwner(queryResult->VID); - if (!target.isLocal) { - isRemoteCandidate = true; - remoteNodeIndex = target.nodeIndex; - if (!remoteLease.acquire(m_worker, remoteNodeIndex, m_layer, queryResult->VID)) { - // Advisory remote lease busy; skip this - // candidate. - continue; - } - } + isRemoteCandidate = !target.isLocal; + remoteNodeIndex = target.nodeIndex; } - if (!isRemoteCandidate) { - // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Locked: %d, to be lock: %d\n", headID, queryResult->VID); + if (isRemoteCandidate) { + if (!remoteLease.acquire(m_worker, remoteNodeIndex, m_layer, queryResult->VID)) { + return reenqueueMerge("remote lease busy"); + } + } else { if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) { if (!anotherLock.try_lock()) { - auto* curJob = new MergeAsyncJob(this, headID, nullptr); - // Re-queue counts as a new submission; matched by the - // m_mergeJobsInFlight-- / m_totalMergeCompleted++ in - // MergeAsyncJob::exec(). Without these increments - // m_mergeJobsInFlight underflows to a huge uint64 - // and m_totalMergeCompleted exceeds m_totalMergeSubmitted. - m_mergeJobsInFlight++; - m_totalMergeSubmitted++; - m_splitThreadPool->add(curJob); - return ErrorCode::Success; + return reenqueueMerge("local lock busy"); } } if (!m_headIndex->ContainSample(queryResult->VID, m_layer + 1)) continue; @@ -2155,8 +2167,9 @@ namespace SPTAG::SPANN { } // Real IO failure -- propagate, do not silently skip. SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "Fail to get to be merged posting: %lld, get size:%d (ec=%d)\n", - (std::int64_t)(queryResult->VID), (int)(nextPostingList.size()), (int)ret); + "Fail to get to be merged posting: %lld, get size:%d (ec=%s)\n", + (std::int64_t)(queryResult->VID), (int)(nextPostingList.size()), + Helper::Convert::ConvertToString(ret).c_str()); PrintErrorInPosting(nextPostingList, queryResult->VID); return ret; } @@ -2178,14 +2191,6 @@ namespace SPTAG::SPANN { nextLength++; } if (resultVec == nullptr) { - if (isRemoteCandidate) { - // Stale fetch / version skew on remote side. Skip - // and let the next merge round retry. - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "MergePostings: remote candidate %lld has no head record in fetched posting, skipping\n", - (std::int64_t)(queryResult->VID)); - continue; - } SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MergePostings fail: cannot find another head vector in posting! headID:%lld\n", (std::int64_t)(queryResult->VID)); return ErrorCode::Fail; } @@ -2201,25 +2206,19 @@ namespace SPTAG::SPANN { return ret; } CheckCentroid(headID, mergedPostingList, "MergePostings-currentLength >= nextLength"); - if (isRemoteCandidate) { - // Survivor is local; delete remote loser first - // (so we don't have duplicate VID across nodes), - // then drop local head-index entry. - if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success - && ret != ErrorCode::Key_NotFound) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "MergePostings: remote-loser Delete(%lld) failed; survivor %lld is durable\n", - (std::int64_t)queryResult->VID, (std::int64_t)headID); - return ret; - } - m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1); - } else { - m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1); - if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success) - { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(queryResult->VID)); - return ret; - } + m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1); + if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success) + { + std::string location = isRemoteCandidate + ? ("node" + std::to_string(remoteNodeIndex)) + : std::string("local"); + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "MergePostings: failed to delete old posting %lld in Merge (ec=%s), location=%s; survivor %lld is durable\n", + (std::int64_t)queryResult->VID, + Helper::Convert::ConvertToString(ret).c_str(), + location.c_str(), + (std::int64_t)headID); + return ret; } deletedHeadVID = queryResult->VID; nextHeadID = headID; @@ -2233,12 +2232,6 @@ namespace SPTAG::SPANN { mergedPostingList += *resultVec; } if ((ret=db->Put(DBKey(queryResult->VID), mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { - if (isRemoteCandidate) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "MergePostings: remote-survivor Put(%lld) failed; no state mutated, next round will retry\n", - (std::int64_t)queryResult->VID); - return ret; - } SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MergePostings fail to override posting %lld after merge\n", (std::int64_t)(queryResult->VID)); return ret; } @@ -2246,12 +2239,6 @@ namespace SPTAG::SPANN { m_headIndex->DeleteIndex(headID, m_layer + 1); if ((ret = db->Delete(DBKey(headID))) != ErrorCode::Success) { - if (isRemoteCandidate) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "MergePostings: local-loser Delete(%lld) failed; remote survivor %lld is durable\n", - (std::int64_t)headID, (std::int64_t)queryResult->VID); - return ret; - } SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(headID)); return ret; } @@ -2345,11 +2332,14 @@ namespace SPTAG::SPANN { inline void SplitAsync(SizeType headID, int postingSize, std::function p_callback = nullptr) { - // Single authoritative ownership gate. Sources of remote-owned - // headIDs that legitimately reach here: RefineIndex full scan, - // Search→MergeAsync via search result, Split-internal re-enqueue - // for new-head VIDs, MergePostings re-merge of survivor. Drop - // them so the owner runs its own structural pass. + // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Into SplitAsync, current headID: %d, size: %d\n", headID, m_postingSizes.GetSize(headID)); + // tbb::concurrent_hash_map::const_accessor headIDAccessor; + // if (m_splitList.find(headIDAccessor, headID)) { + // return; + // } + // tbb::concurrent_hash_map::value_type workPair(headID, headID); + // m_splitList.insert(workPair); + // Single authoritative ownership gate. if (IsRemoteOwnedHead(headID)) return; { Helper::Concurrent::ConcurrentMap::value_type workPair(headID, postingSize); @@ -2371,11 +2361,7 @@ namespace SPTAG::SPANN { inline void MergeAsync(SizeType headID, std::function p_callback = nullptr) { - // Single authoritative ownership gate. Sources of remote-owned - // headIDs that legitimately reach here: RefineIndex full scan, - // Search→MergeAsync via search result, MergePostings re-merge of - // survivor (nextHeadID). Drop them so the owner runs its own - // merge pass. + // Single authoritative ownership gate. if (IsRemoteOwnedHead(headID)) return; { std::shared_lock tmplock(m_mergeListLock); @@ -2393,20 +2379,28 @@ namespace SPTAG::SPANN { m_splitThreadPool->add(curJob); } - inline void AppendAsync(SizeType headID, std::shared_ptr postingList, std::function p_callback = nullptr) + inline void AppendAsync(SizeType headID, std::shared_ptr postingList, bool urgent = false,std::function p_callback = nullptr) { auto* curJob = new AppendAsyncJob(this, headID, std::move(postingList), p_callback); m_appendJobsInFlight++; m_totalAppendSubmitted++; - m_splitThreadPool->add(curJob); + if (urgent) { + m_splitThreadPool->addfront(curJob); + } else { + m_splitThreadPool->add(curJob); + } } - inline void ReassignAsync(std::shared_ptr vectorInfo, SizeType headPrev, std::function p_callback = nullptr) + inline void ReassignAsync(std::shared_ptr vectorInfo, SizeType headPrev, bool urgent = false, std::function p_callback = nullptr) { auto* curJob = new ReassignAsyncJob(this, std::move(vectorInfo), headPrev, p_callback); m_reassignJobsInFlight++; m_totalReassignSubmitted++; - m_splitThreadPool->add(curJob); + if (urgent) { + m_splitThreadPool->addfront(curJob); + } else { + m_splitThreadPool->add(curJob); + } } ErrorCode CollectReAssign(ExtraWorkSpace *p_exWorkSpace, SizeType headID, std::shared_ptr headVec, @@ -2573,7 +2567,7 @@ namespace SPTAG::SPANN { if (m_opt->m_storage == Storage::TIKVIO) ret = BatchAppend(p_exWorkSpace, batchReassign, "CollectReAssign"); else { for (auto& kv : batchReassign) { - AppendAsync(kv.first, std::make_shared(kv.second)); + AppendAsync(kv.first, std::make_shared(kv.second), true); } } if (batchReassignCount > 0) { @@ -2640,53 +2634,40 @@ namespace SPTAG::SPANN { ErrorCode Append(ExtraWorkSpace* p_exWorkSpace, SizeType headID, int appendNum, std::string& appendPosting, int reassignThreshold = 0) { auto appendBegin = std::chrono::high_resolution_clock::now(); - if (appendPosting.empty() || appendNum == 0) { - // Defensive: drop empty/zero-count appends rather than letting - // them reach the storage layer (which would log - // "TiKVIO::Merge: empty append posting!" and fail). Empty - // payloads should never be produced by normal flow, but they - // can arise from buggy sender-side retries that resend - // already-consumed (moved-from) items. - if (appendPosting.empty() && appendNum != 0) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "Append: dropping empty posting for headID=%lld appendNum=%d\n", - (std::int64_t)headID, appendNum); - } - return ErrorCode::Success; + if (appendPosting.empty()) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Error! empty append posting!\n"); } - // If this head is owned by a remote node, route the append via - // QueueRemoteAppend instead of touching local TiKV. appendNum is - // captured BEFORE std::move(appendPosting) to avoid use-after-move. - // If the batch carries the head's own self-entry (VID == headID), - // forward its vector bytes so the receiver can materialize the - // head index before the BroadcastHeadSync arrives. See the - // matching scan in BatchAppend() for rationale. - { - const uint8_t* basePtr = - reinterpret_cast(appendPosting.data()); - const void* headVecBytes = nullptr; - for (int i = 0; i < appendNum; ++i) { - const uint8_t* p = basePtr + i * m_vectorInfoSize; - SizeType vid = *reinterpret_cast(p); - if (vid == headID) { - headVecBytes = p + m_metaDataSize; - break; - } - } - if (TryRouteRemoteAppend(headID, appendNum, appendPosting, headVecBytes)) { + if (appendNum == 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Error!, headID :%lld, appendNum:%d\n", (std::int64_t)headID, appendNum); + } + + // Distributed routing gate. + if (m_worker && m_worker->IsEnabled()) { + int ownerNode = -1; + if (IsRemoteOwnedHead(headID, &ownerNode)) { + // Remote-owned head: pack + enqueue for that node. + // Scan posting for self-entry so the receiver can + // materialize a missing head index without waiting + // for BroadcastHeadSync. + const void* headVecBytes = FindSelfEntryVectorBytes( + headID, appendPosting, appendNum); + EnqueueRemoteAppend(ownerNode, headID, appendNum, + std::move(appendPosting), headVecBytes); if (!reassignThreshold) { m_totalAppendCount++; m_stat.m_appendTaskNum++; } return ErrorCode::Success; + } else { + // Local-owned head: wait out any in-flight remote + // initiator that holds an advisory fenced-lease on our + // bucket (e.g. another node mid-Split) before we acquire + // the per-head lock and write. + WaitForRemoteBucketUnlocked(headID); } } - // If a remote initiator is currently holding the advisory lock - // on this bucket, wait it out before we touch the posting. - WaitForRemoteBucketUnlocked(headID); - checkDeleted: if (!m_headIndex->ContainSample(headID, m_layer + 1)) { for (int i = 0; i < appendNum; i++) @@ -2698,7 +2679,7 @@ namespace SPTAG::SPANN { if (m_versionMap->GetVersion(VID) == version) { // SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Head Miss To ReAssign: VID: %d, current version: %d\n", *(int*)(&appendPosting[idx]), version); m_stat.m_headMiss++; - ReassignAsync(vectorInfo, headID); + ReassignAsync(vectorInfo, headID, true); } // SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Head Miss Do Not To ReAssign: VID: %d, version: %d, current version: %d\n", *(int*)(&appendPosting[idx]), m_versionMap->GetVersion(*(int*)(&appendPosting[idx])), version); } @@ -2818,47 +2799,24 @@ namespace SPTAG::SPANN { auto appendIt = headAppends.find(headID); if (appendIt == headAppends.end()) continue; - // Owner gate: forward heads owned by a remote node via the - // batched RemoteAppend queue. Local heads fall through to - // the standard MultiMerge path below. Without this hook, - // every node writes to every head's TiKV key and the owner - // ring is ignored (no remote RPC, no route stats). - // - // Pass headVecBytes when this batch carries the head's own - // self-entry (VID == headID). During Build-time seed the - // receiver may not yet have the head index entry; without - // headVecBytes its AppendCallback can't materialize the head - // and falls into the ReassignAsync redirect path, dropping - // the self-entry from the posting and later causing - // "MergePostings fail: cannot find head vector in posting!". - { - const std::string& posting = appendIt->second; - const uint8_t* basePtr = - reinterpret_cast(posting.data()); - size_t totalRec = posting.size() / m_vectorInfoSize; - const void* headVecBytes = nullptr; - for (size_t i = 0; i < totalRec; ++i) { - const uint8_t* p = basePtr + i * m_vectorInfoSize; - SizeType vid = *reinterpret_cast(p); - if (vid == headID) { - headVecBytes = p + m_metaDataSize; - break; - } - } - if (TryRouteRemoteAppend(headID, - (int)(posting.size() / m_vectorInfoSize), - posting, - headVecBytes)) { + // Distributed routing gate (mirrors Append()) + const std::string& posting = appendIt->second; + size_t totalRec = posting.size() / m_vectorInfoSize; + if (m_worker && m_worker->IsEnabled()) { + int ownerNode = -1; + if (IsRemoteOwnedHead(headID, &ownerNode)) { + const void* headVecBytes = FindSelfEntryVectorBytes( + headID, posting, (int)totalRec); + EnqueueRemoteAppend(ownerNode, headID, (int)totalRec, + posting, headVecBytes); m_routedRemoteHeads.fetch_add(1, std::memory_order_relaxed); - m_routedRemoteItems.fetch_add( - posting.size() / m_vectorInfoSize, - std::memory_order_relaxed); + m_routedRemoteItems.fetch_add(totalRec, std::memory_order_relaxed); continue; + } else { + m_routedLocalHeads.fetch_add(1, std::memory_order_relaxed); + m_routedLocalItems.fetch_add(totalRec, std::memory_order_relaxed); + WaitForRemoteBucketUnlocked(headID); } - m_routedLocalHeads.fetch_add(1, std::memory_order_relaxed); - m_routedLocalItems.fetch_add( - posting.size() / m_vectorInfoSize, - std::memory_order_relaxed); } std::unique_lock headLock(m_rwLocks[headID]); @@ -2872,7 +2830,7 @@ namespace SPTAG::SPANN { uint8_t version = *(uint8_t*)(ptr + sizeof(SizeType)); if (m_versionMap->GetVersion(VID) == version) { m_stat.m_headMiss++; - ReassignAsync(std::make_shared((char*)ptr, m_vectorInfoSize), headID); + ReassignAsync(std::make_shared((char*)ptr, m_vectorInfoSize), headID, true); } } continue; @@ -2965,20 +2923,28 @@ namespace SPTAG::SPANN { //LOG(Helper::LogLevel::LL_Info, "Reassign: oldVID:%d, replicaCount:%d, candidateNum:%d, dist0:%f\n", oldVID, replicaCount, i, selections[0].distance); for (int i = 0; i < replicaCount && m_versionMap->GetVersion(VID) == version; i++) { //LOG(Helper::LogLevel::LL_Info, "Reassign: headID :%d, oldVID:%d, newVID:%d, posting length: %d, dist: %f, string size: %d\n", headID, oldVID, VID, m_postingSizes[headID].load(), selections[i].distance, newPart.size()); - if (TryRouteRemoteAppend(selections[i].VID, 1, *vectorInfo, - selections[i].Vec.Data())) { - continue; - } - // [FIX H3] use reassignThreshold=0 so that an oversized - // target posting triggers SplitAsync (not a synchronous - // Split on this worker thread). This matches the - // CollectReAssign batch path and avoids a single merge- - // path reassign blocking a worker for the full duration - // of a Split (observed up to tens of seconds). - ErrorCode tmp = Append(p_exWorkSpace, selections[i].VID, 1, *vectorInfo, 0); - if (ErrorCode::Success != tmp) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Head Miss: VID: %d, current version: %d, another re-assign\n", VID, version); - return tmp; + int ownerNode = -1; + bool isRemote = (m_worker && m_worker->IsEnabled() + && IsRemoteOwnedHead(selections[i].VID, &ownerNode)); + if (!isRemote) { + // [FIX H3] use reassignThreshold=0 so that an oversized + // target posting triggers SplitAsync (not a synchronous + // Split on this worker thread). This matches the + // CollectReAssign batch path and avoids a single merge- + // path reassign blocking a worker for the full duration + // of a Split (observed up to tens of seconds). + ErrorCode tmp = Append(p_exWorkSpace, selections[i].VID, 1, *vectorInfo, 0); + if (ErrorCode::Success != tmp) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Head Miss: VID: %d, current version: %d, another re-assign\n", VID, version); + return tmp; + } + } else { + // Centroid bytes are already in selections[i], + // so no self-entry scan needed. + EnqueueRemoteAppend(ownerNode, selections[i].VID, 1, + *vectorInfo, + selections[i].Vec.Data()); + } } } @@ -3083,30 +3049,13 @@ namespace SPTAG::SPANN { } if (m_opt->m_update) { if (m_splitThreadPool == nullptr) { - // Only layer 0 participates in the shared-pool slot: - // it both adopts (if a sibling published first) and - // publishes (so the WorkerNode receiver and any later - // layer-0 instance can reuse the same threads). - // Inner layers (m_layer > 0) always create their own - // pool, matching qianxi's per-instance pool design. - if (m_layer == 0 && m_headIndex) { - auto shared = m_headIndex->GetSharedSplitPool(); - if (shared) { - m_splitThreadPool = std::static_pointer_cast(shared); - } - } - if (m_splitThreadPool == nullptr) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum); - - m_splitThreadPool = std::make_shared(); - m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this); - //m_reassignThreadPool = std::make_shared(); - //m_reassignThreadPool->initSPDK(m_opt->m_reassignThreadNum, this); - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization\n"); - if (m_layer == 0 && m_headIndex) m_headIndex->SetSharedSplitPool(m_splitThreadPool); - } else { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: adopted shared split pool from sibling layer\n"); - } + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum); + + m_splitThreadPool = std::make_shared(); + m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this); + //m_reassignThreadPool = std::make_shared(); + //m_reassignThreadPool->initSPDK(m_opt->m_reassignThreadNum, this); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization\n"); // Pool is now ready: re-attempt wiring the worker's job // submitter (may have been set before pool was alive). WireJobSubmitterIfReady(); @@ -3759,20 +3708,10 @@ namespace SPTAG::SPANN { if (m_opt->m_update && !m_opt->m_allowZeroReplica && zeroReplicaCount > 0) { - if (m_splitThreadPool == nullptr && m_layer == 0 && m_headIndex) { - auto shared = m_headIndex->GetSharedSplitPool(); - if (shared) { - m_splitThreadPool = std::static_pointer_cast(shared); - } - } - if (m_splitThreadPool == nullptr) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum); - m_splitThreadPool = std::make_shared(); - m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this); - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization, zeroReplicaCount:%zu\n", zeroReplicaCount); - if (m_layer == 0 && m_headIndex) m_headIndex->SetSharedSplitPool(m_splitThreadPool); - } - WireJobSubmitterIfReady(); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum); + m_splitThreadPool = std::make_shared(); + m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization, zeroReplicaCount:%zu\n", zeroReplicaCount); uint32_t splitNumBeforeZeroReplica = m_stat.m_splitNum; uint32_t reassignNumBeforeZeroReplica = m_stat.m_reAssignNum; @@ -4149,13 +4088,6 @@ namespace SPTAG::SPANN { avgSplitMs, maxSplitMs); } if (runningJobs == 0 && totalJobs == 0) { - // Note: AllFinished() must return true once the LOCAL pool - // is drained; SaveIndexData uses it as the shutdown signal. - // We can't gate it on the outbound remote-append queue: - // peers may continue routing reassigns back to us during - // the drain (feedback loop) so the queue is not - // guaranteed to hit zero. Remote queue depth shows up - // in the periodic progress log instead. if (!m_allDonePrinted) { size_t totalSplit = m_totalSplitSubmitted.load(); size_t totalMerge = m_totalMergeSubmitted.load(); diff --git a/AnnService/inc/Core/SPANN/Index.h b/AnnService/inc/Core/SPANN/Index.h index 255043a58..743588437 100644 --- a/AnnService/inc/Core/SPANN/Index.h +++ b/AnnService/inc/Core/SPANN/Index.h @@ -96,14 +96,6 @@ namespace SPTAG std::shared_ptr> m_freeWorkSpaceIds; std::atomic m_workspaceCount = 0; - // Single split/append thread pool shared by all extraSearchers - // (one per layer). Lazily populated by the first layer that - // initializes its pool inside LoadIndex; subsequent layers - // adopt the same shared instance so the total worker count - // is AppendThreadNum (not AppendThreadNum * layers). - mutable std::mutex m_sharedSplitPoolMutex; - std::shared_ptr m_sharedSplitPool; - public: Index() { @@ -155,15 +147,6 @@ namespace SPTAG } inline WorkerNode* GetPendingWorker() const { return m_pendingWorker; } - inline std::shared_ptr GetSharedSplitPool() const { - std::lock_guard lk(m_sharedSplitPoolMutex); - return m_sharedSplitPool; - } - inline void SetSharedSplitPool(std::shared_ptr pool) { - std::lock_guard lk(m_sharedSplitPoolMutex); - m_sharedSplitPool = std::move(pool); - } - inline SizeType GetNumSamples() const { return GetNumSamples(0); } inline SizeType GetNumSamples(int layer) const { if (layer < m_extraSearchers.size()) return m_extraSearchers[layer]->GetNumSamples(); else return m_topIndex->GetNumSamples(); } inline DimensionType GetFeatureDim() const { return m_topIndex->GetFeatureDim(); } From b0774dbbf8cfcbf5b1e133ea67a82cf164df8e77 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Sun, 24 May 2026 14:16:11 +0000 Subject: [PATCH 33/48] fix(distributed): extend fenced-append retry to match local lock budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fenced cross-owner Split append used 3 retries with exponential backoff (10/20/40 ms, ~70 ms total). This was too tight when the receiver was momentarily slow on TiKV — every Deadline-Exceeded burst forced a Split rollback. In the 1M+1M 2-node benchmark we observed 66 rollbacks per run. Bump to 20 attempts with linear 3*N ms backoff (~570 ms worst-case), matching the local lock-acquire retry budget used for sibling-Split contention elsewhere in the prologue. Splits that genuinely cannot publish still propagate Fail to AddIndex so the caller can retry from the user level. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index c344e820c..eae1e858c 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -1825,18 +1825,22 @@ namespace SPTAG::SPANN { // Bounded retry: a fencing-token rejection means the // owner's lease TTL expired between our acquire and - // our send (rare; lease TTL is 30 s). Release the - // stale token, re-acquire, and resend. After 3 - // attempts (10/20/40 ms backoff) we surface the - // failure to the caller so they can retry the - // whole AddIndex op at the user level instead of - // silently dropping the cluster vectors. - constexpr int kFenceRetries = 3; + // our send (rare; lease TTL is 30 s), or the owner + // is momentarily backed up on a TiKV Deadline. + // Release the stale token, re-acquire, and resend. + // Matches the local lock-acquire retry budget (20 + // attempts, linear 3*(attempt) ms backoff, ~570 ms + // worst-case) so transient TiKV slowness doesn't + // force a Split rollback. After 20 attempts we + // surface the failure to the caller so they can + // retry the whole AddIndex op at the user level + // instead of silently dropping cluster vectors. + constexpr int kFenceRetries = 20; ErrorCode ec = ErrorCode::Fail; for (int attempt = 0; attempt < kFenceRetries; ++attempt) { if (attempt > 0) { std::this_thread::sleep_for( - std::chrono::milliseconds(10 << (attempt - 1))); + std::chrono::milliseconds(3 * attempt)); // Release the stale lease (best-effort: // the owner may have auto-released it via // TTL already, in which case this no-ops). From 279100e3cf4b023f0f09583eb3c6f7e480784e1b Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Sun, 24 May 2026 14:17:18 +0000 Subject: [PATCH 34/48] fix(distributed): plumb fencingToken to AppendCallback so Split can publish new remote heads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The receiver-side AppendCallback unconditionally returned Fail when the target head was missing on the local index, on the theory that a concurrent Merge/Split had just deleted it and resurrecting would race the HeadSync Delete broadcast. The follow-up AddHeadIndex call after the return was dead code. But Split's legitimate "publish a brand-new child head on a remote owner" path also goes through AppendCallback with wasMissing == true (the child does not yet exist on the owner). These appends already carry a valid fencing token earned by an authoritative bucket lease on the new head's VID, so they are safe to materialize. Plumb the fencingToken parameter from HandleAppendRequest through the AppendCallback typedef (and the two BatchAppendItemJob invocation sites) into the lambda. In the wasMissing branch, if fencingToken is non-zero and a headVec was supplied, resurrect via AddHeadIndex (the original intent). Otherwise (unfenced append on a missing head) keep refusing — that path is the racy structural-op case. Eliminates the ~66 silent rollbacks per 1M+1M insert run that were costing ~0.6% recall. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Core/SPANN/Distributed/RemotePostingOps.h | 18 ++++++-- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 42 ++++++++++--------- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h index fd4c607a2..53170b23a 100644 --- a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -46,11 +46,18 @@ namespace SPTAG::SPANN { /// *where* to send, RemotePostingOps handles *how*. class RemotePostingOps { public: + // fencingToken is forwarded from the request: a nonzero token means + // the caller (Split) holds an authoritative bucket lease and is + // publishing a brand-new head — the callback may resurrect/create + // a missing head in that case. A zero token (ordinary Append) + // must refuse resurrection to avoid racing a concurrent + // Merge/Split that just deleted the head. using AppendCallback = std::function headVec, int appendNum, - std::string& appendPosting)>; + std::string& appendPosting, + std::uint64_t fencingToken)>; // Receiver-side batched callback: deliver a whole BatchRemoteAppend // request to the searcher so it can group items by head and call @@ -866,7 +873,8 @@ namespace SPTAG::SPANN { if (cb) { auto headVec = std::make_shared(std::move(req.m_headVec)); result = (*cb)( - req.m_headID, headVec, req.m_appendNum, req.m_appendPosting); + req.m_headID, headVec, req.m_appendNum, req.m_appendPosting, + req.m_fencingToken); } else { SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "RemotePostingOps: AppendRequest layer=%d has no callback registered\n", @@ -1021,7 +1029,8 @@ namespace SPTAG::SPANN { const auto* cb = LookupAppendCallback_Locked(req.m_layer); if (cb) { auto hv = std::make_shared(std::move(req.m_headVec)); - r = (*cb)(req.m_headID, hv, req.m_appendNum, req.m_appendPosting); + r = (*cb)(req.m_headID, hv, req.m_appendNum, req.m_appendPosting, + req.m_fencingToken); } (r == ErrorCode::Success ? *successCount : *failCount).fetch_add(1); } @@ -1706,7 +1715,8 @@ namespace SPTAG::SPANN { const auto* cb = m_ops->LookupAppendCallback_Locked(req.m_layer); if (cb) { auto hv = std::make_shared(std::move(req.m_headVec)); - r = (*cb)(req.m_headID, hv, req.m_appendNum, req.m_appendPosting); + r = (*cb)(req.m_headID, hv, req.m_appendNum, req.m_appendPosting, + req.m_fencingToken); } if (r == ErrorCode::Success) m_success->fetch_add(1); else m_fail->fetch_add(1); diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index eae1e858c..22fe7e132 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -556,7 +556,8 @@ namespace SPTAG::SPANN { // Append callback: routes incoming remote appends to local Append() m_worker->SetAppendCallback(m_layer, [this](SizeType headID, std::shared_ptr headVec, - int appendNum, std::string& appendPosting) -> ErrorCode { + int appendNum, std::string& appendPosting, + std::uint64_t fencingToken) -> ErrorCode { // Reuse SPDKThreadPool's per-worker pre-allocated workspace // when called from BatchAppendItemJob on m_splitThreadPool. @@ -568,25 +569,26 @@ namespace SPTAG::SPANN { } bool wasMissing = !m_headIndex->ContainSample(headID, m_layer + 1); if (wasMissing) { - // We waited for an in-flight Split/Merge and the - // head is gone afterwards -- the structural op - // deleted it on purpose. Resurrecting via - // AddHeadIndex would race the structural op's - // HeadSync Delete broadcast and leave a zombie - // head until the next merge round drops it again. - // Refuse the append; the sender's retry path will - // re-resolve once HeadSync propagates the - // deletion to its head index. - SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, - "AppendCallback: head=%lld deleted by local structural op; refusing resurrection\n", - (std::int64_t)headID); - return ErrorCode::Fail; - } - if (wasMissing && headVec && !headVec->empty()) { - DimensionType dim = static_cast( - headVec->size() / sizeof(ValueType)); - m_headIndex->AddHeadIndex(headVec->data(), headID, 0, - dim, m_layer + 1, ws); + // A nonzero fencingToken means the sender (Split) + // holds an authoritative bucket lease on this VID + // and is publishing a brand-new head — fence + // validation already passed above, so resurrection + // here is the legitimate "publish new head" path. + // For unfenced appends (token == 0), refuse: + // resurrecting a head a concurrent Merge/Split + // just deleted would leave a zombie head until + // the next merge round drops it again. + if (fencingToken != 0 && headVec && !headVec->empty()) { + DimensionType dim = static_cast( + headVec->size() / sizeof(ValueType)); + m_headIndex->AddHeadIndex(headVec->data(), headID, 0, + dim, m_layer + 1, ws); + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, + "AppendCallback: head=%lld deleted by local structural op; refusing resurrection\n", + (std::int64_t)headID); + return ErrorCode::Fail; + } } // Mirror sender's version map for the records we're about From dfb77a9072d06286e55b523b3c40193e6cb1b84a Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Sun, 24 May 2026 14:18:14 +0000 Subject: [PATCH 35/48] fix(distributed): MergePostings skip-and-continue instead of re-enqueue on lock busy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a merge candidate's lock (local m_rwLocks or remote bucket lease) was busy, MergePostings re-enqueued itself as a fresh MergeAsyncJob with zero backoff. This is a livelock trap whenever two adjacent heads pick each other as the top merge candidate: each holds its own m_rwLocks entry inside MergePostings, each fails to lock the other, both re-enqueue, and the new copies race back through the same path immediately. The benchmark log shows 348 such re-enqueues for the same head pair (622604 / 622608) in one window — a tight CPU-burning ping-pong that starves the rest of the merge queue. Replace the re-enqueue with a plain 'continue' to skip the current candidate and try the next neighbor in queryResults. Worst case this round produces no merge for the current head, which is benign: the head remains in m_postingSizes and becomes merge-eligible again in the next round once its posting size still falls under the merge threshold. Delete the now-unused reenqueueMerge lambda. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 29 ++++++++----------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 22fe7e132..c254ef9a7 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -2108,21 +2108,16 @@ namespace SPTAG::SPANN { m_headIndex->SearchHeadIndex(queryResults, m_layer + 1, p_exWorkSpace); std::string nextPostingList; - // Re-queue this Merge job and exit cleanly. Counts as a new - // submission so MergeAsyncJob::exec()'s m_mergeJobsInFlight-- / - // m_totalMergeCompleted++ stays balanced -- without these - // increments m_mergeJobsInFlight underflows to a huge uint64 - // and m_totalMergeCompleted exceeds m_totalMergeSubmitted. - auto reenqueueMerge = [&](const char* reason) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "MergePostings: re-queueing headID=%lld (%s)\n", - (std::int64_t)headID, reason); - auto* curJob = new MergeAsyncJob(this, headID, nullptr); - m_mergeJobsInFlight++; - m_totalMergeSubmitted++; - m_splitThreadPool->add(curJob); - return ErrorCode::Success; - }; + // If a candidate is unavailable (remote lease busy or local + // lock held by a peer op), skip it and try the next neighbor + // instead of re-enqueueing the whole Merge job. Re-enqueue + // is a livelock trap when two adjacent heads pick each other + // as the top merge candidate -- each fails to lock the other, + // both re-enqueue, and the new copies race back through the + // same path with zero backoff. Skipping degrades to "no + // merge this round", which is fine: the head will become + // merge-eligible again in the next round once its posting + // list crosses the threshold. for (int i = 1; i < queryResults.GetResultNum(); ++i) { @@ -2150,12 +2145,12 @@ namespace SPTAG::SPANN { if (isRemoteCandidate) { if (!remoteLease.acquire(m_worker, remoteNodeIndex, m_layer, queryResult->VID)) { - return reenqueueMerge("remote lease busy"); + continue; } } else { if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) { if (!anotherLock.try_lock()) { - return reenqueueMerge("local lock busy"); + continue; } } if (!m_headIndex->ContainSample(queryResult->VID, m_layer + 1)) continue; From f39db6c0ab48344a8410e7511bc1b69d61be1243 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Sun, 24 May 2026 16:06:42 +0000 Subject: [PATCH 36/48] fix(distributed): bump SendRemoteLock RPC timeout to lease TTL (30s) When the local future waited only 5 s, an in-flight SendRemoteLock could time out while the owner had already issued a lease for that bucket -- the Grant response then arrived after we'd given up, leaving the owner holding an orphaned lease that blocked every subsequent acquire attempt on the same bucket for the full lease TTL. Wait up to the receiver-side lease TTL (RemoteLeaseTable default 30000 ms): any lease the owner issues for this request auto-expires by the time we return, so a late-arriving Grant on a timed-out RPC cannot leave an orphaned lease. The receiver sends responses synchronously after processing, so the remaining paths to a real timeout (dead peer, network partition lasting >= TTL) wouldn't have benefited from a shorter wait anyway. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/Distributed/RemotePostingOps.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h index 53170b23a..eb1921017 100644 --- a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -807,7 +807,19 @@ namespace SPTAG::SPANN { m_net->GetClient()->SendPacket(connID, std::move(pkt), MakeSendFailHandler(rid)); - auto status = future.wait_for(std::chrono::milliseconds(5000)); + // Wait up to the receiver-side lease TTL (RemoteLeaseTable + // default 30000 ms; see RemoteLeaseTable.h:33). Any lease + // the owner issues for this request auto-expires by the time + // we return, so a late-arriving Grant response on a + // timed-out RPC cannot leave the owner holding an orphaned + // lease that blocks subsequent retries (a problem we + // observed with shorter timeouts during 2-node benchmark + // runs). The receiver sends a response synchronously after + // processing, so the only paths to this timeout are a dead + // peer or a network partition lasting >= TTL -- in both + // cases waiting longer would not have helped anyway. + constexpr int kLockWaitMs = 30000; + auto status = future.wait_for(std::chrono::milliseconds(kLockWaitMs)); if (status != std::future_status::ready) { ErasePending(rid); TakePendingLockToken(rid); From 0cb7eafa00eaea5dadd9f5fb47a3c0ec0189ec04 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Sun, 24 May 2026 16:06:52 +0000 Subject: [PATCH 37/48] fix(distributed): bump fenced Append RPC timeout to 4x lease TTL (120s) The fence-retry path in Split deadlocks when the Append RPC timeout equals the receiver-side lease TTL: under TiKV pressure the receiver can take ~30 s on a single append, so the sender's 30 s timeout fires at exactly the moment the receiver's lease auto-expires. The retry calls SendRemoteLock again, but by the time the lock request lands a sibling Split has already claimed the bucket; the entire 20-attempt retry budget then burns failing to re-acquire and Split rolls back. Pick 4 x TTL so a real Append timeout unambiguously means the lease has been recoverable for long enough that any concurrent acquisition has had a chance to release. The remaining cause is a hung or crashed peer, which is the actual condition this guard should fire on. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Core/SPANN/Distributed/RemotePostingOps.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h index eb1921017..2c6479571 100644 --- a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -397,7 +397,21 @@ namespace SPTAG::SPANN { m_net->GetClient()->SendPacket(connID, std::move(packet), MakeSendFailHandler(resID)); - auto status = future.wait_for(std::chrono::seconds(30)); + // Wait long enough that a successful response is not racing + // the lease TTL. Append timeout == lease TTL deadlocks the + // fence-retry path: when TiKV is backed up and the receiver + // takes ~30 s on a single append, the sender's 30 s timeout + // fires at the same moment the receiver-side lease auto- + // expires. The retry then calls SendRemoteLock again, but + // by the time the request lands another Split has acquired + // the bucket, and the entire 20-attempt budget is spent + // failing to re-acquire. Pick 4 x TTL so that a real + // timeout unambiguously means the lease has been + // recoverable for long enough that any concurrent + // acquisition has had a chance to release; the only + // remaining cause is a hung / crashed peer. + constexpr int kAppendRpcTimeoutSec = 120; + auto status = future.wait_for(std::chrono::seconds(kAppendRpcTimeoutSec)); if (status == std::future_status::timeout) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "RemotePostingOps: Timeout waiting for append response for headID %lld from node %d\n", From 6bfe0f1c123357dcfd6f3457c40642a25fd0f900 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Sun, 24 May 2026 16:08:06 +0000 Subject: [PATCH 38/48] fix(distributed): align WaitForRemoteBucketUnlocked wait cap to lease TTL The previous 5 s cap made the local writer barge in while a remote Split that held an advisory lease on the bucket was still mid-flight. Worst case: Split then broadcasts HeadSync Delete on srcHead and the items we just appended disappear with the head -- recall drops silently with no error. Tie the cap to RemoteLeaseTable::GetTtlMs() (default 30 s): after TTL the entry is auto-reclaimed by IsLocked() so this loop exits naturally on its own. The 'stuck for ... ms, proceeding' log path is now truly anomalous and worth surfacing in the regression-detector queries. In the 2-node insert_dominant benchmark this dropped 'stuck for' from 74 events per run to 0. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index c254ef9a7..c15c5e8a8 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -799,7 +799,15 @@ namespace SPTAG::SPANN { if (!m_worker || !m_worker->IsEnabled()) return; unsigned bucket = COMMON::FineGrainedRWLock::BucketIndex(static_cast(headID)); if (!m_remoteLeaseTable->IsLocked(bucket)) return; - constexpr int kMaxRemoteBucketWaitMs = 5000; + // Bound the wait by the lease TTL. A shorter cap (we used + // 5 s previously) makes the local writer barge in while the + // remote Split is still mid-flight: if Split then broadcasts + // a HeadSync Delete on srcHead, the items we just appended + // disappear with the head and recall drops silently. After + // TTL, IsLocked auto-reclaims the lease so this loop exits + // naturally; the "stuck" log path is now truly anomalous. + const int kMaxRemoteBucketWaitMs = + m_remoteLeaseTable->GetTtlMs(); auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(kMaxRemoteBucketWaitMs); while (m_remoteLeaseTable->IsLocked(bucket)) { From adaf01c1d8e284df281fbecf0f19c736f69f9695 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Sun, 24 May 2026 16:08:23 +0000 Subject: [PATCH 39/48] fix(distributed): receiver-side fenced Append bypasses self-bucket wait When node A holds lease T on bucket(headX) at node B and sends a FencedRemoteAppend(T) for headX, B's RPC handler validates the fence (passes), then enters AppendCallback -> Append(headX, ...). Since headX is locally-owned on B, Append falls into WaitForRemoteBucketUnlocked(headX) -- but the lease blocking that bucket is A's, our own caller's. B waits up to TTL (~30 s) for A's lease to expire while A is blocked in SendFencedRemoteAppend waiting for B's response. Throughout that 30 s self-block every sibling Split that hashes into the same bucket sees 'lease busy', burns its 20-attempt retry budget, and rolls back. This was the dominant cause of 'lease busy' cascades on adjacent splits in the 2-node insert_dominant benchmark (~40-80 events per run; recall dropped to 0.984 with periodic Split rollbacks). Add a p_skipRemoteBucketWait bool to Append and BatchAppend; the receiver-side single-item callback passes fencingToken != 0, and the BatchAppend callback passes anyFenced computed across surviving items. Safety: fence validation upstream already proved the sender owns the lease covering all in-flight modifications to this bucket, and per-head serialization via m_rwLocks[headID] inside Append's body is unchanged. Local writers (Append called from AddIndex / Split / Reassign / Merge) keep the default false: they still honour any remote initiator's advisory lease. Result after stacking with the SendRemoteLock, fenced-Append, and WaitForRemoteBucketUnlocked TTL alignments: 0 stuck, 0 lock timeouts, 0 rollbacks, 0 cannot-re-acquire, lease-busy events drop to ~2 per run (down from 40-80). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index c15c5e8a8..a1a20672c 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -620,7 +620,8 @@ namespace SPTAG::SPANN { m_versionMap->SetVersionBatch(batchVids, batchVers); } } - return Append(ws, headID, appendNum, appendPosting, 0); + return Append(ws, headID, appendNum, appendPosting, 0, + /*p_skipRemoteBucketWait=*/fencingToken != 0); }); // Batch append callback: receiver-side fast path. @@ -704,17 +705,20 @@ namespace SPTAG::SPANN { std::unordered_map headAppends; headAppends.reserve(items.size()); size_t aliveCount = 0; + bool anyFenced = false; for (size_t i = 0; i < items.size(); ++i) { if (!alive[i]) continue; auto* req = items[i]; auto& dst = headAppends[req->m_headID]; if (dst.empty()) dst = std::move(req->m_appendPosting); else dst.append(req->m_appendPosting); + if (req->m_fencingToken != 0) anyFenced = true; ++aliveCount; } if (headAppends.empty()) return; - ErrorCode ret = BatchAppend(ws, headAppends, "PeerBatch"); + ErrorCode ret = BatchAppend(ws, headAppends, "PeerBatch", + /*p_skipRemoteBucketWait=*/anyFenced); if (ret == ErrorCode::Success) { outSuccess += static_cast(aliveCount); } else { @@ -2640,7 +2644,8 @@ namespace SPTAG::SPANN { } - ErrorCode Append(ExtraWorkSpace* p_exWorkSpace, SizeType headID, int appendNum, std::string& appendPosting, int reassignThreshold = 0) + ErrorCode Append(ExtraWorkSpace* p_exWorkSpace, SizeType headID, int appendNum, std::string& appendPosting, int reassignThreshold = 0, + bool p_skipRemoteBucketWait = false) { auto appendBegin = std::chrono::high_resolution_clock::now(); if (appendPosting.empty()) { @@ -2668,11 +2673,19 @@ namespace SPTAG::SPANN { m_stat.m_appendTaskNum++; } return ErrorCode::Success; - } else { + } else if (!p_skipRemoteBucketWait) { // Local-owned head: wait out any in-flight remote // initiator that holds an advisory fenced-lease on our // bucket (e.g. another node mid-Split) before we acquire // the per-head lock and write. + // + // Skip this wait when the caller is the receiver-side + // handler for a fenced RemoteAppend: fence validation + // upstream has already proven the sender holds the + // very lease this wait would block on, so we would be + // waiting for our own caller's lease to expire (TTL, + // ~30 s). That self-block was the dominant cause of + // "lease busy" cascades on adjacent splits. WaitForRemoteBucketUnlocked(headID); } } @@ -2786,7 +2799,8 @@ namespace SPTAG::SPANN { return ErrorCode::Success; } - ErrorCode BatchAppend(ExtraWorkSpace* p_exWorkSpace, std::unordered_map& headAppends, const char* caller) + ErrorCode BatchAppend(ExtraWorkSpace* p_exWorkSpace, std::unordered_map& headAppends, const char* caller, + bool p_skipRemoteBucketWait = false) { if (headAppends.empty()) return ErrorCode::Success; @@ -2824,7 +2838,11 @@ namespace SPTAG::SPANN { } else { m_routedLocalHeads.fetch_add(1, std::memory_order_relaxed); m_routedLocalItems.fetch_add(totalRec, std::memory_order_relaxed); - WaitForRemoteBucketUnlocked(headID); + // Skip the self-wait for receiver-side fenced + // BatchAppend (see Append() for the rationale). + if (!p_skipRemoteBucketWait) { + WaitForRemoteBucketUnlocked(headID); + } } } From 17e8646e014741d17809490a36a6db4a82145625 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Mon, 25 May 2026 12:01:44 +0000 Subject: [PATCH 40/48] Remove unused variable --- AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h | 1 - 1 file changed, 1 deletion(-) diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index a1a20672c..a8a272050 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -265,7 +265,6 @@ namespace SPTAG::SPANN { std::shared_ptr GetDB() const { return db; } private: - std::atomic m_workspaceCount = 0; std::shared_ptr db; WorkerNode* m_worker = nullptr; // externally owned, set via SetWorker() From 82dc35a801a389c4ff72fe1eec62fec5046d586b Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 27 May 2026 03:50:43 +0000 Subject: [PATCH 41/48] fix(socket): typo in SimpleSerialization static_assert messages Replaces "fundanmental" with "fundamental" in the four static_assert messages of SimpleWriteBuffer / SimpleReadBuffer / EstimateBufferSize / SafeSimpleReadBuffer. Copilot inline review on PR #448 flagged the misspelling. Pure log/diag message change; no code semantics affected. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- AnnService/inc/Socket/SimpleSerialization.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/AnnService/inc/Socket/SimpleSerialization.h b/AnnService/inc/Socket/SimpleSerialization.h index e0b8141dd..6c0ddddf0 100644 --- a/AnnService/inc/Socket/SimpleSerialization.h +++ b/AnnService/inc/Socket/SimpleSerialization.h @@ -23,7 +23,7 @@ namespace SimpleSerialization SimpleWriteBuffer(const T& p_val, std::uint8_t* p_buffer) { static_assert(std::is_fundamental::value || std::is_enum::value, - "Only applied for fundanmental type."); + "Only applied for fundamental type."); *(reinterpret_cast(p_buffer)) = p_val; return p_buffer + sizeof(T); @@ -35,7 +35,7 @@ namespace SimpleSerialization SimpleReadBuffer(const std::uint8_t* p_buffer, T& p_val) { static_assert(std::is_fundamental::value || std::is_enum::value, - "Only applied for fundanmental type."); + "Only applied for fundamental type."); p_val = *(reinterpret_cast(p_buffer)); return p_buffer + sizeof(T); @@ -47,7 +47,7 @@ namespace SimpleSerialization EstimateBufferSize(const T& p_val) { static_assert(std::is_fundamental::value || std::is_enum::value, - "Only applied for fundanmental type."); + "Only applied for fundamental type."); return sizeof(T); } @@ -90,7 +90,7 @@ namespace SimpleSerialization SafeSimpleReadBuffer(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd, T& p_val) { static_assert(std::is_fundamental::value || std::is_enum::value, - "Only applied for fundanmental type."); + "Only applied for fundamental type."); if (p_buffer == nullptr) return nullptr; if (p_bufEnd != nullptr && static_cast(p_bufEnd - p_buffer) < sizeof(T)) return nullptr; From 8fd4c3088be9a13baf446fc2f33cad54b88b8de2 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 27 May 2026 03:50:53 +0000 Subject: [PATCH 42/48] build(test): gate absl_* link deps behind if(TIKV) The previous unconditional target_link_libraries on SPTAGTest pulled in absl_synchronization / absl_cord / absl_cordz_info / absl_cord_internal / absl_cordz_functions / absl_cordz_handle. These libs are only needed because gRPC's static archive references them; when TIKV=OFF (the default), neither gRPC nor any absl symbol is in the dependency closure, so demanding the libs at link time breaks builds on hosts that don't have absl installed. Top-level CMakeLists.txt declares 'option(TIKV "TIKV" OFF)' (L131) and gates TiKV_LIBRARIES on the same flag (L172-201), so this change mirrors that convention: the absl link is now nested inside 'if (TIKV)' so non-TiKV builds match upstream master's link line again. Copilot inline review on PR #448 surfaced this regression. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Test/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Test/CMakeLists.txt b/Test/CMakeLists.txt index 9db640da2..b1b708d6b 100644 --- a/Test/CMakeLists.txt +++ b/Test/CMakeLists.txt @@ -24,7 +24,12 @@ if (NOT LIBRARYONLY) file(GLOB TEST_HDR_FILES ${PROJECT_SOURCE_DIR}/Test/inc/Test.h) file(GLOB TEST_SRC_FILES ${PROJECT_SOURCE_DIR}/Test/src/*.cpp) add_executable(SPTAGTest ${TEST_SRC_FILES} ${TEST_HDR_FILES}) - target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES} ${TiKV_LIBRARIES} absl_synchronization absl_cord absl_cordz_info absl_cord_internal absl_cordz_functions absl_cordz_handle) + target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES} ${TiKV_LIBRARIES}) + if (TIKV) + # gRPC's static libs require these absl symbols; only link when the + # TiKV backend (and thus gRPC) is in the dependency closure. + target_link_libraries(SPTAGTest absl_synchronization absl_cord absl_cordz_info absl_cord_internal absl_cordz_functions absl_cordz_handle) + endif() install(TARGETS SPTAGTest RUNTIME DESTINATION bin From 047ed5b592319b75f1cfd4a193a77d2e4094b8d3 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 27 May 2026 03:51:04 +0000 Subject: [PATCH 43/48] fix(socket): separate error_code per endpoint() call in Connection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Connection::Start() and Connection::Stop() each called local_endpoint() and remote_endpoint() in sequence, sharing a single boost::system:: error_code. boost::asio's overload writes the result into the supplied error_code on every call: a successful local_endpoint() resets the code that a subsequent remote_endpoint() failure should signal, and vice versa. For Start(), the bug skewed the 'socket not connected' branch — the log message read whichever ec was set last instead of pinpointing which endpoint actually failed, and a falsely-successful epEc could let us proceed to call .address() on an invalid remote endpoint. Stop()'s diag log had the same accuracy issue (no logic divergence because Stop has no early-return on log-only failure). Fix uses one error_code per call and only logs the success branch when both calls succeeded. Failure branch now identifies which side errored. Copilot inline review on PR #448 flagged both call sites. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- AnnService/src/Socket/Connection.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/AnnService/src/Socket/Connection.cpp b/AnnService/src/Socket/Connection.cpp index 444c7afb0..d99ba8882 100644 --- a/AnnService/src/Socket/Connection.cpp +++ b/AnnService/src/Socket/Connection.cpp @@ -26,17 +26,19 @@ Connection::Connection(ConnectionID p_connectionID, boost::asio::ip::tcp::socket void Connection::Start() { - boost::system::error_code epEc; - auto localEp = m_socket.local_endpoint(epEc); - auto remoteEp = m_socket.remote_endpoint(epEc); - if (!epEc) { + boost::system::error_code localEc; + boost::system::error_code remoteEc; + auto localEp = m_socket.local_endpoint(localEc); + auto remoteEp = m_socket.remote_endpoint(remoteEc); + if (!localEc && !remoteEc) { SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Start, local: %u, remote: %s:%u\n", static_cast(localEp.port()), remoteEp.address().to_string().c_str(), static_cast(remoteEp.port())); } else { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Connection Start, socket not connected: %s\n", - epEc.message().c_str()); + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Connection Start, socket not connected: local=%s remote=%s\n", + localEc ? localEc.message().c_str() : "ok", + remoteEc ? remoteEc.message().c_str() : "ok"); return; } @@ -51,10 +53,11 @@ void Connection::Start() void Connection::Stop() { - boost::system::error_code epEc; - auto localEp = m_socket.local_endpoint(epEc); - auto remoteEp = m_socket.remote_endpoint(epEc); - if (!epEc) { + boost::system::error_code localEc; + boost::system::error_code remoteEc; + auto localEp = m_socket.local_endpoint(localEc); + auto remoteEp = m_socket.remote_endpoint(remoteEc); + if (!localEc && !remoteEc) { SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Stop, local: %u, remote: %s:%u\n", static_cast(localEp.port()), remoteEp.address().to_string().c_str(), From 1786092da885f87844e61f45f594d3d1befc9e6c Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 27 May 2026 03:51:24 +0000 Subject: [PATCH 44/48] fix(distributed): explicit field-wise Encode/Decode for SplitWAL::Record MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous Encode() / Decode() used memcpy(this, ..., sizeof(Record)) to serialize the whole struct. This had three problems for a record that gets written to TiKV and read back later: 1. Padding leak. The struct interleaves std::uint64_t / SizeType / std::int64_t / std::uint8_t fields; the compiler inserts alignment padding bytes between fields and tail padding after .stage. memcpy sends those bytes — which are uninitialized stack content the first time a Record is encoded — into the WAL key/value. 2. Brittleness to source-order changes. Reordering Record fields, or adding a new field anywhere except the tail, silently changes the on-the-wire byte order. Old WAL entries decode as garbage with no error signal, breaking the split-cleanup GC sweep. 3. Enum representation. Stage is declared with an explicit std::uint8_t underlying type today, but a future refactor that drops the explicit width would silently change the encoded size. Replaces memcpy with field-by-field std::memcpy at known offsets using the project's existing pattern (matches Socket SimpleWriteBuffer behavior). The wire format is now deterministic and survives both unrelated source edits and field reordering, so the Begin record that ExtraDynamicSearcher.h:940 / :1823 write is still readable after a recompile. Wire layout (in order, no padding): uint64 jobID SizeType srcHeadID, localChildHeadID, remoteChildHeadID int remoteOwnerNodeIndex int64 startTimestampSec uint8 stage kEncodedSize is exposed as a constexpr so tests / consumers can assert the size. Note: SizeType width still tracks the build-config LARGEVID flag — this is by design (matches the rest of the codebase, including the on-disk posting lists) and the WAL contract is that writer and reader must be built with the same LARGEVID setting. Copilot inline review on PR #448 flagged the memcpy approach. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/Distributed/SplitWAL.h | 51 +++++++++++++++++-- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/SplitWAL.h b/AnnService/inc/Core/SPANN/Distributed/SplitWAL.h index 3cd642a13..d083b1790 100644 --- a/AnnService/inc/Core/SPANN/Distributed/SplitWAL.h +++ b/AnnService/inc/Core/SPANN/Distributed/SplitWAL.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -54,14 +55,56 @@ class SplitWAL { std::int64_t startTimestampSec; Stage stage; + // Wire layout: each field appended sequentially with no padding, + // stage written as a fixed std::uint8_t. Field-by-field memcpy + // avoids leaking uninitialized struct padding into the WAL and + // keeps the encoding stable if fields are reordered in source. + // Field widths still follow the build-config-bound SizeType + // (int32 by default, int64 with -DLARGEVID); a deployment must + // not toggle LARGEVID between WAL writer and reader. + static constexpr std::size_t kEncodedSize = + sizeof(std::uint64_t) /* jobID */ + + sizeof(SizeType) /* srcHeadID */ + + sizeof(SizeType) /* localChildHeadID */ + + sizeof(SizeType) /* remoteChildHeadID */ + + sizeof(int) /* remoteOwnerNodeIndex */ + + sizeof(std::int64_t) /* startTimestampSec */ + + sizeof(std::uint8_t); /* stage */ + std::string Encode() const { - std::string s(sizeof(Record), '\0'); - memcpy(&s[0], this, sizeof(Record)); + std::string s(kEncodedSize, '\0'); + std::size_t off = 0; + auto put = [&](const void* src, std::size_t n) { + std::memcpy(&s[off], src, n); + off += n; + }; + put(&jobID, sizeof(jobID)); + put(&srcHeadID, sizeof(srcHeadID)); + put(&localChildHeadID, sizeof(localChildHeadID)); + put(&remoteChildHeadID, sizeof(remoteChildHeadID)); + put(&remoteOwnerNodeIndex, sizeof(remoteOwnerNodeIndex)); + put(&startTimestampSec, sizeof(startTimestampSec)); + std::uint8_t st = static_cast(stage); + put(&st, sizeof(st)); return s; } + bool Decode(const std::string& s) { - if (s.size() < sizeof(Record)) return false; - memcpy(this, s.data(), sizeof(Record)); + if (s.size() < kEncodedSize) return false; + std::size_t off = 0; + auto get = [&](void* dst, std::size_t n) { + std::memcpy(dst, s.data() + off, n); + off += n; + }; + get(&jobID, sizeof(jobID)); + get(&srcHeadID, sizeof(srcHeadID)); + get(&localChildHeadID, sizeof(localChildHeadID)); + get(&remoteChildHeadID, sizeof(remoteChildHeadID)); + get(&remoteOwnerNodeIndex, sizeof(remoteOwnerNodeIndex)); + get(&startTimestampSec, sizeof(startTimestampSec)); + std::uint8_t st = 0; + get(&st, sizeof(st)); + stage = static_cast(st); return true; } }; From 149bdd4d26543ade538f6a1b17cfb7280fc74a8d Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 27 May 2026 03:51:47 +0000 Subject: [PATCH 45/48] fix(distributed): graceful WorkerNode shutdown drains auto-flush threads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QueueRemoteAppend spawns detached std::thread instances that capture 'this' (WorkerNode) when the per-node queue crosses kAutoFlushThreshold. Each thread loops over chunks, accesses m_appendQueueMutex / m_appendQueue / m_asyncWatchdog, and decrements m_inflightAppendFlushes on exit. Without a destructor, if WorkerNode is destroyed while any of those threads are still running, the members get torn down underneath the threads and we get a use-after-free. In the current SPTAGTest driver this is masked because WorkerNode lives for the entire process and the OS reaps the threads on exit. But the hazard becomes real as soon as we want to: * gracefully shut down one worker node in a multi-node deployment (e.g. for an in-place upgrade or rolling restart); * reconstruct the WorkerNode after a config reload; * tear down WorkerNode in unit tests. The remote-node-failure case is NOT a UAF: SendBatchRemoteAppend to a dead peer returns Fail, the local thread hands the batch to the local m_asyncWatchdog (which itself captures self=this on a local object), the watchdog retries up to MaxAttempts and gives up. As long as the LOCAL WorkerNode is alive, all dereferences are safe. The hazard is purely tied to local destruction. Fix: gate-then-drain shutdown in ~WorkerNode(). Phase 1: m_acceptingNewRequests is set to false. QueueRemoteAppend consults this flag at the top of its body and returns early with a warning log if shutdown has started. This ensures no NEW auto-flush thread can be spawned. Phase 2: wait for m_inflightAppendFlushes to reach zero. The wait is unbounded by design — with the gate set, each in-flight thread is bounded by its current SendBatchRemoteAppend gRPC call (~kTimeoutSec, default 180s) plus one more iteration that sees an empty queue and breaks. Concurrent threads drain in parallel, so worst-case wall time is one gRPC timeout regardless of how many threads are in flight. A hard timeout was considered and rejected: breaking out early would let detached threads outlive m_appendQueueMutex / m_appendQueue / m_asyncWatchdog and immediately UAF — strictly worse than a slow shutdown. If shutdown ever stays stuck past one gRPC timeout in production the diagnostic to chase is 'gRPC client is wedged', not 'tune a destructor timeout'. A periodic LL_Warning log every 30s reports how many threads are still inflight so operators see progress. Phase 3: members destruct in reverse declaration order; m_asyncWatchdog's own destructor (AsyncJobWatchdog.h:48) joins its loop thread, then the mutex / queue / etc. tear down with no live consumer threads. Also logs a LL_Warning if m_remoteQueueSize > 0 at destruction so callers are reminded to invoke FlushRemoteAppends first if they care about durability of the residue. Copilot inline review on PR #448 flagged the detach() lifecycle. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inc/Core/SPANN/Distributed/WorkerNode.h | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h index 116b6c25f..77a251262 100644 --- a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h +++ b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h @@ -103,6 +103,59 @@ namespace SPTAG::SPANN { public: bool Start() { return StartNetwork(); } + // Gate + drain shutdown: + // 1. Reject new QueueRemoteAppend producers via m_acceptingNewRequests. + // 2. Wait for any in-flight auto-flush detached threads to exit. + // With the gate set, each thread is bounded by its current + // SendBatchRemoteAppend call (~kTimeoutSec, default 180s) plus + // one more loop iteration that will see an empty queue (no + // new producers) and break. So worst-case wall time is one + // gRPC timeout, regardless of concurrency. + // 3. Member destruction runs after this body: m_asyncWatchdog's own + // destructor (AsyncJobWatchdog.h:48) joins its loop thread, then + // mutex/queue members tear down cleanly. + // Callers are expected to have invoked FlushRemoteAppends() before + // destruction; any residue in m_appendQueue is dropped with a warning. + // + // The wait is unbounded by design: a hard timeout here would let + // threads outlive the members they captured (m_appendQueueMutex / + // m_appendQueue / m_asyncWatchdog) and immediately UAF — strictly + // worse than a slow shutdown. If shutdown ever stays stuck past a + // gRPC timeout in production, the diagnostic to chase is "gRPC + // client is wedged", not "tune the destructor timeout". + ~WorkerNode() { + m_acceptingNewRequests.store(false, std::memory_order_release); + + // Log every 2x RPC timeout: that gives one full RPC cycle as + // the healthy-drain upper bound (gate -> each in-flight thread + // bounded by exactly one SendBatchRemoteAppend cycle), plus a + // second cycle of buffer so a slightly slow-but-healthy drain + // doesn't false-alarm. Past 2x is firmly into "gRPC client is + // wedged" territory and worth a LL_Warning. + const auto logInterval = std::chrono::seconds( + 2 * std::max(1, m_remoteOps.GetRpcTimeoutSec())); + + auto lastLogged = std::chrono::steady_clock::now(); + while (m_inflightAppendFlushes.load(std::memory_order_acquire) > 0) { + auto now = std::chrono::steady_clock::now(); + if (now - lastLogged >= logInterval) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "~WorkerNode: still waiting on %d in-flight auto-flush thread(s) " + "(exceeded 2x RPC timeout, gRPC may be wedged)\n", + m_inflightAppendFlushes.load(std::memory_order_relaxed)); + lastLogged = now; + } + std::this_thread::sleep_for(kShutdownPollInterval); + } + + const size_t residue = m_remoteQueueSize.load(std::memory_order_relaxed); + if (residue > 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "~WorkerNode: dropping %zu queued RemoteAppend item(s) at destruction; " + "caller should have invoked FlushRemoteAppends() first\n", residue); + } + } + // ---- Callbacks ---- // // ExtraDynamicSearcher passes its m_layer when binding callbacks so @@ -277,6 +330,12 @@ namespace SPTAG::SPANN { // ---- Append queue ---- void QueueRemoteAppend(int nodeIndex, RemoteAppendRequest req) { + if (!m_acceptingNewRequests.load(std::memory_order_acquire)) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "WorkerNode: rejecting QueueRemoteAppend to node %d during shutdown\n", + nodeIndex); + return; + } std::vector toFlush; bool didReserveSlot = false; { @@ -672,6 +731,20 @@ namespace SPTAG::SPANN { static constexpr size_t kAutoFlushThreshold = 50000; std::atomic m_maxInflightPerNode{4}; + // Gate: producers (QueueRemoteAppend) consult this; the destructor + // sets it to false to drain in-flight auto-flush threads to zero + // without new threads being spawned. + std::atomic m_acceptingNewRequests{true}; + + // Shutdown wait tuning (used only by ~WorkerNode). + // - kShutdownPollInterval: how often the destructor wakes to + // re-check m_inflightAppendFlushes. 20ms keeps p50 shutdown + // latency tight when threads exit between polls. + // - The progress-log cadence is derived at destruction time + // from m_remoteOps.GetRpcTimeoutSec() — see ~WorkerNode(). + static constexpr auto kShutdownPollInterval = + std::chrono::milliseconds(20); + // Resends failed async fire-and-forget batches with exponential // backoff (see AsyncJobWatchdog.h). Constructed last so it tears // down before the queues; declared here so destruction order From 5d4dbd3e8e32b62042c535ce4021bb0336c2ee0f Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 27 May 2026 11:00:21 +0000 Subject: [PATCH 46/48] fix(bench): move SPTAGTest CWD to per-scale scratch dir on NVMe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TestDataGenerator writes perftest_*.bin files (notably perftest_vector.bin which is Dim*BaseVectorCount bytes — 118GB at 1B with UInt8/dim=128) relative to CWD. The previous CWD was $SPTAG_DIR (the SPTAG repo dir, on /), so at 1B scale the 118GB write filled the root partition, truncated the file, then groundtruth generation's follow-up read raised 'Failed to read VectorSet' and aborted the build. Change every SPTAGTest invocation (driver build, driver run, worker) to cd into $DATA_DIR/scratch_${SCALE}_${NODE_COUNT}node/ instead. This puts perftest_*.bin (and TruthPath='truth', BENCHMARK_OUTPUT) on the same big NVMe volume that already holds the index data. distribute_perftest_files now takes the SCALE and rsyncs from the driver's SCRATCH_DIR to each worker's SCRATCH_DIR. cmd_deploy's perftest_* deploy section is dropped (it is redundant with the post-build distribute_perftest_files step and could not pick a scratch dir without knowing the scale anyway). cmd_cleanup also removes $DATA_DIR/scratch_*/ on every remote. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- evaluation/distributed/run_distributed.sh | 62 +++++++++++++---------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/evaluation/distributed/run_distributed.sh b/evaluation/distributed/run_distributed.sh index 28404c8a3..57f43f98b 100755 --- a/evaluation/distributed/run_distributed.sh +++ b/evaluation/distributed/run_distributed.sh @@ -237,18 +237,10 @@ cmd_deploy() { fi done - # Deploy data files (perftest_* vectors, queries) - echo "" - echo "Deploying data files..." - for host in "${NODE_HOSTS[@]}"; do - if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi - echo " → $host:$SPTAG_DIR/ (perftest_* files)" - remote_exec "$host" "mkdir -p $SPTAG_DIR" - rsync -az --progress \ - --include='perftest_*' --exclude='*' \ - -e "ssh $(_ssh_opts)" \ - "$SPTAG_DIR/" "$SSH_USER@$host:$SPTAG_DIR/" - done + # perftest_* data files are generated by SPTAGTest at runtime in SCRATCH_DIR + # and rsynced by distribute_perftest_files() during cmd_run, so cmd_deploy + # no longer needs to push them. (Pushing here also wouldn't know which + # scale's SCRATCH_DIR to source from.) echo "" echo "Deploy complete." @@ -732,9 +724,13 @@ start_remote_worker() { local NODE_COUNT="$4" local host="${NODE_HOSTS[$NODE_IDX]}" local LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_worker${NODE_IDX}.log" + local SCRATCH_DIR="$DATA_DIR/scratch_${SCALE}_${NODE_COUNT}node" - # Copy INI + binary to remote - remote_sync "$host" "$INI" "$SPTAG_DIR/worker_n${NODE_IDX}.ini" + # Ensure scratch dir exists on remote, then copy INI there. SPTAGTest's CWD + # is set to SCRATCH_DIR so TestDataGenerator's relative perftest_*.bin + # files land on the big NVMe disk, not on /. + remote_exec "$host" "mkdir -p $SCRATCH_DIR" + remote_sync "$host" "$INI" "$SCRATCH_DIR/worker_n${NODE_IDX}.ini" # Start worker via SSH (foreground on remote, background locally). # Use `ssh -n` to redirect stdin from /dev/null so SSH doesn't try to @@ -742,9 +738,9 @@ start_remote_worker() { # the SSH client sometimes silently re-points fd1 → /dev/null and fd2 # → a deleted /tmp file, dropping the worker log. ssh -n $(_ssh_opts) "$SSH_USER@$host" \ - "cd $SPTAG_DIR && LD_LIBRARY_PATH=$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH:-} \ + "cd $SCRATCH_DIR && LD_LIBRARY_PATH=$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH:-} \ WORKER_INDEX=${NODE_IDX} BENCHMARK_CONFIG=worker_n${NODE_IDX}.ini \ - ./Release/SPTAGTest --run_test=SPFreshTest/BenchmarkFromConfig 2>&1" \ + $SPTAG_DIR/Release/SPTAGTest --run_test=SPFreshTest/BenchmarkFromConfig 2>&1" \ "$LOG" 2>&1 & local ssh_pid=$! WORKER_SSH_PIDS+=($ssh_pid) @@ -846,16 +842,19 @@ distribute_head_index() { } distribute_perftest_files() { - # rsync generated perftest_* files from driver to workers. - local NODE_COUNT="$1" + # rsync generated perftest_* files from driver SCRATCH_DIR to worker SCRATCH_DIR. + local SCALE="$1" + local NODE_COUNT="$2" + local SCRATCH_DIR="$DATA_DIR/scratch_${SCALE}_${NODE_COUNT}node" echo "Distributing perftest_* data files to workers..." for (( i=1; i&1 ) \ | tee "$LOGDIR/benchmark_${SCALE}_1node_driver.log" @@ -950,7 +955,7 @@ cmd_run() { local BUILD_INI BUILD_INI=$(generate_ini "$SCALE" 1 "${BUILD_MODE_OVERRIDES[@]}" "BuildOnly=true" "${BUILD_VERSIONCACHE_OVERRIDES[@]}") || exit 1 - ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$BUILD_INI" \ + ( cd "$SCRATCH_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$BUILD_INI" \ BENCHMARK_OUTPUT="output_${SCALE}_1node_build.json" \ "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \ | tee "$LOGDIR/benchmark_${SCALE}_1node_build.log" @@ -974,7 +979,7 @@ cmd_run() { local RUN_INI RUN_INI=$(generate_ini "$SCALE" 1 "Rebuild=false" "VersionCacheTTLMs=0" "VersionCacheMaxChunks=0") || exit 1 - ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$RUN_INI" \ + ( cd "$SCRATCH_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$RUN_INI" \ BENCHMARK_OUTPUT="output_${SCALE}_1node.json" \ "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \ | tee "$LOGDIR/benchmark_${SCALE}_1node_driver.log" @@ -985,7 +990,7 @@ cmd_run() { INI=$(generate_ini "$SCALE" 1 "${BUILD_MODE_OVERRIDES[@]}") || exit 1 echo "Starting driver on ${NODE_HOSTS[0]}..." - ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$INI" \ + ( cd "$SCRATCH_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$INI" \ BENCHMARK_OUTPUT="output_${SCALE}_1node.json" \ "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \ | tee "$LOGDIR/benchmark_${SCALE}_1node_driver.log" @@ -1052,7 +1057,7 @@ cmd_run() { # launched during the build phase; they come up in Phase 3 (run). local BUILD_LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_build.log" echo "Starting driver build on ${NODE_HOSTS[0]}..." - ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$BUILD_INI" \ + ( cd "$SCRATCH_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$BUILD_INI" \ BENCHMARK_OUTPUT="output_${SCALE}_${NODE_COUNT}node_build.json" \ "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig ) \ > "$BUILD_LOG" 2>&1 & @@ -1089,7 +1094,7 @@ cmd_run() { rm -f "$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index/checkpoint.txt" distribute_head_index "$SCALE" "$NODE_COUNT" - distribute_perftest_files "$NODE_COUNT" + distribute_perftest_files "$SCALE" "$NODE_COUNT" # Sync SPTAGTest binary + bundled runtime libs to all workers so # they pick up the latest compiled changes. (cmd_deploy is a separate @@ -1133,7 +1138,7 @@ cmd_run() { # workers need to connect to for ring registration. local DRIVER_LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_driver.log" echo "Starting driver (dispatcher+worker0) on ${NODE_HOSTS[0]}..." - ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$RUN_INI" \ + ( cd "$SCRATCH_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$RUN_INI" \ BENCHMARK_OUTPUT="output_${SCALE}_${NODE_COUNT}node.json" \ "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig ) \ > "$DRIVER_LOG" 2>&1 & @@ -1269,7 +1274,10 @@ cmd_cleanup() { for i in $(seq 1 $((${#NODE_HOSTS[@]} - 1))); do local host="${NODE_HOSTS[$i]}" echo " Cleaning $host..." + # Older runs wrote perftest_* and worker_*.ini directly under + # $SPTAG_DIR; current runs put them in $DATA_DIR/scratch_*/. Clean both. remote_exec "$host" "rm -rf $SPTAG_DIR/Release/SPTAGTest $SPTAG_DIR/perftest_* $SPTAG_DIR/worker_*.ini" + remote_exec "$host" "rm -rf $DATA_DIR/scratch_*" # Clean index directories remote_exec "$host" "rm -rf $DATA_DIR/proidx_*" done From dabe74a4bb4b662ec6a80a1cccf5a9ec80f2f642 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Sun, 31 May 2026 03:26:19 +0000 Subject: [PATCH 47/48] fix(versionmap): restore per-layer Initialize to seed alive heads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the qiazh refactor commented out the m_versionMap->Initialize(...) call in ExtraDynamicSearcher::BuildIndex and replaced it with a dead per-VID Deleted+SetVersion loop, layer-1 (head index) postings were silently corrupted on the first async MergePostings: * TiKVVersionMap uses default=0xfe (deleted) for layer >0. With Initialize skipped, alive heads have no per-VID byte. GetVersion returns the 0xfe default → Deleted() returns true. * WriteDownAllPostingToDB stores version=GetVersion(headVID)=0xfe in every layer-1 base posting entry. * MergePostings' filter at L2021 (Deleted(VID) || GetVersion!=version) drops every entry, so the merged-with-neighbor head writes a tiny corrupted posting. After ~10K small async merges triggered during concurrent search-during-insert, the head index is destroyed. Symptom on the 1M+1M insert_dominant 1-node bench: pre-insert recall = 0.985, post-insert recall = 0.218 Fix: * Re-add Initialize to IVersionMap interface with default impl SetR(size) so existing implementations (array-backed) compile unchanged. * Make TiKVVersionMap::Initialize an explicit override (it already persists 0x00 for each alive head when m_layer > 0). * Add LocalVersionMap::Initialize override that explicitly writes 0x00 for each globalID -- the hashmap variant has the same default=0xfe problem when a key is missing. * Restore m_versionMap->Initialize(...) at ExtraDynamicSearcher.h:3653 with an explanatory comment block. Validated on 1M+1M insert_dominant 1-node: pre-insert recall = 0.9850 post-insert recall = 0.9830 (was 0.218 before fix) layer-1 async merges during run: 218 (was 54K before fix) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- AnnService/inc/Core/Common/IVersionMap.h | 24 ++++++++++++++ AnnService/inc/Core/Common/LocalVersionMap.h | 23 +++++++++++++ AnnService/inc/Core/Common/TiKVVersionMap.h | 2 +- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 32 +++++++++++-------- 4 files changed, 67 insertions(+), 14 deletions(-) diff --git a/AnnService/inc/Core/Common/IVersionMap.h b/AnnService/inc/Core/Common/IVersionMap.h index 9c9c8c7dd..e7621ec2e 100644 --- a/AnnService/inc/Core/Common/IVersionMap.h +++ b/AnnService/inc/Core/Common/IVersionMap.h @@ -30,6 +30,30 @@ namespace SPTAG virtual void DeleteAll() = 0; + /// One-time per-layer setup performed at the end of BuildIndex. + /// size total VID count for this layer (== m_opt->m_vectorSize) + /// blockSize/capacity hints for array-backed legacy maps; ignored + /// by hashmap / TiKV implementations + /// globalIDs (optional) set of GLOBAL VIDs that are alive on + /// this layer. Layers whose "default + /// version" semantics treat unknown VIDs as + /// DELETED (e.g. TiKV layer >0, hashmap + /// LocalVersionMap) MUST persist an + /// explicit alive byte for each globalID; + /// otherwise MergePostings' + /// Deleted()/version-mismatch filter + /// eats every base entry on the first + /// async merge and corrupts the head index. + /// Default impl: just bump the internal count via SetR. + virtual void Initialize(SizeType size, SizeType blockSize, SizeType capacity, + COMMON::Dataset* globalIDs = nullptr) + { + (void)blockSize; + (void)capacity; + (void)globalIDs; + SetR(size); + } + virtual SizeType Count() = 0; virtual SizeType GetDeleteCount() = 0; virtual std::uint64_t BufferSize() = 0; diff --git a/AnnService/inc/Core/Common/LocalVersionMap.h b/AnnService/inc/Core/Common/LocalVersionMap.h index 5b185183e..c01e1bdcd 100644 --- a/AnnService/inc/Core/Common/LocalVersionMap.h +++ b/AnnService/inc/Core/Common/LocalVersionMap.h @@ -27,6 +27,29 @@ namespace SPTAG m_label.clear(); } + void Initialize(SizeType size, SizeType blockSize, SizeType capacity, + COMMON::Dataset* globalIDs = nullptr) override + { + (void)size; + (void)blockSize; + (void)capacity; + if (globalIDs == nullptr || globalIDs->R() <= 0) return; + + // Hashmap LocalVersionMap treats missing keys as deleted + // (Deleted() returns true, GetVersion() returns 0xfe). + // Layer-1 build calls Initialize with the alive-head global + // IDs; we must explicitly mark them alive (0x00) so that + // MergePostings' Deleted()/version-mismatch filter does not + // strip every base head entry on the first async merge. + std::unique_lock lock(m_updateMutex); + for (SizeType i = 0; i < globalIDs->R(); i++) { + SizeType globalID = *(globalIDs->At(i)); + if (globalID >= 0) { + m_label[globalID] = 0x00; + } + } + } + SizeType Count() override { std::shared_lock lock(m_updateMutex); return (SizeType)(m_label.size()); diff --git a/AnnService/inc/Core/Common/TiKVVersionMap.h b/AnnService/inc/Core/Common/TiKVVersionMap.h index d85489686..8c9d4b5b9 100644 --- a/AnnService/inc/Core/Common/TiKVVersionMap.h +++ b/AnnService/inc/Core/Common/TiKVVersionMap.h @@ -212,7 +212,7 @@ namespace SPTAG std::shared_ptr GetDB() const { return m_db; } - void Initialize(SizeType size, SizeType blockSize, SizeType capacity, COMMON::Dataset* globalIDs = nullptr) + void Initialize(SizeType size, SizeType blockSize, SizeType capacity, COMMON::Dataset* globalIDs = nullptr) override { (void)blockSize; (void)capacity; diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index c12116f67..8b51f6b8d 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -3650,19 +3650,25 @@ namespace SPTAG::SPANN { auto fullVectors = p_reader->GetVectorSet(); if (m_opt->m_distCalcMethod == DistCalcMethod::Cosine && !p_reader->IsNormalized() && !p_headIndex->m_pQuantizer) fullVectors->Normalize(m_opt->m_iSSDNumberOfThreads); - //SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize versionMap\n"); - //m_versionMap->Initialize(m_opt->m_vectorSize, p_headIndex->m_iDataBlockSize, p_headIndex->m_iDataCapacity, &p_localToGlobal); - - if (p_localToGlobal.R() > 0) { - for (SizeType i = 0; i < p_localToGlobal.R(); i++) { - SizeType globalID = *(p_localToGlobal[i]); - if (m_versionMap->Deleted(globalID)) m_versionMap->SetVersion(globalID, -1); - } - } else { - for (SizeType i = 0; i < m_opt->m_vectorSize; i++) { - if (m_versionMap->Deleted(i)) m_versionMap->SetVersion(i, -1); - } - } + // Initialize the per-layer version map. For TiKVVersionMap this: + // - layer 0 (default=0x00 alive): bumps m_count only; no per-VID + // writes. Inserts later rely on the default 0x00 == alive. + // - layer >0 (default=0xfe deleted): writes 0x00 explicitly for + // each alive head in p_localToGlobal so MergePostings' + // Deleted()/GetVersion filter (L2021) doesn't silently drop + // legitimate base heads during async merges. Without this, + // layer-1 MergePostings reads stored version=0xfe, sees + // Deleted()=true (because per-VID byte is missing → reads + // default 0xfe), filters every entry, and writes back a + // corrupted near-empty posting -- destroying recall after + // even a single async merge. + // LocalVersionMap (hashmap) treats missing keys as deleted + // (returns 0xfe) and so has the same problem; its Initialize + // override also persists 0x00 for each globalID. + m_versionMap->Initialize(m_opt->m_vectorSize, + p_headIndex->m_iDataBlockSize, + p_headIndex->m_iDataCapacity, + &p_localToGlobal); SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: Writing values to DB\n"); From e7ef65d28d00f75878cef87de77e84ad40a90bc9 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Sun, 31 May 2026 14:19:18 +0000 Subject: [PATCH 48/48] perf(versionmap): batch per-VID hot loops via BatchGetVersions/MultiPut MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TiKV-backed VersionMap (commit 05d046ec) intentionally drops the in-memory chunk cache for correctness under multi-node concurrent writes. As a result every per-VID Deleted/GetVersion/SetVersion is a synchronous TiKV roundtrip (~1-2ms). The build-time per-VID hot loops in Initialize and the maintenance loops in MergePostings/Split/ RefineIndex/CollectReAssign were never optimized — each merge/split issued N serial RPCs where N is the posting size (~250 entries). This change batches those hot paths using the existing IVersionMap::BatchGetVersions (one MultiGet RPC) and KeyValueIO::MultiPut (one region-grouped batched RPC), without introducing a cache and without changing the per-VID key schema. TiKVVersionMap.h ---------------- - Initialize(layer 1): replace 200K serial PutByte with MultiPut in 4096-key chunks; fall back to serial PutByte if the backend lacks MultiPut. Layer-1 build dropped from 186s to 89s in the 1M+1M insert_dominant 1-node bench. - SetVersionBatch: route through MultiPut directly instead of a serial SetVersion loop. m_deleted accounting is approximate in the batched path (no read-old-then-write), which is acceptable because GetDeleteCount() returns 0 for the TiKV-backed map by design. ExtraDynamicSearcher.h ---------------------- Replace 7 inline 'Deleted(VID) || GetVersion(VID) != version' patterns with one BatchGetVersions per posting: - MergePostings: current-posting (with headID appended), next-posting, and post-merge reassign loops. - Split: filter-live-entries loop. The retry-on-invalid-VID semantics are preserved via a pre-scan before the batched read. - Split-merge: per-entry version reads. - RefineIndex: per-entry + globalID version reads. - CollectReAssign: postingLists loop and nearbyPostings loop. Plus the two RemoteAppend mirror loops in the receiver callbacks (AppendCallback and BatchAppendCallback) now batch the per-record GetVersion reads before issuing the SetVersionBatch write. Measured impact (1M+1M insert_dominant, 1-node, TiKV): - Total build time: 693s -> 585s (-16%) - Layer 1 BuildSSDIndex: 186s -> 89s (-52%) - Pre-insert recall: 0.98 (unchanged) - Pre-insert search QPS (warm round 2): 526 (recovered) - Insert throughput: ~205 ops/s (unchanged; the remaining bottleneck is per-Append db->Merge TiKV CAS, which is per-key and cannot be batched across keys) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- AnnService/inc/Core/Common/TiKVVersionMap.h | 86 ++++++- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 224 +++++++++++++----- 2 files changed, 249 insertions(+), 61 deletions(-) diff --git a/AnnService/inc/Core/Common/TiKVVersionMap.h b/AnnService/inc/Core/Common/TiKVVersionMap.h index 8c9d4b5b9..61b131575 100644 --- a/AnnService/inc/Core/Common/TiKVVersionMap.h +++ b/AnnService/inc/Core/Common/TiKVVersionMap.h @@ -6,6 +6,7 @@ #include "IVersionMap.h" #include "inc/Helper/KeyValueIO.h" +#include #include #include #include @@ -233,10 +234,44 @@ namespace SPTAG m_deleted = size; SaveMetadata(); + // Batch the alive-marker writes via MultiPut so they + // can be grouped per TiKV region and issued in parallel. + // Serial PutByte was the build-time hotspot (~1-2ms + // per write × ~200K alive heads at 1M-vector scale). + std::vector aliveSorted; + aliveSorted.reserve(aliveIDs.size()); + for (SizeType id : aliveIDs) aliveSorted.push_back(id); + std::sort(aliveSorted.begin(), aliveSorted.end()); + SizeType written = 0; - for (SizeType globalID : aliveIDs) { - if (PutByte(VersionKey(globalID), 0x00) == ErrorCode::Success) { - written++; + constexpr size_t kBatchSize = 4096; + std::vector keys; + std::vector values; + keys.reserve(kBatchSize); + values.reserve(kBatchSize); + const std::string aliveByte(1, static_cast(0x00)); + for (size_t i = 0; i < aliveSorted.size(); i++) { + keys.push_back(VersionKey(aliveSorted[i])); + values.push_back(aliveByte); + if (keys.size() >= kBatchSize || i + 1 == aliveSorted.size()) { + auto ret = m_db->MultiPut(keys, values, MaxTimeout, nullptr); + if (ret == ErrorCode::Success) { + written += static_cast(keys.size()); + } else if (ret == ErrorCode::Undefined) { + // Backend lacks MultiPut: fall back to serial PutByte. + for (const auto& k : keys) { + if (PutByte(k, 0x00) == ErrorCode::Success) written++; + } + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "TiKVVersionMap::Initialize: MultiPut batch failed layer=%d ret=%d size=%zu; falling back to serial PutByte for this batch.\n", + m_layer, static_cast(ret), keys.size()); + for (const auto& k : keys) { + if (PutByte(k, 0x00) == ErrorCode::Success) written++; + } + } + keys.clear(); + values.clear(); } } m_deleted = size - written; @@ -336,15 +371,52 @@ namespace SPTAG } // Per-VID batch write: mirrors SetVersion() for each (vid, ver) pair. - // The new per-VID-key TiKVVersionMap has no chunked batching path, so - // this is a thin convenience loop. Performance-sensitive callers - // can switch to m_db->MultiPut() directly if profiling requires it. + // Uses TiKVIO MultiPut so the writes are grouped per TiKV region + // and issued in parallel. m_deleted accounting is approximate + // here (we do not read the old byte to compute the exact delta); + // GetDeleteCount() returns 0 for the TiKV-backed version map so + // this approximation is acceptable. Callers that need precise + // accounting can call SetVersion() per-VID instead. void SetVersionBatch(const std::vector& vids, const std::vector& versions) override { size_t n = std::min(vids.size(), versions.size()); if (n == 0) return; + + SizeType count = m_count.load(); + std::vector keys; + std::vector values; + keys.reserve(n); + values.reserve(n); for (size_t i = 0; i < n; ++i) { - SetVersion(vids[i], versions[i]); + if (vids[i] < 0 || vids[i] >= count) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "TiKVVersionMap::SetVersionBatch: invalid key %d (max %d)\n", + vids[i], count); + continue; + } + keys.push_back(VersionKey(vids[i])); + values.push_back(std::string(1, static_cast(versions[i]))); + } + if (keys.empty()) return; + + auto ret = m_db->MultiPut(keys, values, MaxTimeout, nullptr); + if (ret == ErrorCode::Undefined) { + // Backend lacks MultiPut: fall back to serial SetVersion + // which preserves m_deleted accounting. + for (size_t i = 0; i < n; ++i) { + if (vids[i] >= 0 && vids[i] < count) { + SetVersion(vids[i], versions[i]); + } + } + } else if (ret != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "TiKVVersionMap::SetVersionBatch: MultiPut failed layer=%d ret=%d keys=%zu; falling back to per-VID SetVersion.\n", + m_layer, static_cast(ret), keys.size()); + for (size_t i = 0; i < n; ++i) { + if (vids[i] >= 0 && vids[i] < count) { + SetVersion(vids[i], versions[i]); + } + } } } diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 8b51f6b8d..9d19265c4 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -595,21 +595,37 @@ namespace SPTAG::SPANN { const uint8_t* basePtr = reinterpret_cast(appendPosting.data()); size_t totalRec = appendPosting.size() / m_vectorInfoSize; - std::vector batchVids; - std::vector batchVers; - batchVids.reserve(totalRec); - batchVers.reserve(totalRec); + // Pre-build the candidate set and batch-read current + // versions to avoid one TiKV Get per record. + std::vector candIdx; + std::vector candVids; + std::vector candRecVers; + candIdx.reserve(totalRec); + candVids.reserve(totalRec); + candRecVers.reserve(totalRec); for (size_t i = 0; i < totalRec; ++i) { const uint8_t* p = basePtr + i * m_vectorInfoSize; SizeType vid = *reinterpret_cast(p); uint8_t recVer = *(p + sizeof(SizeType)); if (vid < 0) continue; if (recVer == 0xfe) continue; - uint8_t curVer = m_versionMap->GetVersion(vid); + candIdx.push_back(i); + candVids.push_back(vid); + candRecVers.push_back(recVer); + } + std::vector curVers; + m_versionMap->BatchGetVersions(candVids, curVers); + + std::vector batchVids; + std::vector batchVers; + batchVids.reserve(candVids.size()); + batchVers.reserve(candVids.size()); + for (size_t k = 0; k < candVids.size(); ++k) { + uint8_t curVer = curVers[k]; if (curVer == 0xfe) continue; - if (curVer == recVer) continue; - batchVids.push_back(vid); - batchVers.push_back(recVer); + if (curVer == candRecVers[k]) continue; + batchVids.push_back(candVids[k]); + batchVers.push_back(candRecVers[k]); } if (!batchVids.empty()) { m_versionMap->SetVersionBatch(batchVids, batchVers); @@ -670,21 +686,35 @@ namespace SPTAG::SPANN { const uint8_t* basePtr = reinterpret_cast(req->m_appendPosting.data()); size_t totalRec = req->m_appendPosting.size() / m_vectorInfoSize; - std::vector batchVids; - std::vector batchVers; - batchVids.reserve(totalRec); - batchVers.reserve(totalRec); + std::vector candIdx; + std::vector candVids; + std::vector candRecVers; + candIdx.reserve(totalRec); + candVids.reserve(totalRec); + candRecVers.reserve(totalRec); for (size_t k = 0; k < totalRec; ++k) { const uint8_t* p = basePtr + k * m_vectorInfoSize; SizeType vid = *reinterpret_cast(p); uint8_t recVer = *(p + sizeof(SizeType)); if (vid < 0) continue; if (recVer == 0xfe) continue; - uint8_t curVer = m_versionMap->GetVersion(vid); + candIdx.push_back(k); + candVids.push_back(vid); + candRecVers.push_back(recVer); + } + std::vector curVers; + m_versionMap->BatchGetVersions(candVids, curVers); + + std::vector batchVids; + std::vector batchVers; + batchVids.reserve(candVids.size()); + batchVers.reserve(candVids.size()); + for (size_t k = 0; k < candVids.size(); ++k) { + uint8_t curVer = curVers[k]; if (curVer == 0xfe) continue; - if (curVer == recVer) continue; - batchVids.push_back(vid); - batchVers.push_back(recVer); + if (curVer == candRecVers[k]) continue; + batchVids.push_back(candVids[k]); + batchVers.push_back(candRecVers[k]); } if (!batchVids.empty()) { m_versionMap->SetVersionBatch(batchVids, batchVers); @@ -1135,15 +1165,25 @@ namespace SPTAG::SPANN { int vectorCount = 0; std::shared_ptr vecStr; bool hasHead = false; + // Batched version-byte read for this posting + globalID head. + std::vector rf_vids; + rf_vids.reserve(postVectorNum + 1); + for (SizeType j = 0; j < postVectorNum; j++) { + rf_vids.push_back(*((SizeType*)(postingP + j * m_vectorInfoSize))); + } + rf_vids.push_back(globalID); + std::vector rf_mapVers; + m_versionMap->BatchGetVersions(rf_vids, rf_mapVers); for (int j = 0; j < postVectorNum; j++, vectorId += m_vectorInfoSize) { uint8_t version = *(vectorId + sizeof(SizeType)); - SizeType VID = *((SizeType *)(vectorId)); + SizeType VID = rf_vids[j]; if (VID == globalID) vecStr = std::make_shared((char*)vectorId + m_metaDataSize, m_vectorDataSize); - if (m_versionMap->Deleted(VID) || m_versionMap->GetVersion(VID) != version) + uint8_t mapVer = rf_mapVers[j]; + if (mapVer == 0xfe || mapVer != version) continue; if (VID == globalID) hasHead = true; @@ -1159,7 +1199,7 @@ namespace SPTAG::SPANN { } if (!hasHead && vecStr != nullptr) { - Serialize((char*)postingP + vectorCount * m_vectorInfoSize, globalID, m_versionMap->GetVersion(globalID), vecStr->data()); + Serialize((char*)postingP + vectorCount * m_vectorInfoSize, globalID, rf_mapVers.back(), vecStr->data()); vectorCount++; } if (vectorCount <= m_mergeThreshold) mergelist.insert(globalID); @@ -1280,30 +1320,50 @@ namespace SPTAG::SPANN { localIndices.reserve(postVectorNum); uint8_t* vectorId = postingP; bool hasHead = false; - for (SizeType j = 0; j < postVectorNum; j++, vectorId += m_vectorInfoSize) + + // Pre-scan for invalid VIDs (treat as corruption marker + // that triggers retry of the GET, matching the original + // serial-loop behaviour) before issuing the batched + // version-byte read. { - //LOG(Helper::LogLevel::LL_Info, "vector index/total:id: %d/%d:%d\n", j, m_postingSizes[headID].load(), *(reinterpret_cast(vectorId))); - uint8_t version = *(vectorId + sizeof(SizeType)); - SizeType VID = *((SizeType*)(vectorId)); - if (VID < 0 || VID >= m_versionMap->Count()) - { - if (retry < 3) - { + bool sawInvalid = false; + SizeType maxVid = m_versionMap->Count(); + for (SizeType j = 0; j < postVectorNum; j++) { + SizeType VID = *((SizeType*)(postingP + j * m_vectorInfoSize)); + if (VID < 0 || VID >= maxVid) { sawInvalid = true; break; } + } + if (sawInvalid) { + if (retry < 3) { retry++; goto Retry; - } - else - { + } else { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "Split fail: Get posting %lld fail after 3 times retries.\n", (std::int64_t)(headID)); + "Split fail: Get posting %lld fail after 3 times retries.\n", (std::int64_t)headID); return ErrorCode::DiskIOFail; } } - + } + + // Batched MultiGet for every entry's version byte plus headID's. + std::vector sp_vids; + sp_vids.reserve(postVectorNum + 1); + for (SizeType j = 0; j < postVectorNum; j++) { + sp_vids.push_back(*((SizeType*)(postingP + j * m_vectorInfoSize))); + } + sp_vids.push_back(headID); + std::vector sp_mapVers; + m_versionMap->BatchGetVersions(sp_vids, sp_mapVers); + + for (SizeType j = 0; j < postVectorNum; j++, vectorId += m_vectorInfoSize) + { + //LOG(Helper::LogLevel::LL_Info, "vector index/total:id: %d/%d:%d\n", j, m_postingSizes[headID].load(), *(reinterpret_cast(vectorId))); + uint8_t version = *(vectorId + sizeof(SizeType)); + SizeType VID = sp_vids[j]; + if (VID == headID) headVec = std::make_shared((char*)vectorId, m_vectorInfoSize); - //if (VID >= m_versionMap.Count()) SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "DEBUG: vector ID:%d total size:%d\n", VID, m_versionMap.Count()); - if (m_versionMap->Deleted(VID) || m_versionMap->GetVersion(VID) != version) continue; + uint8_t mapVer = sp_mapVers[j]; + if (mapVer == 0xfe || mapVer != version) continue; if (VID == headID) hasHead = true; localIndices.push_back(j); @@ -1312,7 +1372,7 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Split fail: cannot find head in posting! headID:%lld\n", (std::int64_t)headID); return ErrorCode::Fail; } else { - *((uint8_t*)(headVec->data() + sizeof(SizeType))) = m_versionMap->GetVersion(headID); + *((uint8_t*)(headVec->data() + sizeof(SizeType))) = sp_mapVers.back(); } // double gcEndTime = sw.getElapsedMs(); // m_splitGcCost += gcEndTime; @@ -1676,10 +1736,19 @@ namespace SPTAG::SPANN { auto *postingK = reinterpret_cast(currentPostingList.data()); size_t newPostVectorNum = currentPostingList.size() / m_vectorInfoSize; + // Batched version-byte read for this posting we're merging into. + std::vector sm_vids; + sm_vids.reserve(newPostVectorNum); + for (size_t j = 0; j < newPostVectorNum; j++) { + sm_vids.push_back(*((SizeType*)(postingK + j * m_vectorInfoSize))); + } + std::vector sm_mapVers; + m_versionMap->BatchGetVersions(sm_vids, sm_mapVers); for (int j = 0; j < (int)newPostVectorNum; j++, postingK += m_vectorInfoSize) { - SizeType VID = *((SizeType *)(postingK)); + SizeType VID = sm_vids[j]; uint8_t verK = *(postingK + sizeof(SizeType)); - if (m_versionMap->Deleted(VID) || m_versionMap->GetVersion(VID) != verK) continue; + uint8_t mapVer = sm_mapVers[j]; + if (mapVer == 0xfe || mapVer != verK) continue; if (vectorIdSet.find(VID) != vectorIdSet.end()) continue; vectorIdSet.insert(VID); mergedPostingList += currentPostingList.substr(j * m_vectorInfoSize, m_vectorInfoSize); @@ -2011,14 +2080,26 @@ namespace SPTAG::SPANN { int currentLength = 0; uint8_t* vectorId = postingP; std::shared_ptr headVec; - for (int j = 0; j < postVectorNum; j++, vectorId += m_vectorInfoSize) + // Batch one TiKV MultiGet for the entire posting's version + // bytes (plus the head's own version) instead of two serial + // TiKV roundtrips per entry. Last slot is headID's version. + std::vector mp_vids; + mp_vids.reserve(postVectorNum + 1); + for (size_t j = 0; j < postVectorNum; j++) { + mp_vids.push_back(*((SizeType*)(postingP + j * m_vectorInfoSize))); + } + mp_vids.push_back(headID); + std::vector mp_mapVers; + m_versionMap->BatchGetVersions(mp_vids, mp_mapVers); + for (int j = 0; j < (int)postVectorNum; j++, vectorId += m_vectorInfoSize) { - SizeType VID = *((SizeType*)(vectorId)); + SizeType VID = mp_vids[j]; uint8_t version = *(vectorId + sizeof(SizeType)); if (VID == headID) { headVec = std::make_shared((char*)vectorId, m_vectorInfoSize); } - if (m_versionMap->Deleted(VID) || m_versionMap->GetVersion(VID) != version) continue; + uint8_t mapVer = mp_mapVers[j]; + if (mapVer == 0xfe || mapVer != version) continue; vectorIdSet.insert(VID); mergedPostingList += currentPostingList.substr(j * m_vectorInfoSize, m_vectorInfoSize); currentLength++; @@ -2028,7 +2109,7 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MergePostings fail: cannot find head vector in posting! headID:%lld\n", (std::int64_t)headID); return ErrorCode::Fail; } else { - *((uint8_t*)(headVec->data() + sizeof(SizeType))) = m_versionMap->GetVersion(headID); + *((uint8_t*)(headVec->data() + sizeof(SizeType))) = mp_mapVers.back(); } if (currentLength > m_mergeThreshold) @@ -2128,12 +2209,21 @@ namespace SPTAG::SPANN { postVectorNum = nextPostingList.size() / m_vectorInfoSize; vectorId = postingP; int nextLength = 0; - for (int j = 0; j < postVectorNum; j++, vectorId += m_vectorInfoSize) + // Batched version-byte read for this next posting. + std::vector mp_next_vids; + mp_next_vids.reserve(postVectorNum); + for (size_t j = 0; j < postVectorNum; j++) { + mp_next_vids.push_back(*((SizeType*)(postingP + j * m_vectorInfoSize))); + } + std::vector mp_next_mapVers; + m_versionMap->BatchGetVersions(mp_next_vids, mp_next_mapVers); + for (int j = 0; j < (int)postVectorNum; j++, vectorId += m_vectorInfoSize) { - SizeType VID = *((SizeType*)(vectorId)); + SizeType VID = mp_next_vids[j]; uint8_t version = *(vectorId + sizeof(SizeType)); if (VID == queryResult->VID) resultVec = std::make_shared((char*)vectorId, m_vectorInfoSize); - if (m_versionMap->Deleted(VID) || m_versionMap->GetVersion(VID) != version) continue; + uint8_t mapVer = mp_next_mapVers[j]; + if (mapVer == 0xfe || mapVer != version) continue; if (vectorIdSet.find(VID) == vectorIdSet.end()) { nextVectorIdSet.insert(VID); mergedPostingList += nextPostingList.substr(j * m_vectorInfoSize, m_vectorInfoSize); @@ -2212,12 +2302,20 @@ namespace SPTAG::SPANN { if (!m_opt->m_disableReassign) { postingP = reinterpret_cast(deletedPostingList->data()); + // Batched version-byte read for the about-to-be-removed posting. + std::vector mp_del_vids; + mp_del_vids.reserve(deletedLength); + for (int j = 0; j < deletedLength; j++) { + mp_del_vids.push_back(*((SizeType*)(postingP + j * m_vectorInfoSize))); + } + std::vector mp_del_mapVers; + m_versionMap->BatchGetVersions(mp_del_vids, mp_del_mapVers); for (int j = 0; j < deletedLength; j++) { uint8_t* vectorId = postingP + j * m_vectorInfoSize; - SizeType VID = *(reinterpret_cast(vectorId)); uint8_t version = *(vectorId + sizeof(SizeType)); ValueType* vector = reinterpret_cast(vectorId + m_metaDataSize); - if (m_versionMap->Deleted(VID) || m_versionMap->GetVersion(VID) != version) continue; + uint8_t mapVer = mp_del_mapVers[j]; + if (mapVer == 0xfe || mapVer != version) continue; float origin_dist = m_headIndex->ComputeDistance(deletedHeadVec->data() + m_metaDataSize, vector); float current_dist = m_headIndex->ComputeDistance(nextHeadVec->data() + m_metaDataSize, vector); if (current_dist > origin_dist) { @@ -2408,19 +2506,28 @@ namespace SPTAG::SPANN { auto& postingList = postingLists[i]; size_t postVectorNum = postingList.size() / m_vectorInfoSize; auto* postingP = reinterpret_cast(postingList.data()); - for (int j = 0; j < postVectorNum; j++) { + // Batched version-byte read for the entire posting. + std::vector cr_vids; + cr_vids.reserve(postVectorNum); + for (size_t j = 0; j < postVectorNum; j++) { + cr_vids.push_back(*((SizeType*)(postingP + j * m_vectorInfoSize))); + } + std::vector cr_mapVers; + m_versionMap->BatchGetVersions(cr_vids, cr_mapVers); + const SizeType maxVid = m_versionMap->Count(); + for (size_t j = 0; j < postVectorNum; j++) { uint8_t* vectorId = postingP + j * m_vectorInfoSize; - SizeType vid = *(reinterpret_cast(vectorId)); + SizeType vid = cr_vids[j]; uint8_t version = *(reinterpret_cast(vectorId + sizeof(SizeType))); ValueType* vector = reinterpret_cast(vectorId + m_metaDataSize); - const SizeType maxVid = m_versionMap->Count(); if (vid < 0 || vid >= maxVid) { SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "CollectReAssign: skip invalid VID %d (max %d) in posting headID=%d\n", vid, maxVid, newHeadsID[i]); continue; } - if (reAssignVectorsTopK.find(vid) == reAssignVectorsTopK.end() && !m_versionMap->Deleted(vid) && m_versionMap->GetVersion(vid) == version) { + uint8_t mapVer = cr_mapVers[j]; + if (reAssignVectorsTopK.find(vid) == reAssignVectorsTopK.end() && mapVer != 0xfe && mapVer == version) { m_stat.m_reAssignScanNum++; float dist = m_headIndex->ComputeDistance(newHeadsVec[i]->data(), vector); if (CheckIsNeedReassign(newHeadsVec, vector, headVector, newHeadsDist[i], dist, true)) { @@ -2485,19 +2592,28 @@ namespace SPTAG::SPANN { auto& postingList = nearbyPostings[i]; size_t postVectorNum = postingList.size() / m_vectorInfoSize; auto* postingP = reinterpret_cast(postingList.data()); - for (int j = 0; j < postVectorNum; j++) { + // Batched version-byte read for the nearby posting. + std::vector nb_vids; + nb_vids.reserve(postVectorNum); + for (size_t j = 0; j < postVectorNum; j++) { + nb_vids.push_back(*((SizeType*)(postingP + j * m_vectorInfoSize))); + } + std::vector nb_mapVers; + m_versionMap->BatchGetVersions(nb_vids, nb_mapVers); + const SizeType maxVid = m_versionMap->Count(); + for (size_t j = 0; j < postVectorNum; j++) { uint8_t* vectorId = postingP + j * m_vectorInfoSize; - SizeType vid = *(reinterpret_cast(vectorId)); + SizeType vid = nb_vids[j]; uint8_t version = *(reinterpret_cast(vectorId + sizeof(SizeType))); ValueType* vector = reinterpret_cast(vectorId + m_metaDataSize); - const SizeType maxVid = m_versionMap->Count(); if (vid < 0 || vid >= maxVid) { SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "CollectReAssign(nearby): skip invalid VID %d (max %d) in posting headID=%d\n", vid, maxVid, HeadPrevTopK[i]); continue; } - if (reAssignVectorsTopK.find(vid) == reAssignVectorsTopK.end() && !m_versionMap->Deleted(vid) && m_versionMap->GetVersion(vid) == version) { + uint8_t mapVer = nb_mapVers[j]; + if (reAssignVectorsTopK.find(vid) == reAssignVectorsTopK.end() && mapVer != 0xfe && mapVer == version) { m_stat.m_reAssignScanNum++; float dist = m_headIndex->ComputeDistance(HeadPrevTopKVec[i]->data(), vector); if (CheckIsNeedReassign(newHeadsVec, vector, headVector, newHeadsDist[i], dist, false)) {