diff --git a/.github/workflows/sanitizer_test.yml b/.github/workflows/sanitizer_test.yml index efe9f49ac..faac265e8 100644 --- a/.github/workflows/sanitizer_test.yml +++ b/.github/workflows/sanitizer_test.yml @@ -58,9 +58,9 @@ jobs: - name: Run Tests working-directory: build env: - ASAN_OPTIONS: log_path=out.log:detect_leaks=1:symbolize=1:strict_string_checks=1:halt_on_error=1:detect_container_overflow=0 + ASAN_OPTIONS: log_path=${{ github.workspace }}/asan.log:detect_leaks=1:symbolize=1:strict_string_checks=1:halt_on_error=1:detect_container_overflow=0 LSAN_OPTIONS: suppressions=${{ github.workspace }}/.github/lsan-suppressions.txt - UBSAN_OPTIONS: log_path=out.log:halt_on_error=1:print_stacktrace=1:suppressions=${{ github.workspace }}/.github/ubsan-suppressions.txt + UBSAN_OPTIONS: log_path=${{ github.workspace }}/ubsan.log:halt_on_error=1:print_stacktrace=1:suppressions=${{ github.workspace }}/.github/ubsan-suppressions.txt run: | ctest --output-on-failure - name: Save the test output @@ -68,4 +68,8 @@ jobs: uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: test-output - path: build/test/out.log* + path: | + asan.log* + ubsan.log* + build/Testing/Temporary/LastTest.log + build/Testing/Temporary/LastTestsFailed.log diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 18cf70bdb..38d85534a 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -89,6 +89,7 @@ set(ICEBERG_SOURCES type.cc update/expire_snapshots.cc update/fast_append.cc + update/merging_snapshot_update.cc update/pending_update.cc update/set_snapshot.cc update/snapshot_manager.cc diff --git a/src/iceberg/manifest/manifest_entry.h b/src/iceberg/manifest/manifest_entry.h index 17c5388bd..e2dd5ea61 100644 --- a/src/iceberg/manifest/manifest_entry.h +++ b/src/iceberg/manifest/manifest_entry.h @@ -374,7 +374,7 @@ struct ICEBERG_EXPORT ManifestEntry { ManifestEntry AsAdded() const { ManifestEntry copy = *this; copy.status = ManifestStatus::kAdded; - if (copy.data_file->first_row_id.has_value()) { + if (copy.data_file != nullptr && copy.data_file->first_row_id.has_value()) { copy.data_file = std::make_unique(*copy.data_file); copy.data_file->first_row_id = std::nullopt; } diff --git a/src/iceberg/manifest/manifest_filter_manager.cc b/src/iceberg/manifest/manifest_filter_manager.cc index 086c94a78..49fec0461 100644 --- a/src/iceberg/manifest/manifest_filter_manager.cc +++ b/src/iceberg/manifest/manifest_filter_manager.cc @@ -20,6 +20,7 @@ #include "iceberg/manifest/manifest_filter_manager.h" #include +#include #include #include @@ -29,6 +30,7 @@ #include "iceberg/expression/manifest_evaluator.h" #include "iceberg/expression/residual_evaluator.h" #include "iceberg/expression/strict_metrics_evaluator.h" +#include "iceberg/file_io.h" #include "iceberg/manifest/manifest_entry.h" #include "iceberg/manifest/manifest_list.h" #include "iceberg/manifest/manifest_reader.h" @@ -63,18 +65,76 @@ Result FormatPartitionPath(const PartitionSpecsById& specs_by_id, return spec->PartitionPath(file.partition); } +void AddDeletedFileToManager(ManifestContent manifest_content, DataFileSet& data_files, + DeleteFileSet& delete_files, + std::vector>& deleted_files, + DataFileSet& deleted_data_file_set, + DeleteFileSet& deleted_file_set, + const std::shared_ptr& file) { + if (file == nullptr) { + return; + } + + bool inserted; + if (manifest_content == ManifestContent::kData) { + data_files.insert(file); + inserted = deleted_data_file_set.insert(file).second; + } else { + delete_files.insert(file); + inserted = deleted_file_set.insert(file).second; + } + if (inserted) { + deleted_files.push_back(file); + } +} + +bool AddDeletedFileToManifest(ManifestContent manifest_content, + std::vector>& deleted_files, + DataFileSet& deleted_data_file_set, + DeleteFileSet& deleted_file_set, + const std::shared_ptr& file) { + if (file == nullptr) { + return false; + } + bool inserted = manifest_content == ManifestContent::kData + ? deleted_data_file_set.insert(file).second + : deleted_file_set.insert(file).second; + if (inserted) { + deleted_files.push_back(file); + } + return inserted; +} + } // namespace -ManifestFilterManager::ManifestFilterManager(ManifestContent content, - std::shared_ptr file_io) +Result> ManifestFilterManager::Make( + ManifestContent content, std::shared_ptr file_io, + std::function delete_file) { + ICEBERG_PRECHECK(file_io != nullptr, "FileIO cannot be null"); + return std::unique_ptr( + new ManifestFilterManager(content, std::move(file_io), std::move(delete_file))); +} + +ManifestFilterManager::ManifestFilterManager( + ManifestContent content, std::shared_ptr file_io, + std::function delete_file) : manifest_content_(content), file_io_(std::move(file_io)), - delete_expr_(Expressions::AlwaysFalse()) {} + delete_file_(std::move(delete_file)), + delete_expr_(Expressions::AlwaysFalse()) { + ICEBERG_DCHECK(file_io_, "FileIO cannot be null"); + if (delete_file_ == nullptr) { + delete_file_ = [this](const std::string& location) { + return file_io_->DeleteFile(location); + }; + } +} ManifestFilterManager::~ManifestFilterManager() = default; Status ManifestFilterManager::DeleteByRowFilter(std::shared_ptr expr) { ICEBERG_PRECHECK(expr != nullptr, "Cannot delete files using filter: null"); + ICEBERG_RETURN_UNEXPECTED(InvalidateFilteredCache()); ICEBERG_ASSIGN_OR_RAISE(delete_expr_, Or::MakeFolded(delete_expr_, std::move(expr))); manifest_evaluator_cache_.clear(); residual_evaluator_cache_.clear(); @@ -87,23 +147,58 @@ void ManifestFilterManager::CaseSensitive(bool case_sensitive) { residual_evaluator_cache_.clear(); } -void ManifestFilterManager::DeleteFile(std::string_view path) { +Status ManifestFilterManager::DeleteFile(std::string_view path) { + ICEBERG_RETURN_UNEXPECTED(InvalidateFilteredCache()); delete_paths_.insert(std::string(path)); + return {}; } Status ManifestFilterManager::DeleteFile(std::shared_ptr file) { ICEBERG_PRECHECK(file != nullptr, "Cannot delete file: null"); - delete_paths_.insert(file->file_path); - delete_files_.insert(std::move(file)); + ICEBERG_RETURN_UNEXPECTED(InvalidateFilteredCache()); + if (manifest_content_ == ManifestContent::kData) { + data_files_.insert(file); + data_files_to_delete_.insert(std::move(file)); + } else { + delete_files_.insert(file); + delete_files_to_delete_.insert(std::move(file)); + } return {}; } -const DataFileSet& ManifestFilterManager::FilesToBeDeleted() const { - return delete_files_; +const DataFileSet& ManifestFilterManager::FilesToBeDeleted() const { return data_files_; } + +const std::vector>& ManifestFilterManager::DeletedFiles() + const { + return deleted_files_; +} + +Result ManifestFilterManager::BuildSummary( + const std::vector& manifests, + const PartitionSpecsById& specs_by_id) const { + SnapshotSummaryBuilder summary; + for (const auto& manifest : manifests) { + auto deleted_iter = filtered_manifest_to_deleted_files_.find(manifest); + if (deleted_iter == filtered_manifest_to_deleted_files_.end()) { + continue; + } + + ICEBERG_ASSIGN_OR_RAISE(auto spec, + PartitionSpecById(specs_by_id, manifest.partition_spec_id)); + for (const auto& file : deleted_iter->second.files) { + if (file != nullptr) { + ICEBERG_RETURN_UNEXPECTED(summary.DeletedFile(*spec, *file)); + } + } + } + summary.IncrementDuplicateDeletes(duplicate_deletes_count_); + return summary; } -void ManifestFilterManager::DropPartition(int32_t spec_id, PartitionValues partition) { +Status ManifestFilterManager::DropPartition(int32_t spec_id, PartitionValues partition) { + ICEBERG_RETURN_UNEXPECTED(InvalidateFilteredCache()); drop_partitions_.add(spec_id, std::move(partition)); + return {}; } void ManifestFilterManager::FailMissingDeletePaths() { @@ -114,15 +209,32 @@ void ManifestFilterManager::FailAnyDelete() { fail_any_delete_ = true; } bool ManifestFilterManager::ContainsDeletes() const { return HasRowFilterExpression(delete_expr_) || !delete_paths_.empty() || + !data_files_to_delete_.empty() || !delete_files_to_delete_.empty() || !drop_partitions_.empty(); } +Status ManifestFilterManager::DropDeleteFilesOlderThan(int64_t sequence_number) { + ICEBERG_PRECHECK(sequence_number >= 0, "Invalid minimum data sequence number: {}", + sequence_number); + min_sequence_number_ = sequence_number; + return {}; +} + +void ManifestFilterManager::RemoveDanglingDeletesFor(const DataFileSet& deleted_files) { + std::unordered_set removed_data_file_paths; + for (const auto& file : deleted_files) { + if (file != nullptr) { + removed_data_file_paths.insert(file->file_path); + } + } + removed_data_file_paths_ = std::move(removed_data_file_paths); +} + Result ManifestFilterManager::CanContainDroppedFiles(const ManifestFile&) const { - // TODO(Guotao): Use the manifest descriptor to skip unrelated object-delete - // manifests once object-delete partitions are tracked separately. - // Currently, DeleteFile(std::shared_ptr) degrades to a path-based delete, - // which forces scanning all manifests. - return !delete_paths_.empty(); + // TODO(Guotao): prune object deletes by partition once manifest partition + // summary checks are available. + return !delete_paths_.empty() || !data_files_to_delete_.empty() || + !delete_files_to_delete_.empty() || !removed_data_file_paths_.empty(); } Result ManifestFilterManager::CanContainDroppedPartitions( @@ -208,15 +320,42 @@ Result ManifestFilterManager::ShouldDelete(const ManifestEntry& entry, const DataFile& file = *entry.data_file; int32_t spec_id = file.partition_spec_id.value_or(manifest_spec_id); - // Path-based and partition-drop checks - if (delete_paths_.count(file.file_path) || - drop_partitions_.contains(spec_id, file.partition)) { + // All delete branches share fail-any-delete handling. + auto marked_for_delete = [&]() -> Result { if (fail_any_delete_) { ICEBERG_ASSIGN_OR_RAISE(auto partition_path, FormatPartitionPath(specs_by_id, file, spec_id)); - return InvalidArgument("Operation would delete existing data: {}", partition_path); + return ValidationFailed("Operation would delete existing data: {}", partition_path); } return true; + }; + + // Path/object-based and partition-drop checks. + bool object_delete = manifest_content_ == ManifestContent::kData + ? data_files_to_delete_.contains(file) + : delete_files_to_delete_.contains(file); + if (delete_paths_.count(file.file_path) || object_delete || + drop_partitions_.contains(spec_id, file.partition)) { + return marked_for_delete(); + } + + // Delete-manifest-specific cleanup (only for ManifestContent::kDeletes). + if (manifest_content_ == ManifestContent::kDeletes) { + // Drop delete files whose data sequence number is older than the minimum + // retained by the table (they can no longer match any live data rows). + // seq == 0 (kInitialSequenceNumber / nullopt) is intentionally excluded: + // those entries predate sequence number assignment and must not be pruned. + int64_t seq = entry.sequence_number.value_or(0); + if (min_sequence_number_ > 0 && seq > 0 && seq < min_sequence_number_) { + return marked_for_delete(); + } + + // Drop DVs that reference a data file that has been removed (dangling DV). + if (!removed_data_file_paths_.empty() && file.IsDeletionVector() && + file.referenced_data_file.has_value() && + removed_data_file_paths_.count(*file.referenced_data_file)) { + return marked_for_delete(); + } } if (HasRowFilterExpression(delete_expr_)) { @@ -230,13 +369,7 @@ Result ManifestFilterManager::ShouldDelete(const ManifestEntry& entry, StrictMetricsEvaluator::Make(residual_expr, schema, case_sensitive_)); ICEBERG_ASSIGN_OR_RAISE(auto strict_match, strict_eval->Evaluate(file)); if (strict_match) { - if (fail_any_delete_) { - ICEBERG_ASSIGN_OR_RAISE(auto partition_path, - FormatPartitionPath(specs_by_id, file, spec_id)); - return InvalidArgument("Operation would delete existing data: {}", - partition_path); - } - return true; + return marked_for_delete(); } ICEBERG_ASSIGN_OR_RAISE(auto incl_eval, InclusiveMetricsEvaluator::Make( @@ -246,7 +379,7 @@ Result ManifestFilterManager::ShouldDelete(const ManifestEntry& entry, if (manifest_content_ == ManifestContent::kDeletes) { return false; } - return InvalidArgument( + return ValidationFailed( "Cannot delete file where some, but not all, rows match filter: {}", file.file_path); } @@ -257,20 +390,34 @@ Result ManifestFilterManager::ShouldDelete(const ManifestEntry& entry, bool ManifestFilterManager::CanTrustManifestReferences( const std::vector&) const { - // TODO(Guotao): Track source manifest locations for object deletes so manifests - // outside the referenced set can be skipped before any other delete checks. + // TODO(Guotao): add DataFile manifest locations and use them to skip unrelated + // manifests. Until then, take the conservative path. return false; } Result ManifestFilterManager::FilterManifest( const std::shared_ptr& schema, const PartitionSpecsById& specs_by_id, const ManifestFile& manifest, bool trust_manifest_references, - const ManifestWriterFactory& writer_factory, - std::unordered_set& found_paths) { + const ManifestWriterFactory& writer_factory) { + auto cached = filtered_manifests_.find(manifest); + if (cached != filtered_manifests_.end()) { + auto deleted_iter = filtered_manifest_to_deleted_files_.find(cached->second); + if (deleted_iter != filtered_manifest_to_deleted_files_.end()) { + for (const auto& file : deleted_iter->second.files) { + AddDeletedFileToManager(manifest_content_, data_files_, delete_files_, + deleted_files_, deleted_data_file_set_, + deleted_delete_file_set_, file); + } + duplicate_deletes_count_ += deleted_iter->second.duplicate_deletes_count; + } + return cached->second; + } + ICEBERG_ASSIGN_OR_RAISE( auto can_contain_deleted_files, CanContainDeletedFiles(manifest, schema, specs_by_id, trust_manifest_references)); if (!can_contain_deleted_files) { + filtered_manifests_.emplace(manifest, manifest); return manifest; } @@ -283,11 +430,16 @@ Result ManifestFilterManager::FilterManifest( ICEBERG_ASSIGN_OR_RAISE(auto has_deleted_files, ManifestHasDeletedFiles(entries, schema, specs_by_id, spec_id)); if (!has_deleted_files) { + filtered_manifests_.emplace(manifest, manifest); return manifest; } - return FilterManifestWithDeletedFiles(entries, spec_id, schema, specs_by_id, - writer_factory, found_paths); + ICEBERG_ASSIGN_OR_RAISE(auto filtered_manifest, + FilterManifestWithDeletedFiles(entries, spec_id, schema, + specs_by_id, writer_factory)); + filtered_manifests_.emplace(manifest, filtered_manifest); + ++replaced_manifests_count_; + return filtered_manifest; } Result ManifestFilterManager::ManifestHasDeletedFiles( @@ -306,21 +458,26 @@ Result ManifestFilterManager::ManifestHasDeletedFiles( Result ManifestFilterManager::FilterManifestWithDeletedFiles( const std::vector& entries, int32_t manifest_spec_id, const std::shared_ptr& schema, const PartitionSpecsById& specs_by_id, - const ManifestWriterFactory& writer_factory, - std::unordered_set& found_paths) { + const ManifestWriterFactory& writer_factory) { ICEBERG_ASSIGN_OR_RAISE(auto writer, writer_factory(manifest_spec_id, manifest_content_)); + std::vector> deleted_files; + DataFileSet deleted_data_file_set; + DeleteFileSet deleted_file_set; + int32_t duplicate_deletes_count = 0; for (const auto& entry : entries) { ICEBERG_ASSIGN_OR_RAISE(auto should_delete, ShouldDelete(entry, schema, specs_by_id, manifest_spec_id)); if (should_delete) { - if (entry.data_file && delete_paths_.count(entry.data_file->file_path)) { - found_paths.insert(entry.data_file->file_path); - } if (entry.data_file) { - // TODO(Guotao): Track duplicate deletes and avoid full DataFile copies when - // summary generation can use lighter records. - delete_files_.insert(std::make_shared(*entry.data_file)); + auto file = std::make_shared(*entry.data_file); + AddDeletedFileToManager(manifest_content_, data_files_, delete_files_, + deleted_files_, deleted_data_file_set_, + deleted_delete_file_set_, file); + if (!AddDeletedFileToManifest(manifest_content_, deleted_files, + deleted_data_file_set, deleted_file_set, file)) { + ++duplicate_deletes_count; + } } ICEBERG_RETURN_UNEXPECTED(writer->WriteDeletedEntry(entry)); } else { @@ -329,24 +486,72 @@ Result ManifestFilterManager::FilterManifestWithDeletedFiles( } ICEBERG_RETURN_UNEXPECTED(writer->Close()); - return writer->ToManifestFile(); + ICEBERG_ASSIGN_OR_RAISE(auto filtered_manifest, writer->ToManifestFile()); + duplicate_deletes_count_ += duplicate_deletes_count; + filtered_manifest_to_deleted_files_[filtered_manifest] = FilteredManifestDeletes{ + .files = std::move(deleted_files), + .duplicate_deletes_count = duplicate_deletes_count, + }; + return filtered_manifest; +} + +Status ManifestFilterManager::InvalidateFilteredCache() { + ICEBERG_RETURN_UNEXPECTED(CleanUncommitted({})); + replaced_manifests_count_ = 0; + return {}; +} + +void ManifestFilterManager::ResetDeletedFiles() { + data_files_.clear(); + for (const auto& file : data_files_to_delete_) { + data_files_.insert(file); + } + delete_files_.clear(); + for (const auto& file : delete_files_to_delete_) { + delete_files_.insert(file); + } } -Status ManifestFilterManager::ValidateRequiredDeletes( - const std::unordered_set& found_paths) const { +Status ManifestFilterManager::ValidateRequiredDeletes() const { if (!fail_missing_delete_paths_) { return {}; } - std::string missing; + std::string missing_files; + const auto append_missing = [&missing_files](const std::string& path) { + if (!missing_files.empty()) missing_files += ","; + missing_files += path; + }; + for (const auto& key : data_files_to_delete_) { + if (!deleted_data_file_set_.contains(key)) { + append_missing(key->file_path); + } + } + for (const auto& key : delete_files_to_delete_) { + if (!deleted_delete_file_set_.contains(key)) { + append_missing(key->file_path); + } + } + if (!missing_files.empty()) { + return ValidationFailed("Missing required files to delete: {}", missing_files); + } + + std::string missing_paths; for (const auto& path : delete_paths_) { - if (!found_paths.count(path)) { - if (!missing.empty()) missing += ", "; - missing += path; + bool found = false; + for (const auto& deleted_file : deleted_files_) { + if (deleted_file != nullptr && deleted_file->file_path == path) { + found = true; + break; + } + } + if (!found) { + if (!missing_paths.empty()) missing_paths += ","; + missing_paths += path; } } - if (!missing.empty()) { - return InvalidArgument("Missing delete paths: {}", missing); + if (!missing_paths.empty()) { + return ValidationFailed("Missing required files to delete: {}", missing_paths); } return {}; } @@ -354,8 +559,21 @@ Status ManifestFilterManager::ValidateRequiredDeletes( Result> ManifestFilterManager::FilterManifests( const TableMetadata& metadata, const std::shared_ptr& base_snapshot, const ManifestWriterFactory& writer_factory) { + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + return FilterManifests(schema, metadata, base_snapshot, writer_factory); +} + +Result> ManifestFilterManager::FilterManifests( + const std::shared_ptr& schema, const TableMetadata& metadata, + const std::shared_ptr& base_snapshot, + const ManifestWriterFactory& writer_factory) { + ResetDeletedFiles(); + deleted_files_.clear(); + deleted_data_file_set_.clear(); + deleted_delete_file_set_.clear(); + duplicate_deletes_count_ = 0; if (!base_snapshot) { - ICEBERG_RETURN_UNEXPECTED(ValidateRequiredDeletes({})); + ICEBERG_RETURN_UNEXPECTED(ValidateRequiredDeletes()); return std::vector{}; } @@ -371,7 +589,6 @@ Result> ManifestFilterManager::FilterManifests( manifests.push_back(&manifest); } - ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); TableMetadataCache metadata_cache(&metadata); ICEBERG_ASSIGN_OR_RAISE(auto specs_by_id, metadata_cache.GetPartitionSpecsById()); @@ -394,9 +611,13 @@ Result> ManifestFilterManager::FilterManifests( } } - std::unordered_set found_paths; + ResetDeletedFiles(); + deleted_files_.clear(); + deleted_data_file_set_.clear(); + deleted_delete_file_set_.clear(); + duplicate_deletes_count_ = 0; if (manifests.empty()) { - ICEBERG_RETURN_UNEXPECTED(ValidateRequiredDeletes(found_paths)); + ICEBERG_RETURN_UNEXPECTED(ValidateRequiredDeletes()); return std::vector{}; } @@ -409,15 +630,35 @@ Result> ManifestFilterManager::FilterManifests( std::vector filtered; filtered.reserve(manifests.size()); for (const auto* manifest_ptr : manifests) { - ICEBERG_ASSIGN_OR_RAISE( - auto filtered_manifest, - FilterManifest(schema, specs_by_id, *manifest_ptr, trust_manifest_references, - writer_factory, found_paths)); + ICEBERG_ASSIGN_OR_RAISE(auto filtered_manifest, + FilterManifest(schema, specs_by_id, *manifest_ptr, + trust_manifest_references, writer_factory)); filtered.push_back(std::move(filtered_manifest)); } - ICEBERG_RETURN_UNEXPECTED(ValidateRequiredDeletes(found_paths)); + ICEBERG_RETURN_UNEXPECTED(ValidateRequiredDeletes()); return filtered; } +Status ManifestFilterManager::CleanUncommitted( + const std::unordered_set& committed) { + auto entries = std::vector>{ + filtered_manifests_.begin(), filtered_manifests_.end()}; + for (const auto& [manifest, filtered] : entries) { + if (committed.contains(filtered.manifest_path)) { + continue; + } + + if (manifest != filtered) { + std::ignore = delete_file_(filtered.manifest_path); + if (replaced_manifests_count_ > 0) { + --replaced_manifests_count_; + } + } + filtered_manifests_.erase(manifest); + filtered_manifest_to_deleted_files_.erase(filtered); + } + return {}; +} + } // namespace iceberg diff --git a/src/iceberg/manifest/manifest_filter_manager.h b/src/iceberg/manifest/manifest_filter_manager.h index 55258b2b1..e742812cc 100644 --- a/src/iceberg/manifest/manifest_filter_manager.h +++ b/src/iceberg/manifest/manifest_filter_manager.h @@ -24,6 +24,7 @@ /// or EXISTING based on row-filter expressions, exact path deletes, and partition drops. #include +#include #include #include #include @@ -34,6 +35,7 @@ #include "iceberg/manifest/manifest_list.h" #include "iceberg/manifest/manifest_writer.h" #include "iceberg/result.h" +#include "iceberg/snapshot.h" #include "iceberg/type_fwd.h" #include "iceberg/util/data_file_set.h" #include "iceberg/util/partition_value_util.h" @@ -47,9 +49,6 @@ namespace iceberg { /// entries are returned unchanged (no I/O). Manifests that do contain deleted /// entries are rewritten with those entries marked DELETED. /// -/// The manager is content-agnostic: pass ManifestContent::kData to process data -/// manifests, or ManifestContent::kDeletes to process delete manifests. -/// /// TODO(Guotao): For ManifestContent::kDeletes, implement cleanup for orphan delete files /// and dangling deletion vectors. /// @@ -58,7 +57,9 @@ class ICEBERG_EXPORT ManifestFilterManager { public: using PartitionSpecsById = std::unordered_map>; - ManifestFilterManager(ManifestContent content, std::shared_ptr file_io); + static Result> Make( + ManifestContent content, std::shared_ptr file_io, + std::function delete_file = {}); ~ManifestFilterManager(); ManifestFilterManager(const ManifestFilterManager&) = delete; @@ -80,33 +81,43 @@ class ICEBERG_EXPORT ManifestFilterManager { /// Any manifest entry whose file_path matches this path will be marked DELETED. /// /// \param path The exact file path to delete - void DeleteFile(std::string_view path); + Status DeleteFile(std::string_view path); /// \brief Register a file object for deletion. /// /// Any manifest entry whose file_path matches file->file_path will be marked - /// DELETED. The file object is retained in FilesToBeDeleted(), allowing callers - /// to enumerate deleted file objects for follow-up delete-file cleanup. - /// Duplicate registrations (same path) are silently ignored. + /// DELETED. Duplicate registrations (same path) are silently ignored. /// /// \param file The data/delete file to delete (must not be null) Status DeleteFile(std::shared_ptr file); /// \brief Returns the set of file objects marked for deletion by this manager. /// - /// This includes files registered via DeleteFile(DataFile) and files discovered - /// during FilterManifests() that were deleted by path, partition, or row-filter - /// matching. Used by higher-level operations (e.g. RowDelta) to enumerate the - /// deleted data files for delete-file cleanup. + /// Includes file objects explicitly registered for deletion plus files deleted while + /// filtering manifests. const DataFileSet& FilesToBeDeleted() const; + /// \brief Returns content-file objects deleted by the most recent + /// FilterManifests() call, deduplicated by content-file identity. + const std::vector>& DeletedFiles() const; + + /// \brief Returns how many duplicate file deletes were found in the most recent + /// FilterManifests() call. + int32_t DuplicateDeletesCount() const { return duplicate_deletes_count_; } + + /// \brief Build a snapshot-summary fragment from filtered manifests. + /// + Result BuildSummary( + const std::vector& manifests, + const PartitionSpecsById& specs_by_id) const; + /// \brief Register a partition for dropping. /// /// Any manifest entry whose (spec_id, partition) pair matches will be marked DELETED. /// /// \param spec_id The partition spec ID /// \param partition The partition values to drop - void DropPartition(int32_t spec_id, PartitionValues partition); + Status DropPartition(int32_t spec_id, PartitionValues partition); /// \brief Set a flag that makes FilterManifests() fail if any registered /// delete path was not found in any manifest entry. @@ -116,9 +127,36 @@ class ICEBERG_EXPORT ManifestFilterManager { /// manifest entry matches a delete condition. void FailAnyDelete(); + /// \brief Returns the number of manifests rewritten (replaced) by the last + /// FilterManifests() call. A manifest is replaced when it contained deleted entries + /// and was rewritten with those entries marked DELETED. + int32_t ReplacedManifestsCount() const { return replaced_manifests_count_; } + /// \brief Returns true if any delete condition has been registered. bool ContainsDeletes() const; + /// \brief Set the minimum data sequence number for delete files to retain. + /// + /// Only valid for ManifestContent::kDeletes managers. Delete entries whose + /// data_sequence_number is positive and less than sequence_number will be + /// marked DELETED. This continuously removes delete files that cannot match + /// any remaining data rows (i.e. all data written before that sequence number + /// has itself been deleted). + /// + /// \param sequence_number the inclusive lower bound; delete files older than + /// this value are dropped + Status DropDeleteFilesOlderThan(int64_t sequence_number); + + /// \brief Register data files that have been removed so their dangling DVs + /// can be cleaned up. + /// + /// Only valid for ManifestContent::kDeletes managers. For each DV whose + /// referenced_data_file path appears in deleted_files, the DV entry is + /// marked DELETED because the data file it targets no longer exists. + /// + /// \param deleted_files set of data files that have been marked for deletion + void RemoveDanglingDeletesFor(const DataFileSet& deleted_files); + /// \brief Apply all accumulated delete conditions to the base snapshot's manifests. /// /// Manifests that cannot possibly contain deleted files are returned unchanged. @@ -132,6 +170,15 @@ class ICEBERG_EXPORT ManifestFilterManager { const TableMetadata& metadata, const std::shared_ptr& base_snapshot, const ManifestWriterFactory& writer_factory); + /// \brief Apply all accumulated delete conditions using an explicit schema. + /// + /// This overload is used when callers need row-filter evaluation bound against a + /// schema other than metadata.Schema(), such as the schema at a branch head. + Result> FilterManifests( + const std::shared_ptr& schema, const TableMetadata& metadata, + const std::shared_ptr& base_snapshot, + const ManifestWriterFactory& writer_factory); + /// \brief Apply all accumulated delete conditions to the provided manifests. /// /// This overload accepts only the context needed for filtering. It is intended for @@ -147,7 +194,14 @@ class ICEBERG_EXPORT ManifestFilterManager { const std::vector& manifests, const ManifestWriterFactory& writer_factory); + /// \brief Delete cached filtered manifests that were not committed and roll back + /// replaced-manifest accounting. + Status CleanUncommitted(const std::unordered_set& committed); + private: + ManifestFilterManager(ManifestContent content, std::shared_ptr file_io, + std::function delete_file); + /// \brief Returns true if the manifest might contain files matching any expression. Result CanContainExpressionDeletes(const ManifestFile& manifest, const std::shared_ptr& schema, @@ -172,12 +226,16 @@ class ICEBERG_EXPORT ManifestFilterManager { bool CanTrustManifestReferences( const std::vector& manifests) const; + struct FilteredManifestDeletes { + std::vector> files; + int32_t duplicate_deletes_count = 0; + }; + Result FilterManifest(const std::shared_ptr& schema, const PartitionSpecsById& specs_by_id, const ManifestFile& manifest, bool trust_manifest_references, - const ManifestWriterFactory& writer_factory, - std::unordered_set& found_paths); + const ManifestWriterFactory& writer_factory); Result ManifestHasDeletedFiles(const std::vector& entries, const std::shared_ptr& schema, @@ -187,11 +245,12 @@ class ICEBERG_EXPORT ManifestFilterManager { Result FilterManifestWithDeletedFiles( const std::vector& entries, int32_t manifest_spec_id, const std::shared_ptr& schema, const PartitionSpecsById& specs_by_id, - const ManifestWriterFactory& writer_factory, - std::unordered_set& found_paths); + const ManifestWriterFactory& writer_factory); + + Status ValidateRequiredDeletes() const; - Status ValidateRequiredDeletes( - const std::unordered_set& found_paths) const; + Status InvalidateFilteredCache(); + void ResetDeletedFiles(); /// \brief Get or create a ManifestEvaluator for the given spec. Result GetManifestEvaluator(const std::shared_ptr& schema, @@ -211,14 +270,39 @@ class ICEBERG_EXPORT ManifestFilterManager { const ManifestContent manifest_content_; std::shared_ptr file_io_; + std::function delete_file_; std::shared_ptr delete_expr_; std::unordered_set delete_paths_; - DataFileSet delete_files_; + // Delete files explicitly registered for deletion by object identity. + DeleteFileSet delete_files_to_delete_; + // Data files explicitly registered for deletion by object identity. + DataFileSet data_files_to_delete_; + // Data files to remove: explicit object deletes plus files found while filtering. + DataFileSet data_files_; + // Delete files to remove: explicit object deletes plus files found while filtering. + DeleteFileSet delete_files_; + std::unordered_map + filtered_manifest_to_deleted_files_; + // Ordered files deleted by the latest filter pass, used for summaries. + std::vector> deleted_files_; + // Data-file identity set for latest-pass dedup and required-delete validation. + DataFileSet deleted_data_file_set_; + // Delete-file identity set for latest-pass dedup and required-delete validation. + DeleteFileSet deleted_delete_file_set_; PartitionSet drop_partitions_; bool fail_missing_delete_paths_{false}; bool fail_any_delete_{false}; bool case_sensitive_{true}; + int32_t duplicate_deletes_count_{0}; + int32_t replaced_manifests_count_{0}; + + // minimum data sequence number; delete entries older than this are dropped + int64_t min_sequence_number_{0}; + // paths of data files that were removed; DVs referencing these are dangling + std::unordered_set removed_data_file_paths_; + + std::unordered_map filtered_manifests_; std::unordered_map> manifest_evaluator_cache_; diff --git a/src/iceberg/manifest/manifest_merge_manager.cc b/src/iceberg/manifest/manifest_merge_manager.cc index 056dce3f5..b2becf920 100644 --- a/src/iceberg/manifest/manifest_merge_manager.cc +++ b/src/iceberg/manifest/manifest_merge_manager.cc @@ -21,12 +21,15 @@ #include #include +#include #include #include #include +#include #include #include +#include "iceberg/file_io.h" #include "iceberg/manifest/manifest_entry.h" #include "iceberg/manifest/manifest_reader.h" #include "iceberg/table_metadata.h" @@ -34,27 +37,68 @@ namespace iceberg { -ManifestMergeManager::ManifestMergeManager(int64_t target_size_bytes, - int32_t min_count_to_merge, bool merge_enabled) - : target_size_bytes_(target_size_bytes), +namespace { + +size_t CombineHash(size_t seed, size_t value) { + return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)); +} + +} // namespace + +ManifestMergeManager::ManifestMergeManager( + ManifestContent content, int64_t target_size_bytes, int32_t min_count_to_merge, + bool merge_enabled, std::shared_ptr file_io, + SnapshotIdSupplier snapshot_id_supplier, + std::function delete_file) + : manifest_content_(content), + target_size_bytes_(target_size_bytes), min_count_to_merge_(min_count_to_merge), - merge_enabled_(merge_enabled) {} + merge_enabled_(merge_enabled), + file_io_(std::move(file_io)), + snapshot_id_supplier_(std::move(snapshot_id_supplier)), + delete_file_(std::move(delete_file)) { + ICEBERG_DCHECK(file_io_, "FileIO cannot be null"); + ICEBERG_DCHECK(snapshot_id_supplier_, "Snapshot ID supplier cannot be null"); + if (delete_file_ == nullptr) { + delete_file_ = [this](const std::string& location) { + return file_io_->DeleteFile(location); + }; + } +} + +Result> ManifestMergeManager::Make( + ManifestContent content, int64_t target_size_bytes, int32_t min_count_to_merge, + bool merge_enabled, std::shared_ptr file_io, + SnapshotIdSupplier snapshot_id_supplier, + std::function delete_file) { + ICEBERG_PRECHECK(file_io != nullptr, "FileIO cannot be null"); + ICEBERG_PRECHECK(snapshot_id_supplier != nullptr, + "Snapshot ID supplier cannot be null"); + return std::unique_ptr(new ManifestMergeManager( + content, target_size_bytes, min_count_to_merge, merge_enabled, std::move(file_io), + std::move(snapshot_id_supplier), std::move(delete_file))); +} Result> ManifestMergeManager::MergeManifests( const std::vector& existing_manifests, - const std::vector& new_manifests, int64_t snapshot_id, - const TableMetadata& metadata, std::shared_ptr file_io, + const std::vector& new_manifests, const TableMetadata& metadata, const ManifestWriterFactory& writer_factory) { - // Combine new then existing (new-first ordering is preserved in output). - auto to_manifest_ptr = [](const ManifestFile& manifest) { return &manifest; }; - auto manifest_ranges = std::array{ - new_manifests | std::views::transform(to_manifest_ptr), - existing_manifests | std::views::transform(to_manifest_ptr), + auto append_manifest = [this](const ManifestFile& manifest, + std::vector& manifests) -> Status { + ICEBERG_PRECHECK(manifest.content == manifest_content_, + "Cannot merge manifest with unexpected content"); + manifests.push_back(&manifest); + return {}; }; std::vector all; all.reserve(new_manifests.size() + existing_manifests.size()); - std::ranges::copy(manifest_ranges | std::views::join, std::back_inserter(all)); + for (const auto& manifest : new_manifests) { + ICEBERG_RETURN_UNEXPECTED(append_manifest(manifest, all)); + } + for (const auto& manifest : existing_manifests) { + ICEBERG_RETURN_UNEXPECTED(append_manifest(manifest, all)); + } if (all.empty() || !merge_enabled_) { return all | @@ -62,30 +106,18 @@ Result> ManifestMergeManager::MergeManifests( std::ranges::to>(); } - // Track the first (newest) manifest independently per content type. - std::map first_by_content; - std::ranges::for_each(all, [&first_by_content](const ManifestFile* manifest) { - first_by_content.try_emplace(manifest->content, manifest); - }); - - // Group manifests by (partition_spec_id, content), never merging across specs or - // content types. Reverse spec ordering preserves v3 first-row-id assignment order. - using GroupKey = std::pair; - auto group_key = [](const ManifestFile* manifest) { - return GroupKey{manifest->partition_spec_id, manifest->content}; - }; - - std::map, std::greater<>> by_spec; - std::ranges::for_each(all, [&by_spec, &group_key](const ManifestFile* manifest) { - by_spec[group_key(manifest)].push_back(manifest); + const auto* first = all.front(); + std::map, std::greater<>> by_spec; + std::ranges::for_each(all, [&by_spec](const ManifestFile* manifest) { + by_spec[manifest->partition_spec_id].push_back(manifest); }); std::vector result; result.reserve(all.size()); - for (auto& [key, group] : by_spec) { - const auto* first = first_by_content.at(key.second); - ICEBERG_ASSIGN_OR_RAISE(auto merged, MergeGroup(group, first, snapshot_id, metadata, - file_io, writer_factory)); + for (auto& [spec_id, group] : by_spec) { + std::ignore = spec_id; + ICEBERG_ASSIGN_OR_RAISE(auto merged, + MergeGroup(group, first, metadata, writer_factory)); std::ranges::move(merged, std::back_inserter(result)); } return result; @@ -93,8 +125,7 @@ Result> ManifestMergeManager::MergeManifests( Result> ManifestMergeManager::MergeGroup( const std::vector& group, const ManifestFile* first, - int64_t snapshot_id, const TableMetadata& metadata, std::shared_ptr file_io, - const ManifestWriterFactory& writer_factory) { + const TableMetadata& metadata, const ManifestWriterFactory& writer_factory) { // Match packEnd(group, ManifestFile::length) with lookback 1: // 1. Process manifests in reverse order (oldest-first). // 2. Greedy forward-pack with lookback=1: emit the current bin when the next item @@ -129,18 +160,29 @@ Result> ManifestMergeManager::MergeGroup( // pass its contents through unchanged. std::vector result; result.reserve(group.size()); - // TODO(Guotao): Flush independent bins in parallel and cache successful merged bins - // for commit retries. for (auto& bin : bins) { - bool contains_first = std::ranges::find(bin, first) != bin.end(); - if (contains_first && std::cmp_less(bin.size(), min_count_to_merge_)) { + if (bin.size() == 1) { + result.push_back(*bin[0]); + } else if (bool contains_first = std::ranges::find(bin, first) != bin.end(); + contains_first && std::cmp_less(bin.size(), min_count_to_merge_)) { for (const auto* manifest : bin) { result.push_back(*manifest); } } else { - ICEBERG_ASSIGN_OR_RAISE( - auto merged, FlushBin(bin, snapshot_id, metadata, file_io, writer_factory)); - result.push_back(std::move(merged)); + const auto* cached = merged_manifests_.Find(bin); + if (cached != nullptr) { + result.push_back(*cached); + } else { + const int64_t snapshot_id = snapshot_id_supplier_(); + ICEBERG_ASSIGN_OR_RAISE(auto merged, FlushBin(bin, metadata, writer_factory)); + merged_manifests_.Add(bin, merged); + for (const auto* manifest : bin) { + if (manifest->added_snapshot_id != snapshot_id) { + ++replaced_manifests_count_; + } + } + result.push_back(std::move(merged)); + } } } @@ -148,23 +190,22 @@ Result> ManifestMergeManager::MergeGroup( } Result ManifestMergeManager::FlushBin( - const std::vector& bin, int64_t snapshot_id, - const TableMetadata& metadata, std::shared_ptr file_io, + const std::vector& bin, const TableMetadata& metadata, const ManifestWriterFactory& writer_factory) { - // A single-manifest bin requires no merging. - if (bin.size() == 1) return *bin[0]; - const ManifestFile& first = *bin[0]; int32_t spec_id = first.partition_spec_id; ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); ICEBERG_ASSIGN_OR_RAISE(auto spec, metadata.PartitionSpecById(spec_id)); - ICEBERG_ASSIGN_OR_RAISE(auto writer, writer_factory(spec_id, first.content)); + ICEBERG_ASSIGN_OR_RAISE(auto writer, writer_factory(spec_id, manifest_content_)); + const int64_t snapshot_id = snapshot_id_supplier_(); for (const auto* manifest : bin) { - ICEBERG_ASSIGN_OR_RAISE(auto reader, - ManifestReader::Make(*manifest, file_io, schema, spec)); + bool is_committed = manifest->added_snapshot_id != kInvalidSnapshotId && + manifest->added_snapshot_id != snapshot_id; + ICEBERG_ASSIGN_OR_RAISE(auto reader, ManifestReader::Make(*manifest, file_io_, schema, + spec, is_committed)); ICEBERG_ASSIGN_OR_RAISE(auto entries, reader->Entries()); for (const auto& entry : entries) { bool is_current = @@ -188,4 +229,73 @@ Result ManifestMergeManager::FlushBin( return writer->ToManifestFile(); } +ManifestMergeManager::MergedManifestCache::Key +ManifestMergeManager::MergedManifestCache::MakeKey( + const std::vector& bin) { + Key key; + key.bin.reserve(bin.size()); + for (const auto* manifest : bin) { + key.bin.push_back(*manifest); + } + return key; +} + +const ManifestFile* ManifestMergeManager::MergedManifestCache::Find( + const std::vector& bin) const { + auto iter = entries_.find(MakeKey(bin)); + if (iter == entries_.end()) { + return nullptr; + } + return &iter->second; +} + +void ManifestMergeManager::MergedManifestCache::Add( + const std::vector& bin, const ManifestFile& manifest) { + entries_.emplace(MakeKey(bin), manifest); +} + +size_t ManifestMergeManager::MergedManifestCache::KeyHash::operator()( + const Key& key) const { + size_t hash = 0; + for (const auto& manifest : key.bin) { + hash = CombineHash(hash, std::hash{}(manifest.manifest_path)); + } + return hash; +} + +Result ManifestMergeManager::MergedManifestCache::CleanUncommitted( + const std::unordered_set& committed, int64_t snapshot_id, + const std::function& delete_file) { + int32_t removed_replaced_manifests_count = 0; + auto cached_entries = + std::vector>{entries_.begin(), entries_.end()}; + for (const auto& [bin, merged] : cached_entries) { + if (committed.contains(merged.manifest_path)) { + continue; + } + + std::ignore = delete_file(merged.manifest_path); + for (const auto& manifest : bin.bin) { + if (manifest.added_snapshot_id != snapshot_id) { + ++removed_replaced_manifests_count; + } + } + entries_.erase(bin); + } + return removed_replaced_manifests_count; +} + +Status ManifestMergeManager::CleanUncommitted( + const std::unordered_set& committed) { + if (merged_manifests_.empty()) { + return {}; + } + const int64_t snapshot_id = snapshot_id_supplier_(); + ICEBERG_ASSIGN_OR_RAISE( + auto removed_replaced_manifests_count, + merged_manifests_.CleanUncommitted(committed, snapshot_id, delete_file_)); + replaced_manifests_count_ -= removed_replaced_manifests_count; + return {}; +} + } // namespace iceberg diff --git a/src/iceberg/manifest/manifest_merge_manager.h b/src/iceberg/manifest/manifest_merge_manager.h index 16cc8d987..031c31420 100644 --- a/src/iceberg/manifest/manifest_merge_manager.h +++ b/src/iceberg/manifest/manifest_merge_manager.h @@ -23,7 +23,11 @@ /// Merges small manifests into fewer larger ones according to table properties. #include +#include #include +#include +#include +#include #include #include "iceberg/iceberg_export.h" @@ -45,54 +49,87 @@ namespace iceberg { /// \note This class is non-copyable and non-movable. class ICEBERG_EXPORT ManifestMergeManager { public: + using SnapshotIdSupplier = std::function; + /// \brief Construct a merge manager with the given configuration. /// + /// \param content Manifest content this manager accepts /// \param target_size_bytes Target output manifest size in bytes /// \param min_count_to_merge Minimum number of manifests before any merging occurs /// \param merge_enabled Whether merging is enabled at all - ManifestMergeManager(int64_t target_size_bytes, int32_t min_count_to_merge, - bool merge_enabled); + /// \param file_io File IO used to open manifests for reading + /// \param snapshot_id_supplier Supplies the snapshot id being committed + /// \param delete_file Callback that deletes uncommitted merged manifest files + static Result> Make( + ManifestContent content, int64_t target_size_bytes, int32_t min_count_to_merge, + bool merge_enabled, std::shared_ptr file_io, + SnapshotIdSupplier snapshot_id_supplier, + std::function delete_file = {}); ManifestMergeManager(const ManifestMergeManager&) = delete; ManifestMergeManager& operator=(const ManifestMergeManager&) = delete; /// \brief Merge existing and new manifests according to configured thresholds. /// - /// Manifests are grouped by (partition_spec_id, content) — data and delete manifests - /// are never merged together. Within each group, a greedy bin-packing algorithm - /// combines manifests up to target_size_bytes. The bin that contains the newest - /// manifest for that content type is protected by min_count_to_merge: if it has fewer + /// Manifests are grouped by partition_spec_id. Within each group, a greedy + /// bin-packing algorithm combines manifests up to target_size_bytes. The bin that + /// contains the newest manifest is protected by min_count_to_merge: if it has fewer /// than that many items it is passed through unchanged. /// - /// \note Retry and rollback cleanup are handled by the caller that owns created - /// manifest paths. - /// TODO(Guotao): Add explicit replaced-manifest tracking here if callers need direct - /// access. - /// /// \param existing_manifests Manifests already in the base snapshot /// \param new_manifests Newly written manifests to incorporate - /// \param snapshot_id The ID of the snapshot being committed. Used to preserve - /// ADDED/DELETED status for entries written by this snapshot and to suppress - /// stale DELETED tombstones from prior snapshots. /// \param metadata Table metadata (provides specs and schema for readers) - /// \param file_io File IO used to open existing manifests for reading /// \param writer_factory Factory to create new ManifestWriter instances /// \return The merged manifest list, or an error Result> MergeManifests( const std::vector& existing_manifests, - const std::vector& new_manifests, int64_t snapshot_id, - const TableMetadata& metadata, std::shared_ptr file_io, + const std::vector& new_manifests, const TableMetadata& metadata, const ManifestWriterFactory& writer_factory); + /// \brief Returns the number of manifests replaced by cached merged outputs. + int32_t ReplacedManifestsCount() const { return replaced_manifests_count_; } + + /// \brief Delete cached merged manifests whose paths were not committed and roll + /// back replaced-manifest accounting. + Status CleanUncommitted(const std::unordered_set& committed); + private: + ManifestMergeManager(ManifestContent content, int64_t target_size_bytes, + int32_t min_count_to_merge, bool merge_enabled, + std::shared_ptr file_io, + SnapshotIdSupplier snapshot_id_supplier, + std::function delete_file); + + struct MergedManifestCache { + struct Key { + std::vector bin; + bool operator==(const Key& other) const = default; + }; + + struct KeyHash { + size_t operator()(const Key& key) const; + }; + + const ManifestFile* Find(const std::vector& bin) const; + void Add(const std::vector& bin, const ManifestFile& manifest); + bool empty() const { return entries_.empty(); } + Result CleanUncommitted( + const std::unordered_set& committed, int64_t snapshot_id, + const std::function& delete_file); + + private: + static Key MakeKey(const std::vector& bin); + + std::unordered_map entries_; + }; + /// \brief Merge a group of manifests sharing the same spec_id. /// /// \param first The overall first (newest) manifest across all groups, used to /// apply the min_count_to_merge threshold on the bin that contains it. Result> MergeGroup( const std::vector& group, const ManifestFile* first, - int64_t snapshot_id, const TableMetadata& metadata, std::shared_ptr file_io, - const ManifestWriterFactory& writer_factory); + const TableMetadata& metadata, const ManifestWriterFactory& writer_factory); /// \brief Write a merged manifest from all manifests in a bin. /// @@ -102,13 +139,18 @@ class ICEBERG_EXPORT ManifestMergeManager { /// - DELETED from older snapshots → dropped (stale tombstones are not carried forward) /// - All other entries → WriteExistingEntry Result FlushBin(const std::vector& bin, - int64_t snapshot_id, const TableMetadata& metadata, - std::shared_ptr file_io, + const TableMetadata& metadata, const ManifestWriterFactory& writer_factory); + const ManifestContent manifest_content_; const int64_t target_size_bytes_; const int32_t min_count_to_merge_; const bool merge_enabled_; + std::shared_ptr file_io_; + SnapshotIdSupplier snapshot_id_supplier_; + std::function delete_file_; + int32_t replaced_manifests_count_{0}; + MergedManifestCache merged_manifests_; }; } // namespace iceberg diff --git a/src/iceberg/manifest/manifest_reader.cc b/src/iceberg/manifest/manifest_reader.cc index 7747e2be3..6c166df53 100644 --- a/src/iceberg/manifest/manifest_reader.cc +++ b/src/iceberg/manifest/manifest_reader.cc @@ -432,7 +432,7 @@ Status ParsePartitionValues(ArrowArrayView* view, int64_t row_idx, Status ParseDataFile(const std::shared_ptr& data_file_schema, ArrowArrayView* view, std::optional& first_row_id, - std::vector& manifest_entries) { + bool is_committed, std::vector& manifest_entries) { ICEBERG_RETURN_UNEXPECTED( AssertViewTypeAndChildren(view, ArrowType::NANOARROW_TYPE_STRUCT, data_file_schema->fields().size(), "data_file")); @@ -556,12 +556,14 @@ Status ParseDataFile(const std::shared_ptr& data_file_schema, first_row_id = first_row_id.value() + entry.data_file->record_count; } }); - } else { + } else if (is_committed) { // data file's first_row_id is null when the manifest's first_row_id is null std::ranges::for_each( - manifest_entries, [](auto& first_row_id) { first_row_id = std::nullopt; }, + manifest_entries, [](auto& row_id) { row_id = std::nullopt; }, proj_data_file(&DataFile::first_row_id)); } + // Preserve firstRowId for entries in uncommitted manifests, including EXISTING + // entries that may be merged later break; } case DataFile::kReferencedDataFileFieldId: @@ -589,7 +591,7 @@ Status ParseDataFile(const std::shared_ptr& data_file_schema, Result> ParseManifestEntry( ArrowSchema* arrow_schema, ArrowArray* array, const Schema& schema, - std::optional& first_row_id) { + std::optional& first_row_id, bool is_committed) { ArrowError error; ArrowArrayView view; ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR( @@ -642,8 +644,8 @@ Result> ParseManifestEntry( case ManifestEntry::kDataFileFieldId: { auto data_file_schema = internal::checked_pointer_cast(field->get().type()); - ICEBERG_RETURN_UNEXPECTED( - ParseDataFile(data_file_schema, field_view, first_row_id, manifest_entries)); + ICEBERG_RETURN_UNEXPECTED(ParseDataFile( + data_file_schema, field_view, first_row_id, is_committed, manifest_entries)); break; } default: @@ -727,14 +729,15 @@ ManifestReaderImpl::ManifestReaderImpl( std::shared_ptr file_io, std::shared_ptr schema, std::shared_ptr spec, std::unique_ptr inheritable_metadata, - std::optional first_row_id) + std::optional first_row_id, bool is_committed) : manifest_path_(std::move(manifest_path)), manifest_length_(manifest_length), file_io_(std::move(file_io)), schema_(std::move(schema)), spec_(std::move(spec)), inheritable_metadata_(std::move(inheritable_metadata)), - first_row_id_(first_row_id) {} + first_row_id_(first_row_id), + is_committed_(is_committed) {} ManifestReader& ManifestReaderImpl::Select(const std::vector& columns) { columns_ = columns; @@ -907,8 +910,8 @@ Result> ManifestReaderImpl::ReadEntries(bool only_liv internal::ArrowArrayGuard array_guard(&result.value()); ICEBERG_ASSIGN_OR_RAISE( - auto entries, - ParseManifestEntry(&arrow_schema, &result.value(), *file_schema_, first_row_id_)); + auto entries, ParseManifestEntry(&arrow_schema, &result.value(), *file_schema_, + first_row_id_, is_committed_)); for (auto& entry : entries) { ICEBERG_RETURN_UNEXPECTED(inheritable_metadata_->Apply(entry)); @@ -984,7 +987,8 @@ Result ManifestFileFieldFromIndex(int32_t index) { Result> ManifestReader::Make( const ManifestFile& manifest, std::shared_ptr file_io, - std::shared_ptr schema, std::shared_ptr spec) { + std::shared_ptr schema, std::shared_ptr spec, + bool is_committed) { if (file_io == nullptr || schema == nullptr || spec == nullptr) { return InvalidArgument( "FileIO, Schema, and PartitionSpec cannot be null to create ManifestReader"); @@ -996,20 +1000,22 @@ Result> ManifestReader::Make( return std::make_unique( manifest.manifest_path, manifest.manifest_length, std::move(file_io), std::move(schema), std::move(spec), std::move(inheritable_metadata), - manifest.first_row_id); + manifest.first_row_id, is_committed); } Result> ManifestReader::Make( const ManifestFile& manifest, std::shared_ptr file_io, std::shared_ptr schema, - const std::unordered_map>& specs_by_id) { + const std::unordered_map>& specs_by_id, + bool is_committed) { auto spec_it = specs_by_id.find(manifest.partition_spec_id); if (spec_it == specs_by_id.end() || spec_it->second == nullptr) { return InvalidArgument("Partition spec {} not found for manifest {}", manifest.partition_spec_id, manifest.manifest_path); } auto spec = spec_it->second; - return Make(manifest, std::move(file_io), std::move(schema), std::move(spec)); + return Make(manifest, std::move(file_io), std::move(schema), std::move(spec), + is_committed); } Result> ManifestReader::Make( @@ -1017,7 +1023,7 @@ Result> ManifestReader::Make( std::shared_ptr file_io, std::shared_ptr schema, std::shared_ptr spec, std::unique_ptr inheritable_metadata, - std::optional first_row_id) { + std::optional first_row_id, bool is_committed) { ICEBERG_PRECHECK(file_io != nullptr, "FileIO cannot be null to read manifest"); ICEBERG_PRECHECK(schema != nullptr, "Schema cannot be null to read manifest"); ICEBERG_PRECHECK(spec != nullptr, "PartitionSpec cannot be null to read manifest"); @@ -1028,7 +1034,8 @@ Result> ManifestReader::Make( return std::make_unique( std::string(manifest_location), manifest_length, std::move(file_io), - std::move(schema), std::move(spec), std::move(inheritable_metadata), first_row_id); + std::move(schema), std::move(spec), std::move(inheritable_metadata), first_row_id, + is_committed); } Result> ManifestListReader::Make( diff --git a/src/iceberg/manifest/manifest_reader.h b/src/iceberg/manifest/manifest_reader.h index 42c56e1c2..b2d1c6505 100644 --- a/src/iceberg/manifest/manifest_reader.h +++ b/src/iceberg/manifest/manifest_reader.h @@ -87,21 +87,26 @@ class ICEBERG_EXPORT ManifestReader { /// \param file_io File IO implementation to use. /// \param schema Schema used to bind the partition type. /// \param spec Partition spec used for this manifest file. + /// \param is_committed Whether the manifest was committed by an older snapshot. /// \return A Result containing the reader or an error. - static Result> Make( - const ManifestFile& manifest, std::shared_ptr file_io, - std::shared_ptr schema, std::shared_ptr spec); + static Result> Make(const ManifestFile& manifest, + std::shared_ptr file_io, + std::shared_ptr schema, + std::shared_ptr spec, + bool is_committed = true); /// \brief Creates a reader for a manifest file using specs keyed by ID. /// \param manifest A ManifestFile object containing metadata about the manifest. /// \param file_io File IO implementation to use. /// \param schema Schema used to bind the partition type. /// \param specs_by_id Mapping of partition spec ID to PartitionSpec. + /// \param is_committed Whether the manifest was committed by an older snapshot. /// \return A Result containing the reader or an error. static Result> Make( const ManifestFile& manifest, std::shared_ptr file_io, std::shared_ptr schema, - const std::unordered_map>& specs_by_id); + const std::unordered_map>& specs_by_id, + bool is_committed = true); /// \brief Creates a reader for a manifest file. /// \param manifest_location Path to the manifest file. @@ -111,13 +116,14 @@ class ICEBERG_EXPORT ManifestReader { /// \param spec Partition spec used for this manifest file. /// \param inheritable_metadata Inheritable metadata. /// \param first_row_id First row ID to use for the manifest entries. + /// \param is_committed Whether the manifest was committed by an older snapshot. /// \return A Result containing the reader or an error. static Result> Make( std::string_view manifest_location, std::optional manifest_length, std::shared_ptr file_io, std::shared_ptr schema, std::shared_ptr spec, std::unique_ptr inheritable_metadata, - std::optional first_row_id = std::nullopt); + std::optional first_row_id = std::nullopt, bool is_committed = true); /// \brief Add stats columns to the column list if needed. static std::vector WithStatsColumns( diff --git a/src/iceberg/manifest/manifest_reader_internal.h b/src/iceberg/manifest/manifest_reader_internal.h index 2b4b1e0ba..53ce2fcb5 100644 --- a/src/iceberg/manifest/manifest_reader_internal.h +++ b/src/iceberg/manifest/manifest_reader_internal.h @@ -53,12 +53,13 @@ class ManifestReaderImpl : public ManifestReader { /// \param spec Partition spec. /// \param inheritable_metadata Metadata inherited from manifest. /// \param first_row_id First row ID for V3 manifests. + /// \param is_committed Whether the manifest was committed by an older snapshot. /// \note ManifestReader::Make() functions should guarantee non-null parameters. ManifestReaderImpl(std::string manifest_path, std::optional manifest_length, std::shared_ptr file_io, std::shared_ptr schema, std::shared_ptr spec, std::unique_ptr inheritable_metadata, - std::optional first_row_id); + std::optional first_row_id, bool is_committed); Result> Entries() override; @@ -106,6 +107,7 @@ class ManifestReaderImpl : public ManifestReader { const std::shared_ptr spec_; const std::unique_ptr inheritable_metadata_; std::optional first_row_id_; + bool is_committed_; // Configuration fields std::vector columns_; diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index a5a60b605..678f30fbd 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -111,6 +111,7 @@ iceberg_sources = files( 'type.cc', 'update/expire_snapshots.cc', 'update/fast_append.cc', + 'update/merging_snapshot_update.cc', 'update/pending_update.cc', 'update/set_snapshot.cc', 'update/snapshot_manager.cc', diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 997d18354..20babf19c 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -214,6 +214,7 @@ if(ICEBERG_BUILD_BUNDLE) expire_snapshots_test.cc fast_append_test.cc manifest_filter_manager_test.cc + merging_snapshot_update_test.cc name_mapping_update_test.cc snapshot_manager_test.cc transaction_test.cc diff --git a/src/iceberg/test/data_file_set_test.cc b/src/iceberg/test/data_file_set_test.cc index 60539adfa..e677b0e2e 100644 --- a/src/iceberg/test/data_file_set_test.cc +++ b/src/iceberg/test/data_file_set_test.cc @@ -38,6 +38,21 @@ class DataFileSetTest : public ::testing::Test { file->content = DataFile::Content::kData; return file; } + + std::shared_ptr CreateDeleteFile(const std::string& path) { + auto file = CreateDataFile(path); + file->content = DataFile::Content::kPositionDeletes; + return file; + } + + std::shared_ptr CreateDV(const std::string& path, int64_t offset, + int64_t size) { + auto file = CreateDeleteFile(path); + file->file_format = FileFormatType::kPuffin; + file->content_offset = offset; + file->content_size_in_bytes = size; + return file; + } }; TEST_F(DataFileSetTest, EmptySet) { @@ -260,12 +275,11 @@ TEST_F(DataFileSetTest, RangeBasedForLoop) { TEST_F(DataFileSetTest, CaseSensitivePaths) { DataFileSet set; auto file1 = CreateDataFile("/path/to/file.parquet"); - auto file2 = CreateDataFile("/path/to/FILE.parquet"); // Different case + auto file2 = CreateDataFile("/path/to/FILE.parquet"); set.insert(file1); set.insert(file2); - // Should be treated as different files EXPECT_EQ(set.size(), 2); } @@ -273,7 +287,6 @@ TEST_F(DataFileSetTest, MultipleInsertsSameFile) { DataFileSet set; auto file = CreateDataFile("/path/to/file.parquet"); - // Insert the same file multiple times set.insert(file); set.insert(file); set.insert(file); @@ -281,4 +294,64 @@ TEST_F(DataFileSetTest, MultipleInsertsSameFile) { EXPECT_EQ(set.size(), 1); } +TEST_F(DataFileSetTest, CopyRebuildsDataFileIndex) { + DataFileSet original; + original.insert(CreateDataFile("/path/to/file1.parquet")); + original.insert(CreateDataFile("/path/to/file2.parquet")); + + DataFileSet copy = original; + auto duplicate = CreateDataFile("/path/to/file1.parquet"); + + auto [iter, inserted] = copy.insert(duplicate); + EXPECT_FALSE(inserted); + ASSERT_NE(iter, copy.end()); + EXPECT_EQ((*iter)->file_path, "/path/to/file1.parquet"); + EXPECT_EQ(copy.size(), 2U); +} + +TEST_F(DataFileSetTest, DeleteFileSetDeduplicatesByPathForRegularDeletes) { + DeleteFileSet set; + auto first = CreateDeleteFile("/path/to/delete.parquet"); + auto duplicate = CreateDeleteFile("/path/to/delete.parquet"); + + auto [first_iter, first_inserted] = set.insert(first); + EXPECT_TRUE(first_inserted); + EXPECT_EQ(*first_iter, first); + + auto [duplicate_iter, duplicate_inserted] = set.insert(duplicate); + EXPECT_FALSE(duplicate_inserted); + EXPECT_EQ(*duplicate_iter, first); + EXPECT_EQ(set.size(), 1U); + EXPECT_TRUE(set.contains(*duplicate)); +} + +TEST_F(DataFileSetTest, DeleteFileSetDistinguishesDeletionVectorContentRanges) { + DeleteFileSet set; + auto first = CreateDV("/path/to/dv.puffin", /*offset=*/0, /*size=*/10); + auto same_range = CreateDV("/path/to/dv.puffin", /*offset=*/0, /*size=*/10); + auto different_offset = CreateDV("/path/to/dv.puffin", /*offset=*/10, /*size=*/10); + auto different_size = CreateDV("/path/to/dv.puffin", /*offset=*/0, /*size=*/20); + + EXPECT_TRUE(set.insert(first).second); + EXPECT_FALSE(set.insert(same_range).second); + EXPECT_TRUE(set.insert(different_offset).second); + EXPECT_TRUE(set.insert(different_size).second); + EXPECT_EQ(set.size(), 3U); +} + +TEST_F(DataFileSetTest, DeleteFileSetCopyRebuildsIndex) { + DeleteFileSet original; + original.insert(CreateDV("/path/to/dv.puffin", /*offset=*/0, /*size=*/10)); + original.insert(CreateDV("/path/to/dv.puffin", /*offset=*/10, /*size=*/10)); + + DeleteFileSet copy = original; + auto duplicate = CreateDV("/path/to/dv.puffin", /*offset=*/0, /*size=*/10); + + auto [iter, inserted] = copy.insert(duplicate); + EXPECT_FALSE(inserted); + ASSERT_NE(iter, copy.end()); + EXPECT_EQ((*iter)->content_offset, 0); + EXPECT_EQ(copy.size(), 2U); +} + } // namespace iceberg diff --git a/src/iceberg/test/fast_append_test.cc b/src/iceberg/test/fast_append_test.cc index 6c77fad16..98956ba7c 100644 --- a/src/iceberg/test/fast_append_test.cc +++ b/src/iceberg/test/fast_append_test.cc @@ -160,6 +160,18 @@ TEST_F(FastAppendTest, AppendNullFile) { EXPECT_THAT(table_->current_snapshot(), HasErrorMessage("No current snapshot")); } +TEST_F(FastAppendTest, FinalizeIgnoresCleanupDeleteFailure) { + std::shared_ptr fast_append; + ICEBERG_UNWRAP_OR_FAIL(fast_append, table_->NewFastAppend()); + fast_append->AppendFile(file_a_); + fast_append->DeleteWith([](const std::string&) { return IOError("delete failed"); }); + + EXPECT_THAT(static_cast(*fast_append).Apply(), IsOk()); + EXPECT_THAT(fast_append->Finalize(Result( + std::unexpected(CommitFailed("commit failed").error()))), + IsOk()); +} + TEST_F(FastAppendTest, AppendDuplicateFile) { std::shared_ptr fast_append; ICEBERG_UNWRAP_OR_FAIL(fast_append, table_->NewFastAppend()); @@ -170,7 +182,6 @@ TEST_F(FastAppendTest, AppendDuplicateFile) { EXPECT_THAT(table_->Refresh(), IsOk()); ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); - // Should only count the file once EXPECT_EQ(snapshot->summary.at("added-data-files"), "1"); EXPECT_EQ(snapshot->summary.at("added-records"), "100"); } diff --git a/src/iceberg/test/manifest_filter_manager_test.cc b/src/iceberg/test/manifest_filter_manager_test.cc index 7810509fa..442bbe063 100644 --- a/src/iceberg/test/manifest_filter_manager_test.cc +++ b/src/iceberg/test/manifest_filter_manager_test.cc @@ -41,6 +41,7 @@ #include "iceberg/test/matchers.h" #include "iceberg/test/update_test_base.h" #include "iceberg/update/fast_append.h" +#include "iceberg/util/data_file_set.h" #include "iceberg/util/macros.h" namespace iceberg { @@ -127,29 +128,99 @@ class ManifestFilterManagerTest : public MinimalUpdateTestBase { }; TEST_F(ManifestFilterManagerTest, NullSnapshotReturnsEmpty) { - ManifestFilterManager mgr(ManifestContent::kData, file_io_); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); auto* metadata = table_->metadata().get(); auto factory = MakeWriterFactory(*metadata); - ICEBERG_UNWRAP_OR_FAIL(auto result, mgr.FilterManifests(*metadata, nullptr, factory)); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->FilterManifests(*metadata, nullptr, factory)); EXPECT_TRUE(result.empty()); } +TEST_F(ManifestFilterManagerTest, MakeRejectsNullFileIO) { + EXPECT_THAT(ManifestFilterManager::Make(ManifestContent::kData, nullptr), + IsError(ErrorKind::kInvalidArgument)); +} + TEST_F(ManifestFilterManagerTest, ContainsDeletesReturnsCorrectState) { - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - EXPECT_FALSE(mgr.ContainsDeletes()); - mgr.DeleteFile("/some/path.parquet"); - EXPECT_TRUE(mgr.ContainsDeletes()); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + EXPECT_FALSE(mgr->ContainsDeletes()); + ASSERT_THAT(mgr->DeleteFile("/some/path.parquet"), IsOk()); + EXPECT_TRUE(mgr->ContainsDeletes()); +} + +TEST_F(ManifestFilterManagerTest, ContainsDeletesTrueAfterRowFilter) { + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + EXPECT_FALSE(mgr->ContainsDeletes()); + ASSERT_THAT(mgr->DeleteByRowFilter(Expressions::Equal("x", Literal::Long(1L))), IsOk()); + EXPECT_TRUE(mgr->ContainsDeletes()); +} + +TEST_F(ManifestFilterManagerTest, ContainsDeletesFalseForAlwaysFalseRowFilter) { + // An always-false row filter does not count as a delete condition. + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT(mgr->DeleteByRowFilter(Expressions::AlwaysFalse()), IsOk()); + EXPECT_FALSE(mgr->ContainsDeletes()); +} + +TEST_F(ManifestFilterManagerTest, ContainsDeletesTrueAfterDropPartition) { + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + EXPECT_FALSE(mgr->ContainsDeletes()); + ASSERT_THAT( + mgr->DropPartition(spec_->spec_id(), + PartitionValues(std::vector{Literal::Long(1L)})), + IsOk()); + EXPECT_TRUE(mgr->ContainsDeletes()); +} + +TEST_F(ManifestFilterManagerTest, ContainsDeletesTrueAfterDeleteFileObject) { + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + EXPECT_FALSE(mgr->ContainsDeletes()); + EXPECT_THAT(mgr->DeleteFile(file_a_), IsOk()); + EXPECT_TRUE(mgr->ContainsDeletes()); +} + +TEST_F(ManifestFilterManagerTest, FilesToBeDeletedIncludesRegisteredFileObject) { + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + + EXPECT_THAT(mgr->DeleteFile(file_a_), IsOk()); + + ASSERT_EQ(mgr->FilesToBeDeleted().size(), 1U); + EXPECT_EQ(mgr->FilesToBeDeleted().begin()->get()->file_path, file_a_->file_path); +} + +TEST_F(ManifestFilterManagerTest, FilesToBeDeletedKeepsRegisteredFileObjectAfterFilter) { + ICEBERG_UNWRAP_OR_FAIL(auto snap, CommitFiles({file_b_})); + auto factory = MakeWriterFactory(*table_->metadata()); + + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + EXPECT_THAT(mgr->DeleteFile(file_a_), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto result, + mgr->FilterManifests(*table_->metadata(), snap, factory)); + EXPECT_THAT(result, ::testing::Not(::testing::IsEmpty())); + + ASSERT_EQ(mgr->FilesToBeDeleted().size(), 1U); + EXPECT_EQ(mgr->FilesToBeDeleted().begin()->get()->file_path, file_a_->file_path); } TEST_F(ManifestFilterManagerTest, DeleteByRowFilterRejectsNull) { - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - EXPECT_THAT(mgr.DeleteByRowFilter(nullptr), IsError(ErrorKind::kInvalidArgument)); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + EXPECT_THAT(mgr->DeleteByRowFilter(nullptr), IsError(ErrorKind::kInvalidArgument)); } TEST_F(ManifestFilterManagerTest, DeleteFileObjectRejectsNull) { - ManifestFilterManager mgr(ManifestContent::kData, file_io_); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); std::shared_ptr null_file; - EXPECT_THAT(mgr.DeleteFile(null_file), IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(mgr->DeleteFile(null_file), IsError(ErrorKind::kInvalidArgument)); } TEST_F(ManifestFilterManagerTest, NoConditionsReturnsManifestsUnchanged) { @@ -162,12 +233,12 @@ TEST_F(ManifestFilterManagerTest, NoConditionsReturnsManifestsUnchanged) { ManifestListReader::Make(snap->manifest_list, file_io_)); ICEBERG_UNWRAP_OR_FAIL(auto orig_manifests, list_reader->Files()); - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - ICEBERG_UNWRAP_OR_FAIL(auto result, mgr.FilterManifests(*metadata, snap, factory)); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->FilterManifests(*metadata, snap, factory)); ASSERT_EQ(result.size(), orig_manifests.size()); for (size_t i = 0; i < result.size(); ++i) { - // No rewrite → same manifest path EXPECT_EQ(result[i].manifest_path, orig_manifests[i].manifest_path); } } @@ -177,10 +248,11 @@ TEST_F(ManifestFilterManagerTest, DeleteFileByPath) { auto* metadata = table_->metadata().get(); auto factory = MakeWriterFactory(*metadata); - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - mgr.DeleteFile(file_a_->file_path); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT(mgr->DeleteFile(file_a_->file_path), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto result, mgr.FilterManifests(*metadata, snap, factory)); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->FilterManifests(*metadata, snap, factory)); ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); int deleted_count = 0; @@ -198,6 +270,146 @@ TEST_F(ManifestFilterManagerTest, DeleteFileByPath) { EXPECT_EQ(live_count, 1); } +TEST_F(ManifestFilterManagerTest, FilterManifestsCachesFilteredManifestAcrossRetries) { + ICEBERG_UNWRAP_OR_FAIL(auto snap, CommitFiles({file_a_, file_b_})); + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + ICEBERG_UNWRAP_OR_FAIL(auto list_reader, + ManifestListReader::Make(snap->manifest_list, file_io_)); + ICEBERG_UNWRAP_OR_FAIL(auto manifest_files, list_reader->Files()); + std::vector manifests; + manifests.reserve(manifest_files.size()); + for (const auto& manifest : manifest_files) { + manifests.push_back(&manifest); + } + + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT(mgr->DeleteFile(file_a_->file_path), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + auto specs = SpecsById(*metadata); + + ICEBERG_UNWRAP_OR_FAIL(auto first, + mgr->FilterManifests(schema, specs, manifests, factory)); + ASSERT_EQ(first.size(), 1U); + EXPECT_NE(first[0].manifest_path, manifest_files[0].manifest_path); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 1); + int after_first = manifest_counter_; + + ICEBERG_UNWRAP_OR_FAIL(auto second, + mgr->FilterManifests(schema, specs, manifests, factory)); + ASSERT_EQ(second.size(), 1U); + EXPECT_EQ(second[0].manifest_path, first[0].manifest_path); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 1); + EXPECT_EQ(manifest_counter_, after_first); +} + +TEST_F(ManifestFilterManagerTest, DeleteFileInvalidatesFilteredCache) { + ICEBERG_UNWRAP_OR_FAIL(auto snap, CommitFiles({file_a_, file_b_})); + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + ICEBERG_UNWRAP_OR_FAIL(auto list_reader, + ManifestListReader::Make(snap->manifest_list, file_io_)); + ICEBERG_UNWRAP_OR_FAIL(auto manifest_files, list_reader->Files()); + std::vector manifests; + manifests.reserve(manifest_files.size()); + for (const auto& manifest : manifest_files) { + manifests.push_back(&manifest); + } + + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT(mgr->DeleteFile(file_a_->file_path), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + auto specs = SpecsById(*metadata); + + ICEBERG_UNWRAP_OR_FAIL(auto first, + mgr->FilterManifests(schema, specs, manifests, factory)); + ASSERT_EQ(first.size(), 1U); + + ASSERT_THAT(mgr->DeleteFile(file_b_->file_path), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto second, + mgr->FilterManifests(schema, specs, manifests, factory)); + EXPECT_NE(second[0].manifest_path, first[0].manifest_path); + + ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(second, *metadata)); + auto deleted_count = + std::count_if(entries.begin(), entries.end(), [](const ManifestEntry& entry) { + return entry.status == ManifestStatus::kDeleted; + }); + EXPECT_EQ(deleted_count, 2); +} + +TEST_F(ManifestFilterManagerTest, CleanUncommittedDropsFilteredCacheAndRollsBackCount) { + ICEBERG_UNWRAP_OR_FAIL(auto snap, CommitFiles({file_a_, file_b_})); + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + ICEBERG_UNWRAP_OR_FAIL(auto list_reader, + ManifestListReader::Make(snap->manifest_list, file_io_)); + ICEBERG_UNWRAP_OR_FAIL(auto manifest_files, list_reader->Files()); + std::vector manifests; + manifests.reserve(manifest_files.size()); + for (const auto& manifest : manifest_files) { + manifests.push_back(&manifest); + } + + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT(mgr->DeleteFile(file_a_->file_path), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + auto specs = SpecsById(*metadata); + + ICEBERG_UNWRAP_OR_FAIL(auto first, + mgr->FilterManifests(schema, specs, manifests, factory)); + ASSERT_EQ(first.size(), 1U); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 1); + EXPECT_THAT(mgr->CleanUncommitted({}), IsOk()); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 0); + + int after_cleanup = manifest_counter_; + ICEBERG_UNWRAP_OR_FAIL(auto second, + mgr->FilterManifests(schema, specs, manifests, factory)); + ASSERT_EQ(second.size(), 1U); + EXPECT_NE(second[0].manifest_path, first[0].manifest_path); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 1); + EXPECT_GT(manifest_counter_, after_cleanup); +} + +TEST_F(ManifestFilterManagerTest, CleanUncommittedIgnoresDeleteCallbackError) { + ICEBERG_UNWRAP_OR_FAIL(auto snap, CommitFiles({file_a_, file_b_})); + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + ICEBERG_UNWRAP_OR_FAIL(auto list_reader, + ManifestListReader::Make(snap->manifest_list, file_io_)); + ICEBERG_UNWRAP_OR_FAIL(auto manifest_files, list_reader->Files()); + std::vector manifests; + manifests.reserve(manifest_files.size()); + for (const auto& manifest : manifest_files) { + manifests.push_back(&manifest); + } + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make( + ManifestContent::kData, file_io_, + [](const std::string&) { return IOError("delete failed"); })); + ASSERT_THAT(mgr->DeleteFile(file_a_->file_path), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + auto specs = SpecsById(*metadata); + + ICEBERG_UNWRAP_OR_FAIL(auto first, + mgr->FilterManifests(schema, specs, manifests, factory)); + ASSERT_EQ(first.size(), 1U); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 1); + + EXPECT_THAT(mgr->CleanUncommitted({}), IsOk()); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 0); +} + TEST_F(ManifestFilterManagerTest, ExplicitContextFilterManifestsDeletesByPath) { ICEBERG_UNWRAP_OR_FAIL(auto snap, CommitFiles({file_a_, file_b_})); auto* metadata = table_->metadata().get(); @@ -212,11 +424,12 @@ TEST_F(ManifestFilterManagerTest, ExplicitContextFilterManifestsDeletesByPath) { manifests.push_back(&manifest); } - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - mgr.DeleteFile(file_a_->file_path); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT(mgr->DeleteFile(file_a_->file_path), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto result, mgr.FilterManifests(schema_, SpecsById(*metadata), - manifests, factory)); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->FilterManifests(schema_, SpecsById(*metadata), + manifests, factory)); ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); int deleted_count = 0; @@ -235,10 +448,11 @@ TEST_F(ManifestFilterManagerTest, RowFilterAlwaysTrueDeletesAll) { auto* metadata = table_->metadata().get(); auto factory = MakeWriterFactory(*metadata); - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - ASSERT_THAT(mgr.DeleteByRowFilter(Expressions::AlwaysTrue()), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT(mgr->DeleteByRowFilter(Expressions::AlwaysTrue()), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto result, mgr.FilterManifests(*metadata, snap, factory)); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->FilterManifests(*metadata, snap, factory)); ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); for (const auto& e : entries) { @@ -251,14 +465,14 @@ TEST_F(ManifestFilterManagerTest, RowFilterAlwaysFalseDeletesNone) { auto* metadata = table_->metadata().get(); auto factory = MakeWriterFactory(*metadata); - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - ASSERT_THAT(mgr.DeleteByRowFilter(Expressions::AlwaysFalse()), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT(mgr->DeleteByRowFilter(Expressions::AlwaysFalse()), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto result, mgr.FilterManifests(*metadata, snap, factory)); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->FilterManifests(*metadata, snap, factory)); ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); for (const auto& e : entries) { - // AlwaysFalse means nothing can match → entries remain ADDED or EXISTING EXPECT_NE(e.status, ManifestStatus::kDeleted) << "Expected no entries to be DELETED"; } } @@ -268,11 +482,12 @@ TEST_F(ManifestFilterManagerTest, RowFilterUsesPartitionResiduals) { auto* metadata = table_->metadata().get(); auto factory = MakeWriterFactory(*metadata); - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - mgr.CaseSensitive(false); - ASSERT_THAT(mgr.DeleteByRowFilter(Expressions::Equal("X", Literal::Long(1L))), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + mgr->CaseSensitive(false); + ASSERT_THAT(mgr->DeleteByRowFilter(Expressions::Equal("X", Literal::Long(1L))), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto result, mgr.FilterManifests(*metadata, snap, factory)); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->FilterManifests(*metadata, snap, factory)); ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); int deleted_count = 0; @@ -290,8 +505,8 @@ TEST_F(ManifestFilterManagerTest, RowFilterUsesPartitionResiduals) { EXPECT_EQ(deleted_count, 1); EXPECT_EQ(live_count, 1); - ASSERT_EQ(mgr.FilesToBeDeleted().size(), 1U); - EXPECT_EQ(mgr.FilesToBeDeleted().begin()->get()->file_path, file_a_->file_path); + ASSERT_EQ(mgr->FilesToBeDeleted().size(), 1U); + EXPECT_EQ(mgr->FilesToBeDeleted().begin()->get()->file_path, file_a_->file_path); } TEST_F(ManifestFilterManagerTest, DropPartition) { @@ -300,11 +515,14 @@ TEST_F(ManifestFilterManagerTest, DropPartition) { auto factory = MakeWriterFactory(*metadata); // Drop partition of file_a (partition_x = 1) - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - mgr.DropPartition(spec_->spec_id(), - PartitionValues(std::vector{Literal::Long(1L)})); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT( + mgr->DropPartition(spec_->spec_id(), + PartitionValues(std::vector{Literal::Long(1L)})), + IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto result, mgr.FilterManifests(*metadata, snap, factory)); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->FilterManifests(*metadata, snap, factory)); ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); int deleted_count = 0; @@ -323,12 +541,14 @@ TEST_F(ManifestFilterManagerTest, FailMissingDeletePathsReturnsError) { auto* metadata = table_->metadata().get(); auto factory = MakeWriterFactory(*metadata); - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - mgr.DeleteFile("/does/not/exist.parquet"); - mgr.FailMissingDeletePaths(); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT(mgr->DeleteFile("/does/not/exist.parquet"), IsOk()); + mgr->FailMissingDeletePaths(); - auto result = mgr.FilterManifests(*metadata, snap, factory); - EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + auto result = mgr->FilterManifests(*metadata, snap, factory); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Missing required files to delete")); } TEST_F(ManifestFilterManagerTest, FailAnyDeleteReportsPartitionPath) { @@ -336,12 +556,13 @@ TEST_F(ManifestFilterManagerTest, FailAnyDeleteReportsPartitionPath) { auto* metadata = table_->metadata().get(); auto factory = MakeWriterFactory(*metadata); - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - mgr.DeleteFile(file_a_->file_path); - mgr.FailAnyDelete(); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT(mgr->DeleteFile(file_a_->file_path), IsOk()); + mgr->FailAnyDelete(); - auto result = mgr.FilterManifests(*metadata, snap, factory); - EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + auto result = mgr->FilterManifests(*metadata, snap, factory); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); EXPECT_THAT(result, HasErrorMessage("x=1")); } @@ -351,11 +572,12 @@ TEST_F(ManifestFilterManagerTest, MultipleConditionsOrCombined) { auto factory = MakeWriterFactory(*metadata); // Both files should be deleted: file_a by path, file_b by AlwaysTrue expression - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - mgr.DeleteFile(file_a_->file_path); - ASSERT_THAT(mgr.DeleteByRowFilter(Expressions::AlwaysTrue()), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT(mgr->DeleteFile(file_a_->file_path), IsOk()); + ASSERT_THAT(mgr->DeleteByRowFilter(Expressions::AlwaysTrue()), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto result, mgr.FilterManifests(*metadata, snap, factory)); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->FilterManifests(*metadata, snap, factory)); ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); for (const auto& e : entries) { @@ -368,15 +590,604 @@ TEST_F(ManifestFilterManagerTest, MultipleRowFiltersUseCombinedExpression) { auto* metadata = table_->metadata().get(); auto factory = MakeWriterFactory(*metadata); - ManifestFilterManager mgr(ManifestContent::kData, file_io_); - ASSERT_THAT(mgr.DeleteByRowFilter(Expressions::Equal("y", Literal::Long(7L))), IsOk()); - ASSERT_THAT(mgr.DeleteByRowFilter(Expressions::Equal("x", Literal::Long(1L))), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestFilterManager::Make(ManifestContent::kData, file_io_)); + ASSERT_THAT(mgr->DeleteByRowFilter(Expressions::Equal("y", Literal::Long(7L))), IsOk()); + ASSERT_THAT(mgr->DeleteByRowFilter(Expressions::Equal("x", Literal::Long(1L))), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto result, mgr.FilterManifests(*metadata, snap, factory)); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->FilterManifests(*metadata, snap, factory)); ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); ASSERT_EQ(entries.size(), 1U); EXPECT_EQ(entries[0].status, ManifestStatus::kDeleted); } +// Helper: write one or more delete-file entries to a new manifest. +// Each entry is (DataFile, data_sequence_number). +using DeleteFileWithSequenceNumber = std::pair, int64_t>; +static Result WriteDeleteManifest( + const std::vector& files, + std::shared_ptr file_io, const TableMetadata& metadata, + const std::string& path) { + if (files.empty()) { + return InvalidArgument("WriteDeleteManifest requires at least one entry"); + } + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + int32_t spec_id = files[0].first->partition_spec_id.value_or(0); + ICEBERG_ASSIGN_OR_RAISE(auto spec, metadata.PartitionSpecById(spec_id)); + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + ManifestWriter::MakeWriter(metadata.format_version, /*snapshot_id=*/1L, path, + file_io, spec, schema, ManifestContent::kDeletes)); + for (auto& [file, seq] : files) { + ManifestEntry entry; + entry.status = ManifestStatus::kAdded; + entry.snapshot_id = 1L; + entry.sequence_number = seq; + entry.data_file = file; + ICEBERG_RETURN_UNEXPECTED(writer->WriteAddedEntry(entry)); + } + ICEBERG_RETURN_UNEXPECTED(writer->Close()); + return writer->ToManifestFile(); +} + +// Convenience overload for a single entry. +static Result WriteDeleteManifest(std::shared_ptr delete_file, + int64_t data_sequence_number, + std::shared_ptr file_io, + const TableMetadata& metadata, + const std::string& path) { + return WriteDeleteManifest({{delete_file, data_sequence_number}}, file_io, metadata, + path); +} + +TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThanDoesNotRewriteOnItsOwn) { + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + // Create a position-delete file with data_sequence_number = 2 (below threshold 5). + auto del_file = std::make_shared(); + del_file->content = DataFile::Content::kPositionDeletes; + del_file->file_path = table_location_ + "/delete/del_old.parquet"; + del_file->file_format = FileFormatType::kParquet; + del_file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + del_file->file_size_in_bytes = 512; + del_file->record_count = 10; + del_file->partition_spec_id = spec_->spec_id(); + + auto manifest_path = std::format("{}/metadata/del-manifest-{}.avro", table_location_, + manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL( + auto del_manifest, + WriteDeleteManifest(del_file, /*data_seq=*/2L, file_io_, *metadata, manifest_path)); + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make(ManifestContent::kDeletes, file_io_)); + EXPECT_THAT(mgr->DropDeleteFilesOlderThan(5), IsOk()); + + std::vector manifests{&del_manifest}; + auto specs = SpecsById(*metadata); + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + + ICEBERG_UNWRAP_OR_FAIL(auto result, + mgr->FilterManifests(schema, specs, manifests, factory)); + + ASSERT_EQ(result.size(), 1U); + EXPECT_EQ(result[0].manifest_path, del_manifest.manifest_path); +} + +TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThanRejectsNegativeSequenceNumber) { + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make(ManifestContent::kDeletes, file_io_)); + EXPECT_THAT(mgr->DropDeleteFilesOlderThan(-1), IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThanDuringDeleteManifestRewrite) { + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + auto make_del_file = [&](const std::string& path) { + auto f = std::make_shared(); + f->content = DataFile::Content::kPositionDeletes; + f->file_path = path; + f->file_format = FileFormatType::kParquet; + f->partition = PartitionValues(std::vector{Literal::Long(1L)}); + f->file_size_in_bytes = 512; + f->record_count = 10; + f->partition_spec_id = spec_->spec_id(); + return f; + }; + auto old_file = make_del_file(table_location_ + "/delete/del_old.parquet"); + auto targeted_file = make_del_file(table_location_ + "/delete/del_targeted.parquet"); + auto keep_file = make_del_file(table_location_ + "/delete/del_keep.parquet"); + + auto manifest_path = std::format("{}/metadata/del-manifest-{}.avro", table_location_, + manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL( + auto del_manifest, + WriteDeleteManifest({{old_file, 2L}, {targeted_file, 10L}, {keep_file, 10L}}, + file_io_, *metadata, manifest_path)); + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make(ManifestContent::kDeletes, file_io_)); + ASSERT_THAT(mgr->DeleteFile(targeted_file->file_path), IsOk()); + EXPECT_THAT(mgr->DropDeleteFilesOlderThan(5), IsOk()); + + std::vector manifests{&del_manifest}; + auto specs = SpecsById(*metadata); + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + + ICEBERG_UNWRAP_OR_FAIL(auto result, + mgr->FilterManifests(schema, specs, manifests, factory)); + + ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); + ASSERT_EQ(entries.size(), 3U); + auto deleted = std::count_if( + entries.begin(), entries.end(), + [](const ManifestEntry& e) { return e.status == ManifestStatus::kDeleted; }); + auto existing = std::count_if( + entries.begin(), entries.end(), + [](const ManifestEntry& e) { return e.status == ManifestStatus::kExisting; }); + EXPECT_EQ(deleted, 2); + EXPECT_EQ(existing, 1); + for (const auto& e : entries) { + if (e.status == ManifestStatus::kExisting) { + EXPECT_EQ(e.data_file->file_path, keep_file->file_path); + } else { + EXPECT_THAT(e.data_file->file_path, + ::testing::AnyOf(old_file->file_path, targeted_file->file_path)); + EXPECT_EQ(e.status, ManifestStatus::kDeleted); + } + } +} + +TEST_F(ManifestFilterManagerTest, FailAnyDeleteFailsOnSequenceNumberPruning) { + // An unrelated object delete opens the manifest; sequence-number pruning is the + // branch that marks this entry. + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + auto old_file = std::make_shared(); + old_file->content = DataFile::Content::kPositionDeletes; + old_file->file_path = table_location_ + "/delete/del_old.parquet"; + old_file->file_format = FileFormatType::kParquet; + old_file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + old_file->file_size_in_bytes = 512; + old_file->record_count = 10; + old_file->partition_spec_id = spec_->spec_id(); + + auto manifest_path = std::format("{}/metadata/del-manifest-{}.avro", table_location_, + manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL( + auto del_manifest, + WriteDeleteManifest(old_file, /*data_seq=*/2L, file_io_, *metadata, manifest_path)); + + // Unrelated registered delete file: opens the manifest without matching its entry. + auto unrelated = std::make_shared(); + unrelated->content = DataFile::Content::kPositionDeletes; + unrelated->file_path = table_location_ + "/delete/unrelated.parquet"; + unrelated->file_format = FileFormatType::kParquet; + unrelated->partition = PartitionValues(std::vector{Literal::Long(9L)}); + unrelated->file_size_in_bytes = 512; + unrelated->record_count = 10; + unrelated->partition_spec_id = spec_->spec_id(); + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make(ManifestContent::kDeletes, file_io_)); + EXPECT_THAT(mgr->DeleteFile(unrelated), IsOk()); + EXPECT_THAT(mgr->DropDeleteFilesOlderThan(5), IsOk()); + mgr->FailAnyDelete(); + + std::vector manifests{&del_manifest}; + auto specs = SpecsById(*metadata); + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + + auto result = mgr->FilterManifests(schema, specs, manifests, factory); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Operation would delete existing data")); +} + +TEST_F(ManifestFilterManagerTest, FailAnyDeleteFailsOnDanglingDeletionVector) { + // Dangling DVs honor fail-any-delete just like other delete branches. + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + const std::string data_file_path = table_location_ + "/data/referenced.parquet"; + + auto dv_file = std::make_shared(); + dv_file->content = DataFile::Content::kPositionDeletes; + dv_file->file_path = table_location_ + "/delete/dv.puffin"; + dv_file->file_format = FileFormatType::kPuffin; + dv_file->referenced_data_file = data_file_path; + dv_file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + dv_file->file_size_in_bytes = 256; + dv_file->record_count = 5; + dv_file->partition_spec_id = spec_->spec_id(); + + auto manifest_path = std::format("{}/metadata/dv-manifest-{}.avro", table_location_, + manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL( + auto dv_manifest, + WriteDeleteManifest(dv_file, /*data_seq=*/3L, file_io_, *metadata, manifest_path)); + + auto deleted_data_file = std::make_shared(); + deleted_data_file->content = DataFile::Content::kData; + deleted_data_file->file_path = data_file_path; + deleted_data_file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + deleted_data_file->file_size_in_bytes = 1024; + deleted_data_file->record_count = 50; + deleted_data_file->partition_spec_id = spec_->spec_id(); + + DataFileSet deleted_files; + deleted_files.insert(deleted_data_file); + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make(ManifestContent::kDeletes, file_io_)); + mgr->RemoveDanglingDeletesFor(deleted_files); + mgr->FailAnyDelete(); + + std::vector manifests{&dv_manifest}; + auto specs = SpecsById(*metadata); + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + + auto result = mgr->FilterManifests(schema, specs, manifests, factory); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Operation would delete existing data")); +} + +TEST_F(ManifestFilterManagerTest, RemoveDanglingDeletesForFiltersDanglingDV) { + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + const std::string data_file_path = table_location_ + "/data/referenced.parquet"; + + // Create a DV (position-delete, puffin format) referencing the data file. + auto dv_file = std::make_shared(); + dv_file->content = DataFile::Content::kPositionDeletes; + dv_file->file_path = table_location_ + "/delete/dv.puffin"; + dv_file->file_format = FileFormatType::kPuffin; + dv_file->referenced_data_file = data_file_path; + dv_file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + dv_file->file_size_in_bytes = 256; + dv_file->record_count = 5; + dv_file->partition_spec_id = spec_->spec_id(); + + auto manifest_path = std::format("{}/metadata/dv-manifest-{}.avro", table_location_, + manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL( + auto dv_manifest, + WriteDeleteManifest(dv_file, /*data_seq=*/3L, file_io_, *metadata, manifest_path)); + + // Register the referenced data file as deleted. + auto deleted_data_file = std::make_shared(); + deleted_data_file->content = DataFile::Content::kData; + deleted_data_file->file_path = data_file_path; + deleted_data_file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + deleted_data_file->file_size_in_bytes = 1024; + deleted_data_file->record_count = 50; + deleted_data_file->partition_spec_id = spec_->spec_id(); + + DataFileSet deleted_files; + deleted_files.insert(deleted_data_file); + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make(ManifestContent::kDeletes, file_io_)); + mgr->RemoveDanglingDeletesFor(deleted_files); + + std::vector manifests{&dv_manifest}; + auto specs = SpecsById(*metadata); + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + + ICEBERG_UNWRAP_OR_FAIL(auto result, + mgr->FilterManifests(schema, specs, manifests, factory)); + + ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); + ASSERT_EQ(entries.size(), 1U); + EXPECT_EQ(entries[0].status, ManifestStatus::kDeleted); +} + +TEST_F(ManifestFilterManagerTest, RemoveDanglingDeletesForKeepsNonDanglingDeletes) { + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + const std::string deleted_data_file_path = table_location_ + "/data/deleted.parquet"; + + auto make_delete = [&](std::string path, FileFormatType format, + std::optional referenced_data_file) { + auto f = std::make_shared(); + f->content = DataFile::Content::kPositionDeletes; + f->file_path = std::move(path); + f->file_format = format; + f->referenced_data_file = std::move(referenced_data_file); + f->partition = PartitionValues(std::vector{Literal::Long(1L)}); + f->file_size_in_bytes = 256; + f->record_count = 5; + f->partition_spec_id = spec_->spec_id(); + return f; + }; + + auto ordinary_position_delete = + make_delete(table_location_ + "/delete/ordinary.parquet", FileFormatType::kParquet, + deleted_data_file_path); + auto dv_without_reference = make_delete(table_location_ + "/delete/no-ref.puffin", + FileFormatType::kPuffin, std::nullopt); + auto unrelated_dv = + make_delete(table_location_ + "/delete/unrelated.puffin", FileFormatType::kPuffin, + table_location_ + "/data/other.parquet"); + + auto manifest_path = std::format("{}/metadata/dv-keep-manifest-{}.avro", + table_location_, manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL(auto manifest, + WriteDeleteManifest({{ordinary_position_delete, 3L}, + {dv_without_reference, 3L}, + {unrelated_dv, 3L}}, + file_io_, *metadata, manifest_path)); + + auto deleted_data_file = std::make_shared(); + deleted_data_file->content = DataFile::Content::kData; + deleted_data_file->file_path = deleted_data_file_path; + deleted_data_file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + deleted_data_file->file_size_in_bytes = 1024; + deleted_data_file->record_count = 50; + deleted_data_file->partition_spec_id = spec_->spec_id(); + + DataFileSet deleted_files; + deleted_files.insert(deleted_data_file); + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make(ManifestContent::kDeletes, file_io_)); + mgr->RemoveDanglingDeletesFor(deleted_files); + + std::vector manifests{&manifest}; + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->FilterManifests(schema, SpecsById(*metadata), + manifests, factory)); + + ASSERT_EQ(result.size(), 1U); + EXPECT_EQ(result[0].manifest_path, manifest.manifest_path); +} + +TEST_F(ManifestFilterManagerTest, RemoveDanglingDeletesForReplacesRemovedFileSet) { + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + auto make_dv = [&](std::string path, std::string referenced_data_file) { + auto f = std::make_shared(); + f->content = DataFile::Content::kPositionDeletes; + f->file_path = std::move(path); + f->file_format = FileFormatType::kPuffin; + f->referenced_data_file = std::move(referenced_data_file); + f->partition = PartitionValues(std::vector{Literal::Long(1L)}); + f->file_size_in_bytes = 256; + f->record_count = 5; + f->partition_spec_id = spec_->spec_id(); + return f; + }; + + const std::string first_data = table_location_ + "/data/first.parquet"; + const std::string second_data = table_location_ + "/data/second.parquet"; + auto first_dv = make_dv(table_location_ + "/delete/first.puffin", first_data); + auto second_dv = make_dv(table_location_ + "/delete/second.puffin", second_data); + + auto first_manifest_path = + std::format("{}/metadata/dv-first-{}.avro", table_location_, manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL( + auto first_manifest, + WriteDeleteManifest(first_dv, 3L, file_io_, *metadata, first_manifest_path)); + auto second_manifest_path = + std::format("{}/metadata/dv-second-{}.avro", table_location_, manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL( + auto second_manifest, + WriteDeleteManifest(second_dv, 3L, file_io_, *metadata, second_manifest_path)); + + auto make_deleted_data = [&](std::string path) { + auto file = std::make_shared(); + file->content = DataFile::Content::kData; + file->file_path = std::move(path); + file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + file->file_size_in_bytes = 1024; + file->record_count = 50; + file->partition_spec_id = spec_->spec_id(); + return file; + }; + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make(ManifestContent::kDeletes, file_io_)); + DataFileSet first_deleted; + first_deleted.insert(make_deleted_data(first_data)); + mgr->RemoveDanglingDeletesFor(first_deleted); + + DataFileSet second_deleted; + second_deleted.insert(make_deleted_data(second_data)); + mgr->RemoveDanglingDeletesFor(second_deleted); + + std::vector manifests{&first_manifest, &second_manifest}; + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + ICEBERG_UNWRAP_OR_FAIL( + auto second_result, + mgr->FilterManifests(schema, SpecsById(*metadata), manifests, factory)); + ASSERT_EQ(second_result.size(), 2U); + EXPECT_EQ(second_result[0].manifest_path, first_manifest.manifest_path); + EXPECT_NE(second_result[1].manifest_path, second_manifest.manifest_path); + ICEBERG_UNWRAP_OR_FAIL(auto second_entries, ReadAllEntries(second_result, *metadata)); + + bool saw_first_live = false; + bool saw_second_deleted = false; + for (const auto& entry : second_entries) { + ASSERT_NE(entry.data_file, nullptr); + if (entry.data_file->file_path == first_dv->file_path) { + saw_first_live = true; + EXPECT_NE(entry.status, ManifestStatus::kDeleted); + } else if (entry.data_file->file_path == second_dv->file_path) { + saw_second_deleted = true; + EXPECT_EQ(entry.status, ManifestStatus::kDeleted); + } + } + EXPECT_TRUE(saw_first_live); + EXPECT_TRUE(saw_second_deleted); +} + +TEST_F(ManifestFilterManagerTest, DeleteFileObjectMatchesDeletionVectorByContent) { + auto metadata = *table_->metadata(); + metadata.format_version = 3; + auto factory = MakeWriterFactory(metadata); + + auto make_dv = [&](int64_t offset) { + auto f = std::make_shared(); + f->content = DataFile::Content::kPositionDeletes; + f->file_path = table_location_ + "/delete/dv.puffin"; + f->file_format = FileFormatType::kPuffin; + f->referenced_data_file = + std::format("{}/data/referenced-{}.parquet", table_location_, offset); + f->content_offset = offset; + f->content_size_in_bytes = 10; + f->partition = PartitionValues(std::vector{Literal::Long(1L)}); + f->file_size_in_bytes = 256; + f->record_count = 5; + f->partition_spec_id = spec_->spec_id(); + return f; + }; + auto dv0 = make_dv(0); + auto dv1 = make_dv(10); + + auto manifest_path = std::format("{}/metadata/dv-manifest-{}.avro", table_location_, + manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL( + auto manifest, + WriteDeleteManifest({{dv0, 3L}, {dv1, 3L}}, file_io_, metadata, manifest_path)); + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make(ManifestContent::kDeletes, file_io_)); + EXPECT_THAT(mgr->DeleteFile(dv0), IsOk()); + + std::vector manifests{&manifest}; + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata.Schema()); + ICEBERG_UNWRAP_OR_FAIL( + auto result, mgr->FilterManifests(schema, SpecsById(metadata), manifests, factory)); + + ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, metadata)); + ASSERT_EQ(entries.size(), 2U); + for (const auto& entry : entries) { + ASSERT_NE(entry.data_file, nullptr); + if (entry.data_file->content_offset == 0) { + EXPECT_EQ(entry.status, ManifestStatus::kDeleted); + } else { + EXPECT_EQ(entry.status, ManifestStatus::kExisting); + } + } +} + +TEST_F(ManifestFilterManagerTest, FailMissingDeleteFileObjectUsesDeleteFileIdentity) { + auto metadata = *table_->metadata(); + metadata.format_version = 3; + auto factory = MakeWriterFactory(metadata); + + auto make_dv = [&](int64_t offset) { + auto f = std::make_shared(); + f->content = DataFile::Content::kPositionDeletes; + f->file_path = table_location_ + "/delete/dv.puffin"; + f->file_format = FileFormatType::kPuffin; + f->referenced_data_file = + std::format("{}/data/referenced-{}.parquet", table_location_, offset); + f->content_offset = offset; + f->content_size_in_bytes = 10; + f->partition = PartitionValues(std::vector{Literal::Long(1L)}); + f->file_size_in_bytes = 256; + f->record_count = 5; + f->partition_spec_id = spec_->spec_id(); + return f; + }; + auto present_dv = make_dv(0); + auto missing_dv = make_dv(10); + + auto manifest_path = std::format("{}/metadata/dv-manifest-{}.avro", table_location_, + manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL(auto manifest, WriteDeleteManifest(present_dv, 3L, file_io_, + metadata, manifest_path)); + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make(ManifestContent::kDeletes, file_io_)); + EXPECT_THAT(mgr->DeleteFile(missing_dv), IsOk()); + mgr->FailMissingDeletePaths(); + + std::vector manifests{&manifest}; + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata.Schema()); + EXPECT_THAT(mgr->FilterManifests(schema, SpecsById(metadata), manifests, factory), + IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(ManifestFilterManagerTest, DuplicateDeletesCountRepeatedDeletedFiles) { + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + auto del_file = std::make_shared(); + del_file->content = DataFile::Content::kPositionDeletes; + del_file->file_path = table_location_ + "/delete/duplicate.parquet"; + del_file->file_format = FileFormatType::kParquet; + del_file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + del_file->file_size_in_bytes = 512; + del_file->record_count = 10; + del_file->partition_spec_id = spec_->spec_id(); + + auto manifest_path = std::format("{}/metadata/dup-manifest-{}.avro", table_location_, + manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL(auto manifest, + WriteDeleteManifest({{del_file, 3L}, {del_file, 3L}}, file_io_, + *metadata, manifest_path)); + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make(ManifestContent::kDeletes, file_io_)); + EXPECT_THAT(mgr->DeleteFile(del_file), IsOk()); + + std::vector manifests{&manifest}; + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->FilterManifests(schema, SpecsById(*metadata), + manifests, factory)); + + EXPECT_EQ(mgr->DeletedFiles().size(), 1U); + EXPECT_EQ(mgr->DuplicateDeletesCount(), 1); +} + +TEST_F(ManifestFilterManagerTest, DuplicateDeletesCountDoesNotCrossManifestBoundary) { + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + auto del_file = std::make_shared(); + del_file->content = DataFile::Content::kPositionDeletes; + del_file->file_path = table_location_ + "/delete/reused.parquet"; + del_file->file_format = FileFormatType::kParquet; + del_file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + del_file->file_size_in_bytes = 512; + del_file->record_count = 10; + del_file->partition_spec_id = spec_->spec_id(); + + auto first_manifest_path = + std::format("{}/metadata/reused-a-{}.avro", table_location_, manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL( + auto first_manifest, + WriteDeleteManifest(del_file, 3L, file_io_, *metadata, first_manifest_path)); + auto second_manifest_path = + std::format("{}/metadata/reused-b-{}.avro", table_location_, manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL( + auto second_manifest, + WriteDeleteManifest(del_file, 3L, file_io_, *metadata, second_manifest_path)); + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestFilterManager::Make(ManifestContent::kDeletes, file_io_)); + EXPECT_THAT(mgr->DeleteFile(del_file), IsOk()); + + std::vector manifests{&first_manifest, &second_manifest}; + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + auto specs = SpecsById(*metadata); + ICEBERG_UNWRAP_OR_FAIL(auto result, + mgr->FilterManifests(schema, specs, manifests, factory)); + ICEBERG_UNWRAP_OR_FAIL(auto summary, mgr->BuildSummary(result, specs)); + + EXPECT_EQ(mgr->DeletedFiles().size(), 1U); + EXPECT_EQ(mgr->DuplicateDeletesCount(), 0); + auto summary_map = summary.Build(); + EXPECT_EQ(summary_map.at(SnapshotSummaryFields::kRemovedDeleteFiles), "2"); + EXPECT_EQ(summary_map.count(SnapshotSummaryFields::kDeletedDuplicatedFiles), 0U); +} + } // namespace iceberg diff --git a/src/iceberg/test/manifest_merge_manager_test.cc b/src/iceberg/test/manifest_merge_manager_test.cc index b19eace86..18d24db29 100644 --- a/src/iceberg/test/manifest_merge_manager_test.cc +++ b/src/iceberg/test/manifest_merge_manager_test.cc @@ -20,6 +20,7 @@ #include "iceberg/manifest/manifest_merge_manager.h" #include +#include #include #include #include @@ -51,6 +52,7 @@ namespace iceberg { namespace { constexpr int8_t kFormatVersion = 2; +constexpr int8_t kRowIdFormatVersion = 3; constexpr int64_t kSnapshotId = 12345L; constexpr int32_t kSpecId0 = 0; constexpr int32_t kSpecId1 = 1; @@ -85,6 +87,17 @@ class ManifestMergeManagerTest : public ::testing::Test { metadata_ = std::shared_ptr(std::move(metadata)); } + Result> BuildV3Metadata() { + auto builder = TableMetadataBuilder::BuildFromEmpty(kRowIdFormatVersion); + builder->SetCurrentSchema(schema_, schema_->HighestFieldId().value_or(0)); + builder->SetDefaultPartitionSpec(spec0_); + builder->AddPartitionSpec(spec1_); + builder->SetDefaultSortOrder(SortOrder::Unsorted()); + ICEBERG_ASSIGN_OR_RAISE(auto metadata, builder->Build()); + metadata->next_row_id = 1000; + return std::shared_ptr(std::move(metadata)); + } + // Write a small manifest with N data files and return the ManifestFile descriptor. Result WriteManifest(int32_t spec_id, int num_files, int64_t file_size_override = 512, @@ -116,14 +129,61 @@ class ManifestMergeManagerTest : public ::testing::Test { return manifest_file; } - ManifestWriterFactory MakeWriterFactory() { - return [this](int32_t spec_id, - ManifestContent content) -> Result> { + Result WriteDataManifestWithFileRowIds( + std::optional manifest_first_row_id, + std::optional entry_first_row_id, int64_t snapshot_id, + int64_t file_size_override = 512) { + auto path = std::format("manifest-{}.avro", manifest_counter_++); + ICEBERG_ASSIGN_OR_RAISE(auto writer, + ManifestWriter::MakeWriter( + kRowIdFormatVersion, snapshot_id, path, file_io_, spec0_, + schema_, ManifestContent::kData, entry_first_row_id)); + auto f = std::make_shared(); + f->content = DataFile::Content::kData; + f->file_path = std::format("data/row-id-file-{}.parquet", manifest_counter_); + f->file_format = FileFormatType::kParquet; + f->partition = PartitionValues(std::vector{Literal::Long(0)}); + f->file_size_in_bytes = 1024; + f->record_count = 10; + f->partition_spec_id = kSpecId0; + f->first_row_id = entry_first_row_id; + ICEBERG_RETURN_UNEXPECTED( + writer->WriteExistingEntry(ManifestEntry{.status = ManifestStatus::kExisting, + .snapshot_id = snapshot_id, + .sequence_number = kSnapshotId, + .file_sequence_number = kSnapshotId, + .data_file = std::move(f)})); + ICEBERG_RETURN_UNEXPECTED(writer->Close()); + ICEBERG_ASSIGN_OR_RAISE(auto manifest_file, writer->ToManifestFile()); + manifest_file.manifest_length = file_size_override; + manifest_file.first_row_id = manifest_first_row_id; + manifest_file.added_snapshot_id = snapshot_id; + return manifest_file; + } + + ManifestWriterFactory MakeWriterFactory() { return MakeWriterFactory(kFormatVersion); } + + Result> MakeMergeManager( + int64_t target_size_bytes, int32_t min_count_to_merge, bool merge_enabled, + ManifestContent content = ManifestContent::kData) { + return ManifestMergeManager::Make( + content, target_size_bytes, min_count_to_merge, merge_enabled, file_io_, + [] { return kSnapshotId; }, + [this](const std::string& location) { return file_io_->DeleteFile(location); }); + } + + ManifestWriterFactory MakeWriterFactory(int8_t format_version) { + return [this, format_version]( + int32_t spec_id, + ManifestContent content) -> Result> { ++factory_call_count_; auto spec = spec_id == kSpecId0 ? spec0_ : spec1_; auto path = std::format("merged-{}.avro", manifest_counter_++); - return ManifestWriter::MakeWriter(kFormatVersion, kSnapshotId, path, file_io_, spec, - schema_, content); + return ManifestWriter::MakeWriter( + format_version, kSnapshotId, path, file_io_, spec, schema_, content, + content == ManifestContent::kData && format_version >= 3 + ? std::make_optional(1000) + : std::nullopt); }; } @@ -140,6 +200,12 @@ class ManifestMergeManagerTest : public ::testing::Test { return total; } + Result> ReadEntries(const ManifestFile& manifest) { + ICEBERG_ASSIGN_OR_RAISE(auto reader, + ManifestReader::Make(manifest, file_io_, schema_, spec0_)); + return reader->Entries(); + } + std::shared_ptr file_io_; std::shared_ptr schema_; std::shared_ptr spec0_; @@ -154,160 +220,276 @@ TEST_F(ManifestMergeManagerTest, MergeDisabled) { ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1)); ICEBERG_UNWRAP_OR_FAIL(auto m2, WriteManifest(kSpecId0, 1)); - ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/2, /*enabled=*/false); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/1024, /*min_count=*/2, + /*enabled=*/false)); ICEBERG_UNWRAP_OR_FAIL( - auto result, mgr.MergeManifests({m0, m1}, {m2}, kSnapshotId, *metadata_, file_io_, - MakeWriterFactory())); - // merge disabled → all 3 manifests returned, factory never called + auto result, mgr->MergeManifests({m0, m1}, {m2}, *metadata_, MakeWriterFactory())); EXPECT_EQ(result.size(), 3U); EXPECT_EQ(factory_call_count_, 0); } +TEST_F(ManifestMergeManagerTest, MakeRejectsNullParameters) { + EXPECT_THAT( + ManifestMergeManager::Make(ManifestContent::kData, /*target=*/1024, /*min_count=*/2, + /*enabled=*/true, nullptr, [] { return kSnapshotId; }), + IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT( + ManifestMergeManager::Make(ManifestContent::kData, /*target=*/1024, /*min_count=*/2, + /*enabled=*/true, file_io_, {}), + IsError(ErrorKind::kInvalidArgument)); +} + TEST_F(ManifestMergeManagerTest, BelowMinCountThreshold) { ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1)); ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1)); - // min_count=3, only 2 manifests total → no merge - ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/3, /*enabled=*/true); - ICEBERG_UNWRAP_OR_FAIL(auto result, - mgr.MergeManifests({m0}, {m1}, kSnapshotId, *metadata_, file_io_, - MakeWriterFactory())); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/1024, /*min_count=*/3, + /*enabled=*/true)); + ICEBERG_UNWRAP_OR_FAIL( + auto result, mgr->MergeManifests({m0}, {m1}, *metadata_, MakeWriterFactory())); EXPECT_EQ(result.size(), 2U); EXPECT_EQ(factory_call_count_, 0); } TEST_F(ManifestMergeManagerTest, MergeOccursAtThreshold) { - // 3 small manifests (each 100 bytes), target=1024 → all fit in one bin ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1, /*size=*/100)); ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); ICEBERG_UNWRAP_OR_FAIL(auto m2, WriteManifest(kSpecId0, 1, /*size=*/100)); - ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/3, /*enabled=*/true); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/1024, /*min_count=*/3, + /*enabled=*/true)); ICEBERG_UNWRAP_OR_FAIL( - auto result, mgr.MergeManifests({m0, m1}, {m2}, kSnapshotId, *metadata_, file_io_, - MakeWriterFactory())); - // All 3 merged into 1 manifest (total 3 entries) + auto result, mgr->MergeManifests({m0, m1}, {m2}, *metadata_, MakeWriterFactory())); EXPECT_EQ(result.size(), 1U); ICEBERG_UNWRAP_OR_FAIL(auto count1, CountEntries(result)); EXPECT_EQ(count1, 3); } +TEST_F(ManifestMergeManagerTest, ReplacedManifestCountTracksPreviousSnapshotInputs) { + ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1, /*size=*/100)); + ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); + m0.added_snapshot_id = kSnapshotId - 1; + m1.added_snapshot_id = kSnapshotId - 2; + + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/1024, /*min_count=*/2, + /*enabled=*/true)); + ICEBERG_UNWRAP_OR_FAIL( + auto result, mgr->MergeManifests({m0, m1}, {}, *metadata_, MakeWriterFactory())); + + EXPECT_EQ(result.size(), 1U); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 2); +} + +TEST_F(ManifestMergeManagerTest, MergeManifestsCachesMergedBinAcrossRetries) { + ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1, /*size=*/100)); + ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); + m0.added_snapshot_id = kSnapshotId - 1; + m1.added_snapshot_id = kSnapshotId - 2; + + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/1024, /*min_count=*/2, + /*enabled=*/true)); + ICEBERG_UNWRAP_OR_FAIL( + auto first, mgr->MergeManifests({m0, m1}, {}, *metadata_, MakeWriterFactory())); + ASSERT_EQ(first.size(), 1U); + EXPECT_EQ(factory_call_count_, 1); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 2); + + ICEBERG_UNWRAP_OR_FAIL( + auto second, mgr->MergeManifests({m0, m1}, {}, *metadata_, MakeWriterFactory())); + ASSERT_EQ(second.size(), 1U); + EXPECT_EQ(second[0].manifest_path, first[0].manifest_path); + EXPECT_EQ(factory_call_count_, 1); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 2); +} + +TEST_F(ManifestMergeManagerTest, CleanUncommittedDropsMergeCacheAndRollsBackCount) { + ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1, /*size=*/100)); + ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); + m0.added_snapshot_id = kSnapshotId - 1; + m1.added_snapshot_id = kSnapshotId - 2; + + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/1024, /*min_count=*/2, + /*enabled=*/true)); + ICEBERG_UNWRAP_OR_FAIL( + auto first, mgr->MergeManifests({m0, m1}, {}, *metadata_, MakeWriterFactory())); + ASSERT_EQ(first.size(), 1U); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 2); + + EXPECT_THAT(mgr->CleanUncommitted({}), IsOk()); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 0); + + ICEBERG_UNWRAP_OR_FAIL( + auto second, mgr->MergeManifests({m0, m1}, {}, *metadata_, MakeWriterFactory())); + ASSERT_EQ(second.size(), 1U); + EXPECT_NE(second[0].manifest_path, first[0].manifest_path); + EXPECT_EQ(factory_call_count_, 2); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 2); +} + +TEST_F(ManifestMergeManagerTest, CleanUncommittedDeletesMergedManifestWithCallback) { + ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1, /*size=*/100)); + ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); + m0.added_snapshot_id = kSnapshotId - 1; + m1.added_snapshot_id = kSnapshotId - 2; + + std::vector deleted_paths; + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + ManifestMergeManager::Make( + ManifestContent::kData, /*target=*/1024, /*min_count=*/2, + /*enabled=*/true, file_io_, [] { return kSnapshotId; }, + [&deleted_paths](const std::string& path) { + deleted_paths.push_back(path); + return Status{}; + })); + ICEBERG_UNWRAP_OR_FAIL( + auto result, mgr->MergeManifests({m0, m1}, {}, *metadata_, MakeWriterFactory())); + ASSERT_EQ(result.size(), 1U); + + EXPECT_THAT(mgr->CleanUncommitted({}), IsOk()); + + EXPECT_THAT(deleted_paths, ::testing::ElementsAre(result[0].manifest_path)); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 0); +} + +TEST_F(ManifestMergeManagerTest, CleanUncommittedIgnoresDeleteCallbackError) { + ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1, /*size=*/100)); + ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); + m0.added_snapshot_id = kSnapshotId - 1; + m1.added_snapshot_id = kSnapshotId - 2; + + ICEBERG_UNWRAP_OR_FAIL( + auto mgr, ManifestMergeManager::Make( + ManifestContent::kData, /*target=*/1024, /*min_count=*/2, + /*enabled=*/true, file_io_, [] { return kSnapshotId; }, + [](const std::string&) { return IOError("delete failed"); })); + ICEBERG_UNWRAP_OR_FAIL( + auto result, mgr->MergeManifests({m0, m1}, {}, *metadata_, MakeWriterFactory())); + ASSERT_EQ(result.size(), 1U); + + EXPECT_THAT(mgr->CleanUncommitted({}), IsOk()); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 0); +} + +TEST_F(ManifestMergeManagerTest, CleanUncommittedKeepsCommittedMergeCache) { + ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1, /*size=*/100)); + ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); + m0.added_snapshot_id = kSnapshotId - 1; + m1.added_snapshot_id = kSnapshotId - 2; + + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/1024, /*min_count=*/2, + /*enabled=*/true)); + ICEBERG_UNWRAP_OR_FAIL( + auto first, mgr->MergeManifests({m0, m1}, {}, *metadata_, MakeWriterFactory())); + ASSERT_EQ(first.size(), 1U); + + EXPECT_THAT(mgr->CleanUncommitted({first[0].manifest_path}), IsOk()); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 2); + + ICEBERG_UNWRAP_OR_FAIL( + auto second, mgr->MergeManifests({m0, m1}, {}, *metadata_, MakeWriterFactory())); + ASSERT_EQ(second.size(), 1U); + EXPECT_EQ(second[0].manifest_path, first[0].manifest_path); + EXPECT_EQ(factory_call_count_, 1); +} + +TEST_F(ManifestMergeManagerTest, ReplacedManifestCountIgnoresCurrentSnapshotInputs) { + ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1, /*size=*/100)); + ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); + + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/1024, /*min_count=*/2, + /*enabled=*/true)); + ICEBERG_UNWRAP_OR_FAIL( + auto result, mgr->MergeManifests({}, {m0, m1}, *metadata_, MakeWriterFactory())); + + EXPECT_EQ(result.size(), 1U); + EXPECT_EQ(mgr->ReplacedManifestsCount(), 0); +} + +TEST_F(ManifestMergeManagerTest, + MergePreservesCurrentSnapshotFileFirstRowIdsWhenManifestFirstRowIdIsNull) { + constexpr int64_t kEntryFirstRowId = 1234; + ICEBERG_UNWRAP_OR_FAIL(auto current, WriteDataManifestWithFileRowIds( + /*manifest_first_row_id=*/std::nullopt, + kEntryFirstRowId, kSnapshotId)); + ICEBERG_UNWRAP_OR_FAIL(auto previous, WriteManifest(kSpecId0, 1, /*size=*/512)); + previous.added_snapshot_id = 7; + + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/1024, /*min_count=*/2, + /*enabled=*/true)); + ICEBERG_UNWRAP_OR_FAIL(auto v3_metadata, BuildV3Metadata()); + ICEBERG_UNWRAP_OR_FAIL(auto result, + mgr->MergeManifests({previous}, {current}, *v3_metadata, + MakeWriterFactory(kRowIdFormatVersion))); + ASSERT_EQ(result.size(), 1U); + + ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadEntries(result[0])); + auto current_entry = std::ranges::find_if(entries, [](const ManifestEntry& entry) { + return entry.data_file != nullptr && + entry.data_file->file_path.find("row-id-file") != std::string::npos; + }); + ASSERT_NE(current_entry, entries.end()); + ASSERT_TRUE(current_entry->data_file->first_row_id.has_value()); + EXPECT_EQ(*current_entry->data_file->first_row_id, kEntryFirstRowId); +} + TEST_F(ManifestMergeManagerTest, OversizedManifestPassedThrough) { - // m_large exceeds target → must not be merged; m_small fits ICEBERG_UNWRAP_OR_FAIL(auto m_large, WriteManifest(kSpecId0, 2, /*size=*/2000)); ICEBERG_UNWRAP_OR_FAIL(auto m_small, WriteManifest(kSpecId0, 1, /*size=*/100)); ICEBERG_UNWRAP_OR_FAIL(auto m_small2, WriteManifest(kSpecId0, 1, /*size=*/100)); - ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/2, /*enabled=*/true); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/1024, /*min_count=*/2, + /*enabled=*/true)); ICEBERG_UNWRAP_OR_FAIL(auto result, - mgr.MergeManifests({m_large, m_small}, {m_small2}, kSnapshotId, - *metadata_, file_io_, MakeWriterFactory())); - // m_large is oversized and acts as a bin boundary — the two small manifests on either - // side of it are never merged together. m_small2 (the newest) is also protected by - // minCountToMerge (size 1 < 2). All three remain separate. + mgr->MergeManifests({m_large, m_small}, {m_small2}, *metadata_, + MakeWriterFactory())); EXPECT_EQ(result.size(), 3U); ICEBERG_UNWRAP_OR_FAIL(auto count2, CountEntries(result)); - EXPECT_EQ(count2, 4); // 2 + 1 + 1 + EXPECT_EQ(count2, 4); } TEST_F(ManifestMergeManagerTest, CrossSpecManifestsNotMerged) { - // Manifests with different spec IDs must never be merged together ICEBERG_UNWRAP_OR_FAIL(auto m_spec0a, WriteManifest(kSpecId0, 1, /*size=*/100)); ICEBERG_UNWRAP_OR_FAIL(auto m_spec0b, WriteManifest(kSpecId0, 1, /*size=*/100)); ICEBERG_UNWRAP_OR_FAIL(auto m_spec1a, WriteManifest(kSpecId1, 1, /*size=*/100)); ICEBERG_UNWRAP_OR_FAIL(auto m_spec1b, WriteManifest(kSpecId1, 1, /*size=*/100)); - // With 4 manifests (target large enough for each pair), we get 2 merged outputs - ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/2, /*enabled=*/true); - ICEBERG_UNWRAP_OR_FAIL( - auto result, - mgr.MergeManifests({m_spec0a, m_spec1a}, {m_spec0b, m_spec1b}, kSnapshotId, - *metadata_, file_io_, MakeWriterFactory())); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/1024, /*min_count=*/2, + /*enabled=*/true)); + ICEBERG_UNWRAP_OR_FAIL(auto result, + mgr->MergeManifests({m_spec0a, m_spec1a}, {m_spec0b, m_spec1b}, + *metadata_, MakeWriterFactory())); EXPECT_EQ(result.size(), 2U); - // Verify spec IDs are preserved per output manifest for (const auto& m : result) { EXPECT_THAT(m.partition_spec_id, ::testing::AnyOf(kSpecId0, kSpecId1)); } } TEST_F(ManifestMergeManagerTest, WriterFactoryCalledOncePerMergedManifest) { - // 4 small manifests in two groups → 2 merged outputs → factory called twice ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1, /*size=*/100)); ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); ICEBERG_UNWRAP_OR_FAIL(auto m2, WriteManifest(kSpecId1, 1, /*size=*/100)); ICEBERG_UNWRAP_OR_FAIL(auto m3, WriteManifest(kSpecId1, 1, /*size=*/100)); - ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/2, /*enabled=*/true); - ICEBERG_UNWRAP_OR_FAIL(auto result, - mgr.MergeManifests({m0, m2}, {m1, m3}, kSnapshotId, *metadata_, - file_io_, MakeWriterFactory())); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/1024, /*min_count=*/2, + /*enabled=*/true)); + ICEBERG_UNWRAP_OR_FAIL(auto result, mgr->MergeManifests({m0, m2}, {m1, m3}, *metadata_, + MakeWriterFactory())); EXPECT_EQ(result.size(), 2U); EXPECT_EQ(factory_call_count_, 2); } -TEST_F(ManifestMergeManagerTest, MixedContentManifestsNotMerged) { - // Data and delete manifests sharing the same spec_id must never be merged together. - // The grouping key is (spec_id, content), so they land in separate bins. - ICEBERG_UNWRAP_OR_FAIL( - auto d0, WriteManifest(kSpecId0, 1, /*size=*/100, ManifestContent::kData)); - ICEBERG_UNWRAP_OR_FAIL( - auto d1, WriteManifest(kSpecId0, 1, /*size=*/100, ManifestContent::kData)); - ICEBERG_UNWRAP_OR_FAIL( - auto del0, WriteManifest(kSpecId0, 1, /*size=*/100, ManifestContent::kDeletes)); - ICEBERG_UNWRAP_OR_FAIL( - auto del1, WriteManifest(kSpecId0, 1, /*size=*/100, ManifestContent::kDeletes)); - - ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/2, /*enabled=*/true); - ICEBERG_UNWRAP_OR_FAIL( - auto result, mgr.MergeManifests({d0, del0}, {d1, del1}, kSnapshotId, *metadata_, - file_io_, MakeWriterFactory())); - // 2 data → 1 merged data manifest; 2 delete → 1 merged delete manifest - EXPECT_EQ(result.size(), 2U); - int data_count = 0; - int delete_count = 0; - for (const auto& m : result) { - if (m.content == ManifestContent::kData) ++data_count; - if (m.content == ManifestContent::kDeletes) ++delete_count; - } - EXPECT_EQ(data_count, 1); - EXPECT_EQ(delete_count, 1); -} - -TEST_F(ManifestMergeManagerTest, MixedContentUsesFirstManifestPerContent) { +TEST_F(ManifestMergeManagerTest, UnexpectedContentRejected) { ICEBERG_UNWRAP_OR_FAIL( auto d0, WriteManifest(kSpecId0, 1, /*size=*/100, ManifestContent::kData)); - ICEBERG_UNWRAP_OR_FAIL( - auto d1, WriteManifest(kSpecId0, 1, /*size=*/100, ManifestContent::kData)); - ICEBERG_UNWRAP_OR_FAIL( - auto del0, WriteManifest(kSpecId0, 1, /*size=*/100, ManifestContent::kDeletes)); - ICEBERG_UNWRAP_OR_FAIL( - auto del1, WriteManifest(kSpecId0, 1, /*size=*/100, ManifestContent::kDeletes)); - // Each content type's newest manifest must be protected by the threshold - // independently. - ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/3, /*enabled=*/true); - ICEBERG_UNWRAP_OR_FAIL( - auto result, mgr.MergeManifests({d0, del0}, {d1, del1}, kSnapshotId, *metadata_, - file_io_, MakeWriterFactory())); - - // Each content type has exactly two manifests, below min_count=3, so neither pair - // should be merged. - ASSERT_EQ(result.size(), 4U); - int data_count = 0; - int delete_count = 0; - for (const auto& manifest : result) { - if (manifest.content == ManifestContent::kData) { - ++data_count; - } else if (manifest.content == ManifestContent::kDeletes) { - ++delete_count; - } - } - EXPECT_EQ(data_count, 2); - EXPECT_EQ(delete_count, 2); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + MakeMergeManager(/*target=*/1024, /*min_count=*/2, + /*enabled=*/true, ManifestContent::kDeletes)); + EXPECT_THAT(mgr->MergeManifests({}, {d0}, *metadata_, MakeWriterFactory()), + IsError(ErrorKind::kInvalidArgument)); } TEST_F(ManifestMergeManagerTest, DeleteManifestsMerged) { - // Delete manifests are bin-packed and merged just like data manifests. ICEBERG_UNWRAP_OR_FAIL( auto del0, WriteManifest(kSpecId0, 1, /*size=*/100, ManifestContent::kDeletes)); ICEBERG_UNWRAP_OR_FAIL( @@ -315,10 +497,12 @@ TEST_F(ManifestMergeManagerTest, DeleteManifestsMerged) { ICEBERG_UNWRAP_OR_FAIL( auto del2, WriteManifest(kSpecId0, 1, /*size=*/100, ManifestContent::kDeletes)); - ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/3, /*enabled=*/true); - ICEBERG_UNWRAP_OR_FAIL(auto result, - mgr.MergeManifests({del0, del1}, {del2}, kSnapshotId, *metadata_, - file_io_, MakeWriterFactory())); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, + MakeMergeManager(/*target=*/1024, /*min_count=*/3, + /*enabled=*/true, ManifestContent::kDeletes)); + ICEBERG_UNWRAP_OR_FAIL( + auto result, + mgr->MergeManifests({del0, del1}, {del2}, *metadata_, MakeWriterFactory())); EXPECT_EQ(result.size(), 1U); EXPECT_EQ(result[0].content, ManifestContent::kDeletes); ICEBERG_UNWRAP_OR_FAIL(auto count, CountEntries(result)); @@ -326,26 +510,16 @@ TEST_F(ManifestMergeManagerTest, DeleteManifestsMerged) { } TEST_F(ManifestMergeManagerTest, PackEndOlderManifestsMergedNotNewest) { - // packEnd semantics: for [m0_new, m1_old, m2_old] with target=250 (pairs fit but - // triples don't), packing from the end merges m1+m2 (the older pair) and leaves - // m0 (the newest) in its own under-filled bin at the front of the output. - // This is the opposite of naive forward packing, which would merge m0+m1. ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); ICEBERG_UNWRAP_OR_FAIL(auto m2, WriteManifest(kSpecId0, 1, /*size=*/100)); ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1, /*size=*/100)); - // target=250 fits two 100-byte manifests but not three. - // min_count=3 so m0's single-element bin is kept as-is (below threshold). - ManifestMergeManager mgr(/*target=*/250, /*min_count=*/3, /*enabled=*/true); + ICEBERG_UNWRAP_OR_FAIL(auto mgr, MakeMergeManager(/*target=*/250, /*min_count=*/3, + /*enabled=*/true)); ICEBERG_UNWRAP_OR_FAIL( - auto result, mgr.MergeManifests({m1, m2}, {m0}, kSnapshotId, *metadata_, file_io_, - MakeWriterFactory())); - // Expected: [m0 (pass-through), merged(m1+m2)] + auto result, mgr->MergeManifests({m1, m2}, {m0}, *metadata_, MakeWriterFactory())); ASSERT_EQ(result.size(), 2U); - // First output is the newest manifest m0, passed through unchanged (under-filled bin). EXPECT_EQ(result[0].manifest_length, m0.manifest_length); - // Second output is the merged older pair — it must be a newly written manifest - // (different path than either original). EXPECT_NE(result[1].manifest_path, m1.manifest_path); EXPECT_NE(result[1].manifest_path, m2.manifest_path); ICEBERG_UNWRAP_OR_FAIL(auto count, CountEntries(result)); diff --git a/src/iceberg/test/manifest_reader_test.cc b/src/iceberg/test/manifest_reader_test.cc index 3f85729a7..f2c4ef406 100644 --- a/src/iceberg/test/manifest_reader_test.cc +++ b/src/iceberg/test/manifest_reader_test.cc @@ -190,6 +190,41 @@ TEST_P(TestManifestReader, TestManifestReaderWithEmptyInheritableMetadata) { EXPECT_EQ(read_entry.snapshot_id, 1000L); } +TEST_P(TestManifestReader, DeletedEntriesDoNotInheritFirstRowId) { + auto version = GetParam(); + if (version < 3) { + GTEST_SKIP() << "first_row_id is only assigned in V3 manifests"; + } + + auto deleted_file = + MakeDataFile("/path/to/deleted.parquet", PartitionValues({Literal::Int(0)}), + /*record_count=*/10); + auto added_file = + MakeDataFile("/path/to/added.parquet", PartitionValues({Literal::Int(1)}), + /*record_count=*/5); + + auto deleted_entry = + MakeEntry(ManifestStatus::kDeleted, /*snapshot_id=*/1000L, std::move(deleted_file)); + deleted_entry.sequence_number = 0; + deleted_entry.file_sequence_number = 0; + + std::vector entries; + entries.push_back(std::move(deleted_entry)); + entries.push_back( + MakeEntry(ManifestStatus::kAdded, /*snapshot_id=*/1000L, std::move(added_file))); + auto manifest = WriteManifest(version, /*snapshot_id=*/1000L, std::move(entries)); + + ICEBERG_UNWRAP_OR_FAIL(auto reader, + ManifestReader::Make(manifest, file_io_, schema_, spec_)); + ICEBERG_UNWRAP_OR_FAIL(auto read_entries, reader->Entries()); + + ASSERT_EQ(read_entries.size(), 2U); + EXPECT_EQ(read_entries[0].status, ManifestStatus::kDeleted); + EXPECT_EQ(read_entries[0].data_file->first_row_id, std::nullopt); + EXPECT_EQ(read_entries[1].status, ManifestStatus::kAdded); + EXPECT_EQ(read_entries[1].data_file->first_row_id, 0); +} + TEST_P(TestManifestReader, TestReaderWithFilterWithoutSelect) { auto version = GetParam(); auto file_a = diff --git a/src/iceberg/test/manifest_writer_versions_test.cc b/src/iceberg/test/manifest_writer_versions_test.cc index 990224528..24669d5b5 100644 --- a/src/iceberg/test/manifest_writer_versions_test.cc +++ b/src/iceberg/test/manifest_writer_versions_test.cc @@ -435,6 +435,20 @@ TEST_F(ManifestWriterVersionsTest, TestV1WriteDelete) { "Cannot write equality_deletes file to data manifest file")); } +TEST_F(ManifestWriterVersionsTest, TestWriteAddedEntryRejectsMissingDataFile) { + const std::string manifest_path = CreateManifestPath(); + ICEBERG_UNWRAP_OR_FAIL( + auto writer, ManifestWriter::MakeWriter(/*format_version=*/2, kSnapshotId, + manifest_path, file_io_, spec_, schema_)); + + ManifestEntry entry; + entry.snapshot_id = kSnapshotId; + + auto status = writer->WriteAddedEntry(entry); + EXPECT_THAT(status, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(status, HasErrorMessage("Data file cannot be null")); +} + TEST_F(ManifestWriterVersionsTest, TestV1WriteWithInheritance) { auto manifests = WriteAndReadManifests({WriteManifest(/*format_version=*/1, {data_file_})}, 1); diff --git a/src/iceberg/test/merging_snapshot_update_test.cc b/src/iceberg/test/merging_snapshot_update_test.cc new file mode 100644 index 000000000..69ee10b91 --- /dev/null +++ b/src/iceberg/test/merging_snapshot_update_test.cc @@ -0,0 +1,1706 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/update/merging_snapshot_update.h" + +#include +#include +#include +#include + +#include +#include + +#include "iceberg/avro/avro_register.h" +#include "iceberg/constants.h" +#include "iceberg/expression/expressions.h" +#include "iceberg/manifest/manifest_entry.h" +#include "iceberg/manifest/manifest_reader.h" +#include "iceberg/manifest/manifest_writer.h" +#include "iceberg/partition_spec.h" +#include "iceberg/row/partition_values.h" +#include "iceberg/schema.h" +#include "iceberg/snapshot.h" +#include "iceberg/table.h" +#include "iceberg/table_metadata.h" +#include "iceberg/table_properties.h" +#include "iceberg/test/matchers.h" +#include "iceberg/test/update_test_base.h" +#include "iceberg/transaction.h" +#include "iceberg/update/fast_append.h" +#include "iceberg/update/update_properties.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +/// \brief Concrete subclass of MergingSnapshotUpdate for testing. +class TestMergeAppend : public MergingSnapshotUpdate { + public: + static Result> Make(std::string table_name, + std::shared_ptr table) { + ICEBERG_ASSIGN_OR_RAISE( + auto ctx, TransactionContext::Make(std::move(table), TransactionKind::kUpdate)); + return std::unique_ptr( + new TestMergeAppend(std::move(table_name), std::move(ctx))); + } + + std::string operation() override { return "append"; } + + // Expose protected API for test access + Status AddFile(std::shared_ptr file) { return AddDataFile(std::move(file)); } + Status AddDelete(std::shared_ptr file) { + return AddDeleteFile(std::move(file)); + } + Status AddDelete(std::shared_ptr file, int64_t data_sequence_number) { + return AddDeleteFile(std::move(file), data_sequence_number); + } + Status ValidateNoNewDeletesForDataFiles(const TableMetadata& metadata, + int64_t starting_snapshot_id, + std::shared_ptr data_filter, + const DataFileSet& replaced_files, + const std::shared_ptr& parent, + std::shared_ptr io) const { + return MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( + metadata, starting_snapshot_id, std::move(data_filter), replaced_files, parent, + std::move(io), IsCaseSensitive()); + } + Status RemoveDataFile(std::shared_ptr file) { + return DeleteDataFile(std::move(file)); + } + Status RemoveDeleteFile(std::shared_ptr file) { + return DeleteDeleteFile(std::move(file)); + } + Status AppendManifest(ManifestFile manifest) { + return AddManifest(std::move(manifest)); + } + Result> DataSpec() const { + return MergingSnapshotUpdate::DataSpec(); + } + int64_t GeneratedSnapshotId() { return SnapshotId(); } + void SetDataSeqNumber(int64_t seq) { SetNewDataFilesDataSequenceNumber(seq); } + void SetCaseSensitive(bool case_sensitive) { CaseSensitive(case_sensitive); } + static Status ValidateAddedDataFilesForTest(const TableMetadata& metadata, + std::optional starting_snapshot_id, + const std::shared_ptr& parent, + std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateAddedDataFiles(metadata, starting_snapshot_id, + nullptr, parent, std::move(io)); + } + static Status ValidateAddedDataFilesForTest(const TableMetadata& metadata, + int64_t starting_snapshot_id, + const PartitionSet& partition_set, + const std::shared_ptr& parent, + std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateAddedDataFiles( + metadata, starting_snapshot_id, partition_set, parent, std::move(io)); + } + static Status ValidateDataFilesExistForTest( + const TableMetadata& metadata, int64_t starting_snapshot_id, + const std::unordered_set& file_paths, bool skip_deletes, + std::shared_ptr filter, const std::shared_ptr& parent, + std::shared_ptr io, bool case_sensitive = true) { + return MergingSnapshotUpdate::ValidateDataFilesExist( + metadata, starting_snapshot_id, file_paths, skip_deletes, std::move(filter), + parent, std::move(io), case_sensitive); + } + static Status ValidateNoNewDeletesForDataFilesForTest( + const TableMetadata& metadata, int64_t starting_snapshot_id, + const DataFileSet& replaced_files, const std::shared_ptr& parent, + std::shared_ptr io, bool ignore_equality_deletes = false) { + return MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( + metadata, starting_snapshot_id, replaced_files, parent, std::move(io), + ignore_equality_deletes); + } + static Status ValidateNoNewDeletesForDataFilesForTest( + const TableMetadata& metadata, int64_t starting_snapshot_id, + std::shared_ptr data_filter, const DataFileSet& replaced_files, + const std::shared_ptr& parent, std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( + metadata, starting_snapshot_id, std::move(data_filter), replaced_files, parent, + std::move(io)); + } + static Status ValidateNoNewDeleteFilesForTest(const TableMetadata& metadata, + int64_t starting_snapshot_id, + std::shared_ptr data_filter, + const std::shared_ptr& parent, + std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateNoNewDeleteFiles( + metadata, starting_snapshot_id, std::move(data_filter), parent, std::move(io)); + } + static Status ValidateNoNewDeleteFilesForTest(const TableMetadata& metadata, + int64_t starting_snapshot_id, + const PartitionSet& partition_set, + const std::shared_ptr& parent, + std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateNoNewDeleteFiles( + metadata, starting_snapshot_id, partition_set, parent, std::move(io)); + } + static Status ValidateDeletedDataFilesForTest(const TableMetadata& metadata, + int64_t starting_snapshot_id, + std::shared_ptr data_filter, + const std::shared_ptr& parent, + std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateDeletedDataFiles( + metadata, starting_snapshot_id, std::move(data_filter), parent, std::move(io)); + } + static Status ValidateDeletedDataFilesForTest(const TableMetadata& metadata, + int64_t starting_snapshot_id, + const PartitionSet& partition_set, + const std::shared_ptr& parent, + std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateDeletedDataFiles( + metadata, starting_snapshot_id, partition_set, parent, std::move(io)); + } + static Status ValidateAddedDVsForTest( + const TableMetadata& metadata, int64_t starting_snapshot_id, + std::shared_ptr conflict_filter, + const std::unordered_set& referenced_data_files, + const std::shared_ptr& parent, std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateAddedDVs( + metadata, starting_snapshot_id, std::move(conflict_filter), referenced_data_files, + parent, std::move(io)); + } + + bool HasDataFiles() const { return AddsDataFiles(); } + bool HasDeleteFiles() const { return AddsDeleteFiles(); } + bool HasDataDeletes() const { return DeletesDataFiles(); } + + private: + TestMergeAppend(std::string table_name, std::shared_ptr ctx) + : MergingSnapshotUpdate(std::move(table_name), std::move(ctx)) {} +}; + +class TestOverwriteUpdate : public MergingSnapshotUpdate { + public: + static Result> Make(std::string table_name, + std::shared_ptr
table) { + ICEBERG_ASSIGN_OR_RAISE( + auto ctx, TransactionContext::Make(std::move(table), TransactionKind::kUpdate)); + return std::unique_ptr( + new TestOverwriteUpdate(std::move(table_name), std::move(ctx))); + } + + std::string operation() override { return DataOperation::kOverwrite; } + int64_t GeneratedSnapshotId() { return SnapshotId(); } + + Status AddDelete(std::shared_ptr file) { + return AddDeleteFile(std::move(file)); + } + Status AddDelete(std::shared_ptr file, int64_t data_sequence_number) { + return AddDeleteFile(std::move(file), data_sequence_number); + } + Status RemoveDataFile(std::shared_ptr file) { + return DeleteDataFile(std::move(file)); + } + + private: + TestOverwriteUpdate(std::string table_name, std::shared_ptr ctx) + : MergingSnapshotUpdate(std::move(table_name), std::move(ctx)) {} +}; + +class MergingSnapshotUpdateTest : public MinimalUpdateTestBase { + protected: + static void SetUpTestSuite() { avro::RegisterAll(); } + + void SetUp() override { + MinimalUpdateTestBase::SetUp(); + + ICEBERG_UNWRAP_OR_FAIL(spec_, table_->spec()); + ICEBERG_UNWRAP_OR_FAIL(schema_, table_->schema()); + + file_a_ = MakeDataFile("/data/file_a.parquet", /*partition_x=*/1L); + file_b_ = MakeDataFile("/data/file_b.parquet", /*partition_x=*/2L); + } + + std::shared_ptr MakeDataFile(const std::string& path, int64_t partition_x) { + auto f = std::make_shared(); + f->content = DataFile::Content::kData; + f->file_path = table_location_ + path; + f->file_format = FileFormatType::kParquet; + f->partition = PartitionValues(std::vector{Literal::Long(partition_x)}); + f->file_size_in_bytes = 1024; + f->record_count = 100; + f->partition_spec_id = spec_->spec_id(); + return f; + } + + std::shared_ptr MakeDeleteFile(const std::string& path, int64_t partition_x) { + auto f = MakeDataFile(path, partition_x); + f->content = DataFile::Content::kPositionDeletes; + return f; + } + + std::shared_ptr MakeEqualityDeleteFile(const std::string& path, + int64_t partition_x) { + auto f = MakeDeleteFile(path, partition_x); + f->content = DataFile::Content::kEqualityDeletes; + f->equality_ids = {1}; + return f; + } + + Result> NewMergeAppend() { + return TestMergeAppend::Make(TableName(), table_); + } + + Result> NewOverwriteUpdate() { + return TestOverwriteUpdate::Make(TableName(), table_); + } + + void SetTableFormatVersion(int8_t format_version) { + table_->metadata()->format_version = format_version; + } + + // Commit file_a_ with FastAppend and refresh the table. + void CommitFileA() { + ICEBERG_UNWRAP_OR_FAIL(auto fa, table_->NewFastAppend()); + fa->AppendFile(file_a_); + EXPECT_THAT(fa->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + } + + // Read all entries from a list of ManifestFiles. + Result> ReadAllEntries( + const std::vector& manifests, const TableMetadata& metadata) { + std::vector result; + for (const auto& m : manifests) { + ICEBERG_ASSIGN_OR_RAISE(auto spec, metadata.PartitionSpecById(m.partition_spec_id)); + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto reader, + ManifestReader::Make(m, file_io_, schema, spec)); + ICEBERG_ASSIGN_OR_RAISE(auto entries, reader->Entries()); + result.insert(result.end(), entries.begin(), entries.end()); + } + return result; + } + + // Write a manifest file containing the given data files. + // Returns a ManifestFile with added_snapshot_id = kInvalidSnapshotId so it + // is eligible for snapshot ID inheritance. + Result WriteManifest( + const std::string& path, const std::vector>& files) { + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + ManifestWriter::MakeWriter(/*format_version=*/2, kInvalidSnapshotId, path, + file_io_, spec_, schema_, ManifestContent::kData)); + for (const auto& f : files) { + ManifestEntry entry; + entry.status = ManifestStatus::kAdded; + entry.snapshot_id = std::nullopt; + entry.data_file = f; + ICEBERG_RETURN_UNEXPECTED(writer->WriteAddedEntry(entry)); + } + ICEBERG_RETURN_UNEXPECTED(writer->Close()); + return writer->ToManifestFile(); + } + + Result WriteDataManifest( + const TableMetadata& metadata, const std::string& path, + const std::vector>& files, int64_t snapshot_id, + int64_t sequence_number) { + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto spec, metadata.PartitionSpecById(spec_->spec_id())); + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + ManifestWriter::MakeWriter(metadata.format_version, snapshot_id, path, file_io_, + spec, schema, ManifestContent::kData)); + for (const auto& f : files) { + ICEBERG_RETURN_UNEXPECTED(writer->WriteAddedEntry(f, sequence_number)); + } + ICEBERG_RETURN_UNEXPECTED(writer->Close()); + return writer->ToManifestFile(); + } + + Result WriteDeleteManifest( + const TableMetadata& metadata, const std::string& path, + const std::vector>& files, int64_t snapshot_id, + int64_t sequence_number) { + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto spec, metadata.PartitionSpecById(spec_->spec_id())); + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + ManifestWriter::MakeWriter(metadata.format_version, snapshot_id, path, file_io_, + spec, schema, ManifestContent::kDeletes)); + for (const auto& f : files) { + ManifestEntry entry; + entry.status = ManifestStatus::kAdded; + entry.snapshot_id = snapshot_id; + entry.sequence_number = sequence_number; + entry.data_file = f; + ICEBERG_RETURN_UNEXPECTED(writer->WriteAddedEntry(entry)); + } + ICEBERG_RETURN_UNEXPECTED(writer->Close()); + return writer->ToManifestFile(); + } + + Result> MakeSyntheticSnapshot( + std::string operation, int64_t snapshot_id, + std::optional parent_snapshot_id, int64_t sequence_number, + const std::vector& manifests) { + auto manifest_list_path = table_location_ + "/metadata/manifest-list-" + + std::to_string(snapshot_id) + ".avro"; + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + ManifestListWriter::MakeWriter(table_->metadata()->format_version, snapshot_id, + parent_snapshot_id, manifest_list_path, file_io_, + sequence_number)); + ICEBERG_RETURN_UNEXPECTED(writer->AddAll(manifests)); + ICEBERG_RETURN_UNEXPECTED(writer->Close()); + + ICEBERG_ASSIGN_OR_RAISE( + auto snapshot, + Snapshot::Make(sequence_number, snapshot_id, parent_snapshot_id, TimePointMs{}, + std::move(operation), {}, table_->metadata()->current_schema_id, + manifest_list_path)); + return std::shared_ptr(std::move(snapshot)); + } + + std::shared_ptr spec_; + std::shared_ptr schema_; + std::shared_ptr file_a_; + std::shared_ptr file_b_; +}; + +// ------------------------------------------------------------------------- +// State query tests +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, AddsDataFilesInitiallyFalse) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_FALSE(op->HasDataFiles()); + EXPECT_FALSE(op->HasDeleteFiles()); + EXPECT_FALSE(op->HasDataDeletes()); +} + +TEST_F(MergingSnapshotUpdateTest, AddsDataFilesTrueAfterAdd) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_TRUE(op->HasDataFiles()); + EXPECT_FALSE(op->HasDeleteFiles()); +} + +TEST_F(MergingSnapshotUpdateTest, AddsDeleteFilesTrueAfterAdd) { + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + EXPECT_FALSE(op->HasDataFiles()); + EXPECT_TRUE(op->HasDeleteFiles()); +} + +TEST_F(MergingSnapshotUpdateTest, DeletesDataFilesTrueAfterRegisterDelete) { + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->RemoveDataFile(file_a_), IsOk()); + EXPECT_TRUE(op->HasDataDeletes()); +} + +// ------------------------------------------------------------------------- +// Apply / Commit tests +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, CommitNewDataFile) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedRecords), "100"); +} + +TEST_F(MergingSnapshotUpdateTest, CommitV3NewDataFileAssignsRowLineage) { + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(TableProperties::kFormatVersion.key(), "3"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + ICEBERG_UNWRAP_OR_FAIL(auto first_row_id, snapshot->FirstRowId()); + ICEBERG_UNWRAP_OR_FAIL(auto added_rows, snapshot->AddedRows()); + EXPECT_EQ(first_row_id, std::make_optional(0)); + EXPECT_EQ(added_rows, std::make_optional(100)); + EXPECT_EQ(table_->metadata()->next_row_id, 100); +} + +TEST_F(MergingSnapshotUpdateTest, CommitMultipleDataFiles) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "2"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedRecords), "200"); +} + +TEST_F(MergingSnapshotUpdateTest, CommitDataFileAndDeleteFile) { + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + // Data file summary + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); +} + +TEST_F(MergingSnapshotUpdateTest, CommitPreservesExistingManifests) { + // First append: file_a + CommitFileA(); + + // Second merge append: file_b + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "2"); +} + +TEST_F(MergingSnapshotUpdateTest, CommitDeletesDataFile) { + CommitFileA(); + + // Remove file_a via merging snapshot update + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->RemoveDataFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "0"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kDeletedDataFiles), "1"); +} + +TEST_F(MergingSnapshotUpdateTest, SetNewDataFilesDataSequenceNumber) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + op->SetDataSeqNumber(42); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); + + auto snapshot_cache = SnapshotCache(snapshot.get()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, snapshot_cache.DataManifests(table_->io())); + std::vector manifests(data_manifests.begin(), data_manifests.end()); + ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(manifests, *table_->metadata())); + ASSERT_EQ(entries.size(), 1U); + EXPECT_EQ(entries[0].sequence_number, 42); +} + +TEST_F(MergingSnapshotUpdateTest, AddDataFileDoesNotMutateCallerFile) { + auto file = MakeDataFile("/data/with-row-id.parquet", 1L); + file->first_row_id = 42; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file), IsOk()); + + EXPECT_EQ(file->first_row_id, 42); +} + +TEST_F(MergingSnapshotUpdateTest, CustomSummaryPropertySurvivesApplyRebuild) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + op->Set("custom-prop", "custom-value"); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at("custom-prop"), "custom-value"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); +} + +TEST_F(MergingSnapshotUpdateTest, BaseSetCustomSummaryPropertySurvivesApplyRebuild) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + SnapshotUpdate& base_update = *op; + base_update.Set("custom-prop", "custom-value"); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at("custom-prop"), "custom-value"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); +} + +// ------------------------------------------------------------------------- +// CleanUncommitted test +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, CleanUncommittedAfterSuccessfulCommitDoesNotCrash) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + // Cleanup may run from an error handler even after commit success. + EXPECT_THAT(op->CleanUncommitted({}), IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, + CleanUncommittedDeletesManagerOutputsWithDeleteCallback) { + ICEBERG_UNWRAP_OR_FAIL(auto initial, NewMergeAppend()); + EXPECT_THAT(initial->AddFile(file_a_), IsOk()); + EXPECT_THAT(initial->AddFile(file_b_), IsOk()); + EXPECT_THAT(initial->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + std::vector deleted_paths; + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + op->DeleteWith([&deleted_paths](const std::string& path) { + deleted_paths.push_back(path); + return Status{}; + }); + EXPECT_THAT(op->RemoveDataFile(file_a_), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL( + auto manifests, op->Apply(*table_->metadata(), table_->current_snapshot().value())); + EXPECT_THAT(manifests, ::testing::SizeIs(1)); + + EXPECT_THAT(op->CleanUncommitted({}), IsOk()); + EXPECT_THAT(deleted_paths, ::testing::Contains(::testing::HasSubstr("/metadata/"))); +} + +// ------------------------------------------------------------------------- +// Delete file summary tests +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, CommitDeleteFileSummaryHasAddedDeleteFiles) { + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDeleteFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedPosDeleteFiles), "1"); + EXPECT_EQ(snapshot->summary.count(SnapshotSummaryFields::kRemovedDeleteFiles), 0); +} + +TEST_F(MergingSnapshotUpdateTest, CommitDeduplicatesStagedDeleteFiles) { + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDeleteFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedPosDeleteFiles), "1"); +} + +TEST_F(MergingSnapshotUpdateTest, AddDeleteFileWithExplicitSequenceWritesSequenceNumber) { + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file, 17), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), nullptr)); + auto delete_manifest_it = + std::ranges::find_if(manifests, [](const ManifestFile& manifest) { + return manifest.content == ManifestContent::kDeletes; + }); + ASSERT_NE(delete_manifest_it, manifests.end()); + ICEBERG_UNWRAP_OR_FAIL(auto entries, + ReadAllEntries(std::vector{*delete_manifest_it}, + *table_->metadata())); + ASSERT_EQ(entries.size(), 1U); + ASSERT_TRUE(entries[0].sequence_number.has_value()); + EXPECT_EQ(entries[0].sequence_number.value(), 17); +} + +TEST_F(MergingSnapshotUpdateTest, ApplyRebuildsDeleteSummaryAfterPreparingDeletes) { + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto first_manifests, op->Apply(*table_->metadata(), nullptr)); + EXPECT_THAT(first_manifests, ::testing::Contains(::testing::Field( + &ManifestFile::content, ManifestContent::kDeletes))); + + ICEBERG_UNWRAP_OR_FAIL(auto second_manifests, op->Apply(*table_->metadata(), nullptr)); + EXPECT_THAT(second_manifests, ::testing::Contains(::testing::Field( + &ManifestFile::content, ManifestContent::kDeletes))); + + auto summary = op->Summary(); + EXPECT_EQ(summary.at(SnapshotSummaryFields::kAddedDeleteFiles), "1"); + EXPECT_EQ(summary.at(SnapshotSummaryFields::kAddedPosDeleteFiles), "1"); +} + +// Covers the bug where deleted delete files were not tracked in the snapshot summary. +TEST_F(MergingSnapshotUpdateTest, CommitDeletesDeleteFileSummaryHasRemovedDeleteFiles) { + // Step 1: commit a delete file. + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + } + + // Step 2: commit a new snapshot that removes the delete file. + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->RemoveDeleteFile(del_file), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kRemovedDeleteFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kRemovedPosDeleteFiles), "1"); + EXPECT_EQ(snapshot->summary.count(SnapshotSummaryFields::kAddedDeleteFiles), 0); +} + +// ------------------------------------------------------------------------- +// Deduplication test +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, DuplicateDataFileOnlyCountedOnce) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "1"); +} + +TEST_F(MergingSnapshotUpdateTest, CommitSkipsMalformedPreviousSummaryTotal) { + { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + } + + ICEBERG_UNWRAP_OR_FAIL(auto previous_snapshot, table_->current_snapshot()); + previous_snapshot->summary[SnapshotSummaryFields::kTotalRecords] = "not-a-number"; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.count(SnapshotSummaryFields::kTotalRecords), 0U); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedRecords), "100"); +} + +// ------------------------------------------------------------------------- +// ValidateNewDeleteFile format version tests +// ------------------------------------------------------------------------- + +/// \brief V1-table test fixture. +class MergingSnapshotUpdateV1Test : public UpdateTestBase { + protected: + std::string MetadataResource() const override { return "TableMetadataV1Valid.json"; } + std::string TableName() const override { return "v1_test_table"; } + + void SetUp() override { + UpdateTestBase::SetUp(); + ICEBERG_UNWRAP_OR_FAIL(spec_, table_->spec()); + } + + std::shared_ptr MakeDeleteFile(const std::string& path) { + auto f = std::make_shared(); + f->content = DataFile::Content::kPositionDeletes; + f->file_path = table_location_ + path; + f->file_format = FileFormatType::kParquet; + f->partition = PartitionValues(std::vector{Literal::Long(1L)}); + f->file_size_in_bytes = 512; + f->record_count = 10; + f->partition_spec_id = spec_->spec_id(); + return f; + } + + Result> NewMergeAppend() { + return TestMergeAppend::Make(TableName(), table_); + } + + std::shared_ptr spec_; +}; + +TEST_F(MergingSnapshotUpdateV1Test, ValidateNewDeleteFileV1Rejected) { + auto del_file = MakeDeleteFile("/delete/del_a.parquet"); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNewDeleteFileV2AllowsReferencedPositionDelete) { + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + del_file->referenced_data_file = table_location_ + "/data/file_a.parquet"; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNewDeleteFileV2RejectsDeletionVector) { + auto del_file = MakeDeleteFile("/delete/dv_a.puffin", 1L); + del_file->file_format = FileFormatType::kPuffin; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNewDeleteFileV2AllowsEqualityDelete) { + auto eq_del = MakeDeleteFile("/delete/eq_del.parquet", 1L); + eq_del->content = DataFile::Content::kEqualityDeletes; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(eq_del), IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNewDeleteFileV3RejectsNonDVPositionDelete) { + SetTableFormatVersion(3); + + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNewDeleteFileV3AllowsDeletionVector) { + SetTableFormatVersion(3); + + auto del_file = MakeDeleteFile("/delete/dv_a.puffin", 1L); + del_file->file_format = FileFormatType::kPuffin; + del_file->referenced_data_file = file_a_->file_path; + del_file->content_offset = 0; + del_file->content_size_in_bytes = 10; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNewDeleteFileRejectsUnsupportedVersion) { + SetTableFormatVersion(TableMetadata::kSupportedTableFormatVersion + 1); + + auto del_file = MakeDeleteFile("/delete/dv_a.puffin", 1L); + del_file->file_format = FileFormatType::kPuffin; + del_file->referenced_data_file = file_a_->file_path; + del_file->content_offset = 0; + del_file->content_size_in_bytes = 10; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, ApplyRejectsV2StagedPositionDeleteAfterV3Upgrade) { + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->format_version = 3; + EXPECT_THAT(op->Apply(*metadata, nullptr), IsError(ErrorKind::kInvalidArgument)); +} + +// ------------------------------------------------------------------------- +// AddManifest protected primitive behavior +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, AddManifestRejectsDeleteManifest) { + ManifestFile del_manifest; + del_manifest.manifest_path = table_location_ + "/metadata/del.avro"; + del_manifest.content = ManifestContent::kDeletes; + del_manifest.added_snapshot_id = kInvalidSnapshotId; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(del_manifest), IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, AddManifestPrimitiveAllowsExistingFilesCount) { + ManifestFile manifest; + manifest.manifest_path = table_location_ + "/metadata/existing.avro"; + manifest.content = ManifestContent::kData; + manifest.added_snapshot_id = kInvalidSnapshotId; + manifest.existing_files_count = 1; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, AddManifestPrimitiveAllowsDeletedFilesCount) { + ManifestFile manifest; + manifest.manifest_path = table_location_ + "/metadata/deleted.avro"; + manifest.content = ManifestContent::kData; + manifest.added_snapshot_id = kInvalidSnapshotId; + manifest.deleted_files_count = 1; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, AddManifestCopiesManifestWithAssignedSnapshotId) { + auto path = table_location_ + "/metadata/snap.avro"; + ICEBERG_UNWRAP_OR_FAIL(auto manifest, WriteManifest(path, {file_a_})); + manifest.added_snapshot_id = 12345; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + SnapshotCache snapshot_cache(snapshot.get()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, snapshot_cache.DataManifests(file_io_)); + ASSERT_EQ(data_manifests.size(), 1U); + EXPECT_NE(data_manifests[0].manifest_path, path); +} + +TEST_F(MergingSnapshotUpdateTest, AddManifestRejectsManifestWithFirstRowId) { + auto path = table_location_ + "/metadata/rowid.avro"; + ICEBERG_UNWRAP_OR_FAIL(auto manifest, WriteManifest(path, {file_a_})); + manifest.first_row_id = 0; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(manifest), IsError(ErrorKind::kInvalidArgument)); +} + +// ------------------------------------------------------------------------- +// AddManifest basic commit behavior +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, AppendManifestEmptyTable) { + auto path = table_location_ + "/metadata/input.avro"; + ICEBERG_UNWRAP_OR_FAIL(auto manifest, WriteManifest(path, {file_a_, file_b_})); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + SnapshotCache snapshot_cache(snapshot.get()); + + // In v2 with snapshot ID inheritance, the manifest path is reused directly. + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, snapshot_cache.DataManifests(file_io_)); + ASSERT_EQ(data_manifests.size(), 1); + + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "2"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "2"); +} + +TEST_F(MergingSnapshotUpdateTest, AppendManifestWithDataFiles) { + auto path = table_location_ + "/metadata/input.avro"; + ICEBERG_UNWRAP_OR_FAIL(auto manifest, WriteManifest(path, {file_a_, file_b_})); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); // file_b_ staged directly + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + SnapshotCache snapshot_cache(snapshot.get()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, snapshot_cache.DataManifests(file_io_)); + // Written manifest (file_b_) + appended manifest (file_a_, file_b_) + EXPECT_EQ(data_manifests.size(), 2); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "3"); +} + +// ------------------------------------------------------------------------- +// AddManifest merge behavior +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, AppendManifestMergeWithMinCountOne) { + // Set min-count-to-merge = 1 so all manifests are merged. + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(std::string(TableProperties::kManifestMinMergeCount.key()), "1"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + auto path = table_location_ + "/metadata/input.avro"; + ICEBERG_UNWRAP_OR_FAIL(auto manifest, WriteManifest(path, {file_a_, file_b_})); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + SnapshotCache snapshot_cache(snapshot.get()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, snapshot_cache.DataManifests(file_io_)); + // Both manifests merged into one. + EXPECT_EQ(data_manifests.size(), 1); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "3"); +} + +TEST_F(MergingSnapshotUpdateTest, AppendManifestDoNotMergeMinCount) { + // Set min-count-to-merge = 4 so 3 manifests are not merged. + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(std::string(TableProperties::kManifestMinMergeCount.key()), "4"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + auto path1 = table_location_ + "/metadata/m1.avro"; + auto path2 = table_location_ + "/metadata/m2.avro"; + auto path3 = table_location_ + "/metadata/m3.avro"; + ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(path1, {file_a_})); + ICEBERG_UNWRAP_OR_FAIL(auto m2, WriteManifest(path2, {file_b_})); + ICEBERG_UNWRAP_OR_FAIL( + auto m3, WriteManifest(path3, {MakeDataFile("/data/file_c.parquet", 3L)})); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(m1), IsOk()); + EXPECT_THAT(op->AppendManifest(m2), IsOk()); + EXPECT_THAT(op->AppendManifest(m3), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + SnapshotCache snapshot_cache(snapshot.get()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, snapshot_cache.DataManifests(file_io_)); + EXPECT_EQ(data_manifests.size(), 3); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "3"); +} + +// ------------------------------------------------------------------------- +// Manifest merge data files only +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, ManifestMergeMergesIntoOne) { + // Set min-count-to-merge = 1 so every append triggers a merge. + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(std::string(TableProperties::kManifestMinMergeCount.key()), "1"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + // Snapshot 1: file_a_ + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + SnapshotCache snapshot_cache(snapshot.get()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, snapshot_cache.DataManifests(file_io_)); + EXPECT_EQ(data_manifests.size(), 1); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "2"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsReplaced), "1"); +} + +TEST_F(MergingSnapshotUpdateTest, ManifestMergeDoesNotMergeWhenBelowMinCount) { + // Default min-count-to-merge = 100, so manifests are not merged. + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + SnapshotCache snapshot_cache(snapshot.get()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, snapshot_cache.DataManifests(file_io_)); + EXPECT_EQ(data_manifests.size(), 2); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "2"); +} + +TEST_F(MergingSnapshotUpdateTest, ManifestMergeDoesNotMergeWhenSizeTargetTooSmall) { + // Set a tiny size target so manifests never merge. + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(std::string(TableProperties::kManifestTargetSizeBytes.key()), "10"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + SnapshotCache snapshot_cache(snapshot.get()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, snapshot_cache.DataManifests(file_io_)); + EXPECT_EQ(data_manifests.size(), 2); +} + +// ------------------------------------------------------------------------- +// Manifest count summary +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, SummaryManifestCountsOnFirstCommit) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsCreated), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsReplaced), "0"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsKept), "0"); +} + +TEST_F(MergingSnapshotUpdateTest, SummaryManifestCountsOnSecondCommitNoMerge) { + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + // 1 new manifest created, 1 existing manifest kept, 0 replaced. + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsCreated), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsReplaced), "0"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsKept), "1"); +} + +TEST_F(MergingSnapshotUpdateTest, SummaryManifestCountsAfterMerge) { + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(std::string(TableProperties::kManifestMinMergeCount.key()), "1"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + // 1 merged output created, 1 existing manifest replaced, 0 kept. + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsCreated), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsReplaced), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsKept), "0"); +} + +TEST_F(MergingSnapshotUpdateTest, SummaryManifestCountsAfterDelete) { + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(std::string(TableProperties::kManifestMinMergeCount.key()), "1"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->RemoveDataFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + // Filter rewrites 1 manifest (replaced), merge produces 1 output (created). + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsReplaced), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsCreated), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsKept), "0"); +} + +TEST_F(MergingSnapshotUpdateTest, MissingRequestedDeleteDoesNotAffectSummary) { + CommitFileA(); + + auto missing = MakeDataFile("/data/missing.parquet", 3L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->RemoveDataFile(missing), IsOk()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.count(SnapshotSummaryFields::kDeletedDataFiles), 0U); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "2"); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateAddedDataFilesFailsForTruncatedHistory) { + auto metadata = std::make_shared(); + metadata->format_version = 2; + metadata->location = table_location_; + metadata->current_schema_id = 0; + metadata->schemas.push_back(schema_); + + auto make_snapshot = [](int64_t snapshot_id, + std::optional parent_snapshot_id) { + return std::make_shared(Snapshot{ + .snapshot_id = snapshot_id, + .parent_snapshot_id = parent_snapshot_id, + .sequence_number = snapshot_id, + .timestamp_ms = TimePointMs{}, + .manifest_list = "", + .summary = {}, + .schema_id = 0, + }); + }; + + auto base_snapshot = make_snapshot(1, std::nullopt); + auto main_snapshot = make_snapshot(2, 1); + auto branch_snapshot = make_snapshot(3, 1); + metadata->snapshots = {base_snapshot, main_snapshot, branch_snapshot}; + + EXPECT_THAT(TestMergeAppend::ValidateAddedDataFilesForTest(*metadata, /*starting=*/2, + branch_snapshot, file_io_), + IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(MergingSnapshotUpdateTest, + ValidateAddedDataFilesWithNoStartingSnapshotFailsForTruncatedHistory) { + auto metadata = std::make_shared(); + metadata->format_version = 2; + metadata->location = table_location_; + metadata->current_schema_id = 0; + metadata->schemas.push_back(schema_); + + auto snapshot = std::make_shared(Snapshot{ + .snapshot_id = 2, + .parent_snapshot_id = 1, + .sequence_number = 2, + .timestamp_ms = TimePointMs{}, + .manifest_list = "", + .summary = {}, + .schema_id = 0, + }); + metadata->snapshots = {snapshot}; + + EXPECT_THAT(TestMergeAppend::ValidateAddedDataFilesForTest(*metadata, std::nullopt, + snapshot, file_io_), + IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateAddedDataFilesWithNoStartingSnapshotChecksAll) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + + EXPECT_THAT(TestMergeAppend::ValidateAddedDataFilesForTest( + *table_->metadata(), std::nullopt, snapshot, file_io_), + IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(TestMergeAppend::ValidateAddedDataFilesForTest( + *table_->metadata(), snapshot->snapshot_id, snapshot, file_io_), + IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateAddedDataFilesWithPartitionSetDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + ICEBERG_UNWRAP_OR_FAIL(auto fast_append, table_->NewFastAppend()); + fast_append->AppendFile(file_b_); + EXPECT_THAT(fast_append->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto second_snapshot, table_->current_snapshot()); + + PartitionSet partition_set; + ASSERT_TRUE(partition_set.add(spec_->spec_id(), file_b_->partition)); + EXPECT_THAT(TestMergeAppend::ValidateAddedDataFilesForTest( + *table_->metadata(), first_snapshot->snapshot_id, partition_set, + second_snapshot, file_io_), + IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateAddedDataFilesIgnoresOldEntrySnapshotId) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto metadata = std::make_shared(*table_->metadata()); + + constexpr int64_t kSecondSnapshotId = 123456; + auto manifest_path = table_location_ + "/metadata/old-entry-data.avro"; + ICEBERG_UNWRAP_OR_FAIL( + auto manifest, + WriteDataManifest(*metadata, manifest_path, {file_b_}, first_snapshot->snapshot_id, + first_snapshot->sequence_number)); + manifest.added_snapshot_id = kSecondSnapshotId; + manifest.sequence_number = first_snapshot->sequence_number + 1; + manifest.min_sequence_number = first_snapshot->sequence_number; + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kAppend, kSecondSnapshotId, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, {manifest})); + + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + EXPECT_THAT(TestMergeAppend::ValidateAddedDataFilesForTest( + *metadata, first_snapshot->snapshot_id, second_snapshot, file_io_), + IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateDataFilesExistUsesRowFilter) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->RemoveDataFile(file_a_), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + std::unordered_set required_files{file_a_->file_path}; + EXPECT_THAT(TestMergeAppend::ValidateDataFilesExistForTest( + *metadata, first_snapshot->snapshot_id, required_files, + /*skip_deletes=*/false, Expressions::Equal("x", Literal::Long(1L)), + second_snapshot, file_io_), + IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(TestMergeAppend::ValidateDataFilesExistForTest( + *metadata, first_snapshot->snapshot_id, required_files, + /*skip_deletes=*/false, Expressions::Equal("x", Literal::Long(2L)), + second_snapshot, file_io_), + IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, + ValidateNoNewDeletesForDataFilesWithFilterDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + DataFileSet replaced_files; + replaced_files.insert(file_a_); + EXPECT_THAT(TestMergeAppend::ValidateNoNewDeletesForDataFilesForTest( + *metadata, first_snapshot->snapshot_id, Expressions::AlwaysTrue(), + replaced_files, second_snapshot, file_io_), + IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNoNewDeletesForDataFilesDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + DataFileSet replaced_files; + replaced_files.insert(file_a_); + EXPECT_THAT(TestMergeAppend::ValidateNoNewDeletesForDataFilesForTest( + *metadata, first_snapshot->snapshot_id, replaced_files, second_snapshot, + file_io_), + IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(MergingSnapshotUpdateTest, + ValidateNoNewDeletesForDataFilesFailsOnPositionDeleteWhenIgnoringEqualityDeletes) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto del_file = MakeDeleteFile("/delete/pos_del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + DataFileSet replaced_files; + replaced_files.insert(file_a_); + EXPECT_THAT(TestMergeAppend::ValidateNoNewDeletesForDataFilesForTest( + *metadata, first_snapshot->snapshot_id, replaced_files, second_snapshot, + file_io_, /*ignore_equality_deletes=*/true), + IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(MergingSnapshotUpdateTest, + ValidateNoNewDeletesForDataFilesUsesConfiguredCaseSensitivity) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto overwrite, NewOverwriteUpdate()); + EXPECT_THAT(overwrite->AddDelete(del_file), IsOk()); + const int64_t second_snapshot_id = overwrite->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, + overwrite->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + DataFileSet replaced_files; + replaced_files.insert(file_a_); + ICEBERG_UNWRAP_OR_FAIL(auto validate, NewMergeAppend()); + validate->SetCaseSensitive(false); + EXPECT_THAT(validate->ValidateNoNewDeletesForDataFiles( + *metadata, first_snapshot->snapshot_id, + Expressions::Equal("X", Literal::Long(1L)), replaced_files, + second_snapshot, file_io_), + IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(MergingSnapshotUpdateTest, + ValidateNoNewDeletesForDataFilesWithFilterSkipsNonMatchingDeletes) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + DataFileSet replaced_files; + replaced_files.insert(file_a_); + EXPECT_THAT(TestMergeAppend::ValidateNoNewDeletesForDataFilesForTest( + *metadata, first_snapshot->snapshot_id, Expressions::AlwaysFalse(), + replaced_files, second_snapshot, file_io_), + IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNoNewDeleteFilesWithExpressionDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + EXPECT_THAT(TestMergeAppend::ValidateNoNewDeleteFilesForTest( + *metadata, first_snapshot->snapshot_id, Expressions::AlwaysTrue(), + second_snapshot, file_io_), + IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(MergingSnapshotUpdateTest, + ValidateNoNewDeleteFilesWithExpressionSkipsNonMatchingDeletes) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + EXPECT_THAT(TestMergeAppend::ValidateNoNewDeleteFilesForTest( + *metadata, first_snapshot->snapshot_id, Expressions::AlwaysFalse(), + second_snapshot, file_io_), + IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, + ValidateNoNewDeleteFilesWithPartitionSetDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + PartitionSet partition_set; + ASSERT_TRUE(partition_set.add(spec_->spec_id(), del_file->partition)); + EXPECT_THAT(TestMergeAppend::ValidateNoNewDeleteFilesForTest( + *metadata, first_snapshot->snapshot_id, partition_set, second_snapshot, + file_io_), + IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateAddedDVsDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->format_version = 3; + + auto dv_file = MakeDeleteFile("/delete/dv_a.puffin", 1L); + dv_file->file_format = FileFormatType::kPuffin; + dv_file->referenced_data_file = file_a_->file_path; + dv_file->content_offset = 0; + dv_file->content_size_in_bytes = 10; + + constexpr int64_t kSecondSnapshotId = 123456; + auto manifest_path = table_location_ + "/metadata/dv-conflict.avro"; + ICEBERG_UNWRAP_OR_FAIL( + auto manifest, + WriteDeleteManifest(*metadata, manifest_path, {dv_file}, kSecondSnapshotId, + first_snapshot->sequence_number + 1)); + manifest.added_snapshot_id = kSecondSnapshotId; + manifest.sequence_number = first_snapshot->sequence_number + 1; + manifest.min_sequence_number = first_snapshot->sequence_number + 1; + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, kSecondSnapshotId, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, {manifest})); + + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + const std::unordered_set referenced_data_files{file_a_->file_path}; + EXPECT_THAT(TestMergeAppend::ValidateAddedDVsForTest( + *metadata, first_snapshot->snapshot_id, Expressions::AlwaysTrue(), + referenced_data_files, second_snapshot, file_io_), + IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateAddedDVsIgnoresUnrelatedDVs) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->format_version = 3; + + auto dv_file = MakeDeleteFile("/delete/dv_a.puffin", 1L); + dv_file->file_format = FileFormatType::kPuffin; + dv_file->referenced_data_file = file_a_->file_path; + dv_file->content_offset = 0; + dv_file->content_size_in_bytes = 10; + + constexpr int64_t kSecondSnapshotId = 123456; + auto manifest_path = table_location_ + "/metadata/dv-unrelated.avro"; + ICEBERG_UNWRAP_OR_FAIL( + auto manifest, + WriteDeleteManifest(*metadata, manifest_path, {dv_file}, kSecondSnapshotId, + first_snapshot->sequence_number + 1)); + manifest.added_snapshot_id = kSecondSnapshotId; + manifest.sequence_number = first_snapshot->sequence_number + 1; + manifest.min_sequence_number = first_snapshot->sequence_number + 1; + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, kSecondSnapshotId, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, {manifest})); + + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + const std::unordered_set referenced_data_files{file_b_->file_path}; + EXPECT_THAT(TestMergeAppend::ValidateAddedDVsForTest( + *metadata, first_snapshot->snapshot_id, Expressions::AlwaysTrue(), + referenced_data_files, second_snapshot, file_io_), + IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateAddedDVsIgnoresOldEntrySnapshotId) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->format_version = 3; + + auto dv_file = MakeDeleteFile("/delete/dv_a.puffin", 1L); + dv_file->file_format = FileFormatType::kPuffin; + dv_file->referenced_data_file = file_a_->file_path; + dv_file->content_offset = 0; + dv_file->content_size_in_bytes = 10; + + constexpr int64_t kSecondSnapshotId = 123456; + auto manifest_path = table_location_ + "/metadata/old-entry-dv.avro"; + ICEBERG_UNWRAP_OR_FAIL( + auto manifest, + WriteDeleteManifest(*metadata, manifest_path, {dv_file}, + first_snapshot->snapshot_id, first_snapshot->sequence_number)); + manifest.added_snapshot_id = kSecondSnapshotId; + manifest.sequence_number = first_snapshot->sequence_number + 1; + manifest.min_sequence_number = first_snapshot->sequence_number; + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, kSecondSnapshotId, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, {manifest})); + + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + const std::unordered_set referenced_data_files{file_a_->file_path}; + EXPECT_THAT(TestMergeAppend::ValidateAddedDVsForTest( + *metadata, first_snapshot->snapshot_id, Expressions::AlwaysTrue(), + referenced_data_files, second_snapshot, file_io_), + IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateDeletedDataFilesWithExpressionDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->RemoveDataFile(file_a_), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + EXPECT_THAT(TestMergeAppend::ValidateDeletedDataFilesForTest( + *metadata, first_snapshot->snapshot_id, Expressions::AlwaysTrue(), + second_snapshot, file_io_), + IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(MergingSnapshotUpdateTest, + ValidateDeletedDataFilesWithPartitionSetDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->RemoveDataFile(file_a_), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + PartitionSet partition_set; + ASSERT_TRUE(partition_set.add(spec_->spec_id(), file_a_->partition)); + EXPECT_THAT(TestMergeAppend::ValidateDeletedDataFilesForTest( + *metadata, first_snapshot->snapshot_id, partition_set, second_snapshot, + file_io_), + IsError(ErrorKind::kValidationFailed)); +} + +// ------------------------------------------------------------------------- +// DataSpec multiple partition specs +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, DataSpecThrowsWithMultipleSpecs) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->DataSpec(), IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, DataSpecThrowsWhenEmpty) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->DataSpec(), IsError(ErrorKind::kInvalidArgument)); +} + +} // namespace iceberg diff --git a/src/iceberg/test/snapshot_util_test.cc b/src/iceberg/test/snapshot_util_test.cc index a47b403da..e94769e31 100644 --- a/src/iceberg/test/snapshot_util_test.cc +++ b/src/iceberg/test/snapshot_util_test.cc @@ -301,15 +301,38 @@ TEST_F(SnapshotUtilTest, SchemaForBranch) { ICEBERG_UNWRAP_OR_FAIL(auto initial_schema, table_->schema()); ASSERT_NE(initial_schema, nullptr); + auto branch_schema = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "id", int32()), + SchemaField::MakeRequired(2, "data", string()), + SchemaField::MakeOptional(3, "branch_only", string())}, + 1); + table_->metadata()->schemas.push_back(branch_schema); + ICEBERG_UNWRAP_OR_FAIL(auto branch_snapshot, table_->SnapshotById(branch_snapshot_id_)); + branch_snapshot->schema_id = branch_schema->schema_id(); + std::string branch = "b1"; ICEBERG_UNWRAP_OR_FAIL(auto schema, SnapshotUtil::SchemaFor(*table_, branch)); - // Branch should return current schema (not snapshot schema) + EXPECT_EQ(schema->schema_id(), initial_schema->schema_id()); EXPECT_EQ(schema->fields().size(), initial_schema->fields().size()); + + ICEBERG_UNWRAP_OR_FAIL(auto metadata_schema, + SnapshotUtil::SchemaFor(*table_->metadata(), branch)); + EXPECT_EQ(metadata_schema->schema_id(), initial_schema->schema_id()); + EXPECT_EQ(metadata_schema->fields().size(), initial_schema->fields().size()); } TEST_F(SnapshotUtilTest, SchemaForTag) { // Create a tag pointing to base snapshot auto metadata = table_->metadata(); + auto tag_schema = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "id", int32()), + SchemaField::MakeRequired(2, "data", string()), + SchemaField::MakeOptional(3, "tag_only", string())}, + 1); + metadata->schemas.push_back(tag_schema); + ICEBERG_UNWRAP_OR_FAIL(auto base_snapshot, table_->SnapshotById(base_snapshot_id_)); + base_snapshot->schema_id = tag_schema->schema_id(); + std::string tag = "tag1"; metadata->refs[tag] = std::make_shared( SnapshotRef{.snapshot_id = base_snapshot_id_, .retention = SnapshotRef::Tag{}}); @@ -318,9 +341,14 @@ TEST_F(SnapshotUtilTest, SchemaForTag) { ASSERT_NE(initial_schema, nullptr); ICEBERG_UNWRAP_OR_FAIL(auto schema, SnapshotUtil::SchemaFor(*table_, tag)); - // Tag should return the schema of the snapshot it points to - // Since base snapshot has schema_id = 0, it should return the same schema - EXPECT_EQ(schema->fields().size(), initial_schema->fields().size()); + EXPECT_EQ(schema->schema_id(), tag_schema->schema_id()); + EXPECT_EQ(schema->fields().size(), tag_schema->fields().size()); + EXPECT_NE(schema->fields().size(), initial_schema->fields().size()); + + ICEBERG_UNWRAP_OR_FAIL(auto metadata_schema, + SnapshotUtil::SchemaFor(*table_->metadata(), tag)); + EXPECT_EQ(metadata_schema->schema_id(), tag_schema->schema_id()); + EXPECT_EQ(metadata_schema->fields().size(), tag_schema->fields().size()); } TEST_F(SnapshotUtilTest, SnapshotAfter) { diff --git a/src/iceberg/update/fast_append.cc b/src/iceberg/update/fast_append.cc index d08f497cf..24f8e744f 100644 --- a/src/iceberg/update/fast_append.cc +++ b/src/iceberg/update/fast_append.cc @@ -131,7 +131,7 @@ std::unordered_map FastAppend::Summary() { return summary_.Build(); } -void FastAppend::CleanUncommitted(const std::unordered_set& committed) { +Status FastAppend::CleanUncommitted(const std::unordered_set& committed) { // Clean up new manifests that were written but not committed if (!new_manifests_.empty()) { for (const auto& manifest : new_manifests_) { @@ -152,6 +152,7 @@ void FastAppend::CleanUncommitted(const std::unordered_set& committ } } } + return {}; } bool FastAppend::CleanupAfterCommit() const { diff --git a/src/iceberg/update/fast_append.h b/src/iceberg/update/fast_append.h index 580fa4722..a04786c88 100644 --- a/src/iceberg/update/fast_append.h +++ b/src/iceberg/update/fast_append.h @@ -72,7 +72,7 @@ class ICEBERG_EXPORT FastAppend : public SnapshotUpdate { const TableMetadata& metadata_to_update, const std::shared_ptr& snapshot) override; std::unordered_map Summary() override; - void CleanUncommitted(const std::unordered_set& committed) override; + Status CleanUncommitted(const std::unordered_set& committed) override; bool CleanupAfterCommit() const override; private: diff --git a/src/iceberg/update/merging_snapshot_update.cc b/src/iceberg/update/merging_snapshot_update.cc new file mode 100644 index 000000000..a0d882d14 --- /dev/null +++ b/src/iceberg/update/merging_snapshot_update.cc @@ -0,0 +1,1325 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/update/merging_snapshot_update.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "iceberg/constants.h" +#include "iceberg/delete_file_index.h" +#include "iceberg/expression/expressions.h" +#include "iceberg/expression/manifest_evaluator.h" +#include "iceberg/expression/projections.h" +#include "iceberg/manifest/manifest_entry.h" +#include "iceberg/manifest/manifest_group.h" +#include "iceberg/manifest/manifest_list.h" +#include "iceberg/manifest/manifest_reader.h" +#include "iceberg/manifest/manifest_util_internal.h" +#include "iceberg/manifest/manifest_writer.h" +#include "iceberg/partition_spec.h" +#include "iceberg/schema.h" +#include "iceberg/snapshot.h" +#include "iceberg/table.h" +#include "iceberg/table_metadata.h" +#include "iceberg/table_properties.h" +#include "iceberg/transaction.h" +#include "iceberg/util/content_file_util.h" +#include "iceberg/util/macros.h" +#include "iceberg/util/snapshot_util_internal.h" + +namespace iceberg { + +namespace { + +const std::array kValidateAddedFilesOperations = { + DataOperation::kAppend, DataOperation::kOverwrite}; +const std::array kValidateDataFilesExistOperations = { + DataOperation::kOverwrite, DataOperation::kReplace, DataOperation::kDelete}; +const std::array kValidateDataFilesExistSkipDeleteOperations = { + DataOperation::kOverwrite, DataOperation::kReplace}; +const std::array kValidateAddedDeleteFilesOperations = { + DataOperation::kOverwrite, DataOperation::kDelete}; +const std::array kValidateAddedDVsOperations = { + DataOperation::kOverwrite, DataOperation::kDelete, DataOperation::kReplace}; + +bool MatchesOperation(std::optional operation, + std::span expected) { + return operation.has_value() && + std::ranges::find(expected, operation.value()) != expected.end(); +} + +struct ValidationHistoryResult { + std::vector manifests; + std::unordered_set snapshot_ids; +}; + +Result>> ValidationAncestorsBetween( + const TableMetadata& metadata, int64_t latest_snapshot_id, + std::optional starting_snapshot_id) { + ICEBERG_ASSIGN_OR_RAISE( + auto ancestors, + SnapshotUtil::AncestorsBetween(metadata, latest_snapshot_id, starting_snapshot_id)); + if (!starting_snapshot_id.has_value()) { + if (!ancestors.empty()) { + const auto& oldest_checked = ancestors.back(); + if (oldest_checked == nullptr || oldest_checked->parent_snapshot_id.has_value()) { + return ValidationFailed( + "Cannot validate history: cannot determine complete history for snapshot {}", + latest_snapshot_id); + } + } + return ancestors; + } + + if (latest_snapshot_id == starting_snapshot_id.value()) { + return ancestors; + } + if (ancestors.empty()) { + return ValidationFailed( + "Cannot validate history: starting snapshot {} is not an ancestor " + "of snapshot {}", + starting_snapshot_id.value(), latest_snapshot_id); + } + + const auto& oldest_checked = ancestors.back(); + if (oldest_checked == nullptr || !oldest_checked->parent_snapshot_id.has_value() || + oldest_checked->parent_snapshot_id.value() != starting_snapshot_id.value()) { + return ValidationFailed( + "Cannot validate history: starting snapshot {} is not an ancestor " + "of snapshot {}", + starting_snapshot_id.value(), latest_snapshot_id); + } + return ancestors; +} + +Result ValidationHistory( + const TableMetadata& metadata, int64_t latest_snapshot_id, + std::optional starting_snapshot_id, + std::span matching_operations, ManifestContent content, + const std::shared_ptr& io) { + ICEBERG_ASSIGN_OR_RAISE( + auto ancestors, + ValidationAncestorsBetween(metadata, latest_snapshot_id, starting_snapshot_id)); + + ValidationHistoryResult result; + for (const auto& snapshot : ancestors) { + if (!MatchesOperation(snapshot->Operation(), matching_operations)) { + continue; + } + + result.snapshot_ids.insert(snapshot->snapshot_id); + auto cached = SnapshotCache(snapshot.get()); + ICEBERG_ASSIGN_OR_RAISE(auto manifests, content == ManifestContent::kData + ? cached.DataManifests(io) + : cached.DeleteManifests(io)); + for (const auto& manifest : manifests) { + if (manifest.added_snapshot_id == snapshot->snapshot_id) { + result.manifests.push_back(manifest); + } + } + } + + return result; +} + +Result>> PartitionSpecsByIdMap( + const TableMetadata& metadata) { + TableMetadataCache metadata_cache(&metadata); + ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, metadata_cache.GetPartitionSpecsById()); + return std::unordered_map>( + specs_ref.get().begin(), specs_ref.get().end()); +} + +Result> MakeValidationManifestGroup( + const TableMetadata& metadata, const std::shared_ptr& io, + std::vector manifests) { + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto specs_by_id, PartitionSpecsByIdMap(metadata)); + return ManifestGroup::Make(io, std::move(schema), std::move(specs_by_id), + std::move(manifests)); +} + +Result StartingSequenceNumber(const TableMetadata& metadata, + std::optional starting_snapshot_id) { + if (starting_snapshot_id.has_value()) { + auto snapshot = metadata.SnapshotById(starting_snapshot_id.value()); + if (snapshot.has_value()) { + return snapshot.value()->sequence_number; + } + } + return TableMetadata::kInitialSequenceNumber; +} + +Result> BuildDeleteFileIndex( + const TableMetadata& metadata, const std::shared_ptr& io, + std::vector delete_manifests, int64_t starting_sequence_number, + std::shared_ptr data_filter, std::shared_ptr partition_set, + bool case_sensitive) { + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto specs_by_id, PartitionSpecsByIdMap(metadata)); + ICEBERG_ASSIGN_OR_RAISE(auto builder, DeleteFileIndex::BuilderFor( + io, std::move(schema), std::move(specs_by_id), + std::move(delete_manifests))); + builder.AfterSequenceNumber(starting_sequence_number); + builder.CaseSensitive(case_sensitive); + if (data_filter != nullptr) { + builder.DataFilter(std::move(data_filter)); + } + if (partition_set != nullptr) { + builder.FilterPartitions(std::move(partition_set)); + } + return builder.Build(); +} + +Result> FilterManifestsByPartition( + const TableMetadata& metadata, std::shared_ptr conflict_detection_filter, + const std::vector& manifests, bool case_sensitive) { + if (conflict_detection_filter == nullptr || + conflict_detection_filter->op() == Expression::Operation::kTrue) { + return manifests; + } + + const int32_t default_spec_id = metadata.default_spec_id; + if (std::ranges::any_of(manifests, [default_spec_id](const ManifestFile& manifest) { + return manifest.partition_spec_id != default_spec_id; + })) { + return manifests; + } + + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto specs_by_id, PartitionSpecsByIdMap(metadata)); + std::unordered_map> eval_cache; + std::vector matching_manifests; + for (const auto& manifest : manifests) { + auto it = eval_cache.find(manifest.partition_spec_id); + if (it == eval_cache.end()) { + auto spec_it = specs_by_id.find(manifest.partition_spec_id); + if (spec_it == specs_by_id.end()) { + return InvalidArgument("Cannot find partition spec ID {}", + manifest.partition_spec_id); + } + + auto projector = Projections::Inclusive(*spec_it->second, *schema, case_sensitive); + ICEBERG_ASSIGN_OR_RAISE(auto partition_filter, + projector->Project(conflict_detection_filter)); + ICEBERG_ASSIGN_OR_RAISE( + auto evaluator, + ManifestEvaluator::MakePartitionFilter( + std::move(partition_filter), spec_it->second, *schema, case_sensitive)); + it = eval_cache.emplace(manifest.partition_spec_id, std::move(evaluator)).first; + } + + ICEBERG_ASSIGN_OR_RAISE(auto matches, it->second->Evaluate(manifest)); + if (matches) { + matching_manifests.push_back(manifest); + } + } + return matching_manifests; +} + +void FilterManifestEntriesByPartitionSet(ManifestGroup& group, + const PartitionSet* partition_set) { + if (partition_set != nullptr) { + auto partitions = std::make_shared(*partition_set); + group.FilterManifestEntries([partitions](const ManifestEntry& entry) { + return entry.data_file != nullptr && + entry.data_file->partition_spec_id.has_value() && + partitions->contains(entry.data_file->partition_spec_id.value(), + entry.data_file->partition); + }); + } +} + +Result> MatchingAddedDataFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + std::shared_ptr data_filter, const PartitionSet* partition_set, + const std::shared_ptr& parent, const std::shared_ptr& io, + bool case_sensitive) { + if (parent == nullptr) { + return std::vector{}; + } + + ICEBERG_ASSIGN_OR_RAISE( + auto history, + ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + kValidateAddedFilesOperations, ManifestContent::kData, io)); + auto new_snapshots = + std::make_shared>(std::move(history.snapshot_ids)); + ICEBERG_ASSIGN_OR_RAISE(auto group, MakeValidationManifestGroup( + metadata, io, std::move(history.manifests))); + group->CaseSensitive(case_sensitive) + .FilterManifestEntries([new_snapshots](const ManifestEntry& entry) { + return entry.snapshot_id.has_value() && + new_snapshots->contains(entry.snapshot_id.value()) && + entry.data_file != nullptr; + }) + .IgnoreDeleted() + .IgnoreExisting(); + if (data_filter != nullptr) { + group->FilterData(std::move(data_filter)); + } + FilterManifestEntriesByPartitionSet(*group, partition_set); + return group->Entries(); +} + +Result> MatchingDeletedDataFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + std::shared_ptr data_filter, const PartitionSet* partition_set, + const std::shared_ptr& parent, const std::shared_ptr& io, + bool case_sensitive) { + if (parent == nullptr) { + return std::vector{}; + } + + ICEBERG_ASSIGN_OR_RAISE( + auto history, + ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + kValidateDataFilesExistOperations, ManifestContent::kData, io)); + auto new_snapshots = + std::make_shared>(std::move(history.snapshot_ids)); + ICEBERG_ASSIGN_OR_RAISE(auto group, MakeValidationManifestGroup( + metadata, io, std::move(history.manifests))); + group->CaseSensitive(case_sensitive) + .FilterManifestEntries([new_snapshots](const ManifestEntry& entry) { + return entry.snapshot_id.has_value() && + new_snapshots->contains(entry.snapshot_id.value()); + }) + .FilterManifestEntries([](const ManifestEntry& entry) { + return entry.status == ManifestStatus::kDeleted && entry.data_file != nullptr; + }) + .IgnoreExisting(); + if (data_filter != nullptr) { + group->FilterData(std::move(data_filter)); + } + FilterManifestEntriesByPartitionSet(*group, partition_set); + return group->Entries(); +} + +std::string FormatLocations(std::vector locations) { + std::string result = "["; + for (size_t i = 0; i < locations.size(); ++i) { + if (i > 0) { + result += ", "; + } + result += locations[i]; + } + result += "]"; + return result; +} + +std::vector DataFilePaths(const std::vector& entries) { + std::vector paths; + paths.reserve(entries.size()); + for (const auto& entry : entries) { + if (entry.data_file != nullptr) { + paths.push_back(entry.data_file->file_path); + } + } + return paths; +} + +std::optional DataFileLocations(const std::vector& entries) { + auto paths = DataFilePaths(entries); + if (paths.empty()) { + return std::optional{}; + } + return FormatLocations(std::move(paths)); +} + +std::optional DeleteFileLocations( + const std::vector>& delete_files) { + std::vector paths; + paths.reserve(delete_files.size()); + for (const auto& delete_file : delete_files) { + if (delete_file != nullptr) { + paths.push_back(delete_file->file_path); + } + } + if (paths.empty()) { + return std::optional{}; + } + return FormatLocations(std::move(paths)); +} + +Status ValidateAddedDVsInManifest( + const TableMetadata& metadata, const ManifestFile& manifest, + std::shared_ptr conflict_detection_filter, + const std::unordered_set& new_snapshot_ids, + const std::unordered_set& referenced_data_files, + const std::shared_ptr& io, const std::shared_ptr& schema, + bool case_sensitive) { + ICEBERG_ASSIGN_OR_RAISE(auto spec, + metadata.PartitionSpecById(manifest.partition_spec_id)); + ICEBERG_ASSIGN_OR_RAISE(auto reader, ManifestReader::Make(manifest, io, schema, spec)); + reader->CaseSensitive(case_sensitive); + reader->FilterRows(std::move(conflict_detection_filter)); + ICEBERG_ASSIGN_OR_RAISE(auto entries, reader->LiveEntries()); + + for (const auto& entry : entries) { + if (!entry.snapshot_id.has_value() || + !new_snapshot_ids.contains(entry.snapshot_id.value())) { + continue; + } + if (entry.data_file == nullptr || !ContentFileUtil::IsDV(*entry.data_file) || + !entry.data_file->referenced_data_file.has_value()) { + continue; + } + if (referenced_data_files.contains(*entry.data_file->referenced_data_file)) { + return ValidationFailed("Found concurrently added DV for {}: {}", + *entry.data_file->referenced_data_file, + ContentFileUtil::DVDesc(*entry.data_file)); + } + } + return {}; +} + +} // namespace + +MergingSnapshotUpdate::MergingSnapshotUpdate(std::string table_name, + std::shared_ptr ctx) + : SnapshotUpdate(std::move(ctx)), + table_name_(std::move(table_name)), + delete_expression_(Expressions::AlwaysFalse()) { + auto file_io = ctx_->table->io(); + auto data_filter_manager = ManifestFilterManager::Make( + ManifestContent::kData, file_io, + [this](const std::string& location) { return DeleteFile(location); }); + if (!data_filter_manager.has_value()) { + AddError(data_filter_manager.error()); + } else { + data_filter_manager_ = std::move(data_filter_manager.value()); + } + + auto delete_filter_manager = ManifestFilterManager::Make( + ManifestContent::kDeletes, file_io, + [this](const std::string& location) { return DeleteFile(location); }); + if (!delete_filter_manager.has_value()) { + AddError(delete_filter_manager.error()); + } else { + delete_filter_manager_ = std::move(delete_filter_manager.value()); + } + + const int64_t target_size_bytes = + base().properties.Get(TableProperties::kManifestTargetSizeBytes); + const int32_t min_count_to_merge = + base().properties.Get(TableProperties::kManifestMinMergeCount); + const bool merge_enabled = + base().properties.Get(TableProperties::kManifestMergeEnabled); + auto data_merge_manager = ManifestMergeManager::Make( + ManifestContent::kData, target_size_bytes, min_count_to_merge, merge_enabled, + file_io, [this] { return SnapshotId(); }, + [this](const std::string& location) { return DeleteFile(location); }); + if (!data_merge_manager.has_value()) { + AddError(data_merge_manager.error()); + } else { + data_merge_manager_ = std::move(data_merge_manager.value()); + } + + auto delete_merge_manager = ManifestMergeManager::Make( + ManifestContent::kDeletes, target_size_bytes, min_count_to_merge, merge_enabled, + file_io, [this] { return SnapshotId(); }, + [this](const std::string& location) { return DeleteFile(location); }); + if (!delete_merge_manager.has_value()) { + AddError(delete_merge_manager.error()); + } else { + delete_merge_manager_ = std::move(delete_merge_manager.value()); + } +} + +// ------------------------------------------------------------------------- +// Primitive API +// ------------------------------------------------------------------------- + +Status MergingSnapshotUpdate::AddDataFile(std::shared_ptr file) { + if (!file) { + return InvalidArgument("Cannot add a null data file"); + } + if (!file->partition_spec_id.has_value()) { + return InvalidArgument("Data file must have a partition spec ID"); + } + + int32_t spec_id = file->partition_spec_id.value(); + ICEBERG_ASSIGN_OR_RAISE(auto spec, base().PartitionSpecById(spec_id)); + + // Suppress first_row_id in the staged copy. The commit assigns row IDs for newly + // added files and must not mutate the caller-owned file object. + auto staged_file = std::make_shared(*file); + staged_file->first_row_id = std::nullopt; + + auto& data_files = new_data_files_by_spec_[spec_id]; + auto [it, inserted] = data_files.insert(staged_file); + if (inserted) { + has_new_data_files_ = true; + ICEBERG_RETURN_UNEXPECTED(added_data_files_summary_.AddedFile(*spec, *staged_file)); + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateNewDeleteFile(const TableMetadata& metadata, + const DataFile& file) { + if (file.content == DataFile::Content::kData) { + return InvalidArgument("Expected a delete file but got a data file: {}", + file.file_path); + } + const int8_t format_version = metadata.format_version; + const bool is_dv = ContentFileUtil::IsDV(file); + switch (format_version) { + case 1: + return InvalidArgument("Deletes are supported in V2 and above"); + case 2: + // Position deletes must NOT be DVs in v2. + if (file.content == DataFile::Content::kPositionDeletes && is_dv) { + return InvalidArgument("Must not use DVs for position deletes in V2: {}", + file.file_path); + } + break; + default: + if (format_version >= 3 && + format_version <= TableMetadata::kSupportedTableFormatVersion) { + // Position deletes MUST be DVs in v3+. + if (file.content == DataFile::Content::kPositionDeletes && !is_dv) { + return InvalidArgument("Must use DVs for position deletes in V{}: {}", + format_version, file.file_path); + } + } else { + return InvalidArgument("Unsupported format version: {}", format_version); + } + break; + } + return {}; +} + +Status MergingSnapshotUpdate::AddDeleteFile(std::shared_ptr file) { + return AddDeleteFile(std::move(file), std::nullopt); +} + +Status MergingSnapshotUpdate::AddDeleteFile(std::shared_ptr file, + int64_t data_sequence_number) { + return AddDeleteFile(std::move(file), std::optional(data_sequence_number)); +} + +void MergingSnapshotUpdate::PendingDeleteFilesByReferencedFile::Add( + std::string referenced_file, PendingDeleteFile file) { + auto [iter, inserted] = + index_by_referenced_file_.try_emplace(referenced_file, entries_.size()); + if (inserted) { + entries_.push_back(Entry{.referenced_file = std::move(referenced_file), .files = {}}); + } + entries_[iter->second].files.push_back(std::move(file)); +} + +Status MergingSnapshotUpdate::AddDeleteFile(std::shared_ptr file, + std::optional data_sequence_number) { + if (!file) { + return InvalidArgument("Cannot add a null delete file"); + } + ICEBERG_RETURN_UNEXPECTED(ValidateNewDeleteFile(base(), *file)); + if (!file->partition_spec_id.has_value()) { + return InvalidArgument("Delete file must have a partition spec ID"); + } + ICEBERG_RETURN_UNEXPECTED(base().PartitionSpecById(file->partition_spec_id.value())); + has_new_delete_files_ = true; + PendingDeleteFile pending_file{.file = std::move(file), + .data_sequence_number = std::move(data_sequence_number)}; + if (ContentFileUtil::IsDV(*pending_file.file)) { + ICEBERG_PRECHECK(pending_file.file->referenced_data_file.has_value(), + "DV must have a referenced data file: {}", + pending_file.file->file_path); + auto referenced_data_file = *pending_file.file->referenced_data_file; + dvs_by_referenced_file_.Add(std::move(referenced_data_file), std::move(pending_file)); + } else { + v2_deletes_.push_back(std::move(pending_file)); + } + return {}; +} + +Status MergingSnapshotUpdate::DeleteDataFile(std::shared_ptr file) { + if (!file) { + return InvalidArgument("Cannot delete a null data file"); + } + return data_filter_manager_->DeleteFile(std::move(file)); +} + +Status MergingSnapshotUpdate::DeleteDeleteFile(std::shared_ptr file) { + if (!file) { + return InvalidArgument("Cannot delete a null delete file"); + } + return delete_filter_manager_->DeleteFile(std::move(file)); +} + +Status MergingSnapshotUpdate::DeleteByPath(std::string_view path) { + return data_filter_manager_->DeleteFile(path); +} + +Status MergingSnapshotUpdate::DeleteByRowFilter(std::shared_ptr expr) { + // If a delete file matches the row filter, it can also be removed because the rows + // it references will also be deleted. Both filter managers receive the expression. + delete_expression_ = expr; + ICEBERG_RETURN_UNEXPECTED(data_filter_manager_->DeleteByRowFilter(expr)); + return delete_filter_manager_->DeleteByRowFilter(std::move(expr)); +} + +Status MergingSnapshotUpdate::DropPartition(int32_t spec_id, PartitionValues partition) { + // Dropping data in a partition also drops all delete files in that partition. + ICEBERG_RETURN_UNEXPECTED(data_filter_manager_->DropPartition(spec_id, partition)); + ICEBERG_RETURN_UNEXPECTED( + delete_filter_manager_->DropPartition(spec_id, std::move(partition))); + return {}; +} + +void MergingSnapshotUpdate::FailMissingDeletePaths() { + data_filter_manager_->FailMissingDeletePaths(); + delete_filter_manager_->FailMissingDeletePaths(); +} + +void MergingSnapshotUpdate::FailAnyDelete() { + data_filter_manager_->FailAnyDelete(); + delete_filter_manager_->FailAnyDelete(); +} + +void MergingSnapshotUpdate::SetNewDataFilesDataSequenceNumber(int64_t sequence_number) { + new_data_files_data_seq_number_ = sequence_number; +} + +void MergingSnapshotUpdate::CaseSensitive(bool case_sensitive) { + case_sensitive_ = case_sensitive; + data_filter_manager_->CaseSensitive(case_sensitive); + delete_filter_manager_->CaseSensitive(case_sensitive); +} + +void MergingSnapshotUpdate::SetSummaryProperty(const std::string& property, + const std::string& value) { + custom_summary_properties_[property] = value; + SnapshotUpdate::SetSummaryProperty(property, value); +} + +Result> MergingSnapshotUpdate::DataSpec() const { + if (new_data_files_by_spec_.empty()) { + return InvalidArgument("DataSpec() called before any data file was added"); + } + if (new_data_files_by_spec_.size() > 1) { + return InvalidArgument( + "DataSpec() requires exactly one partition spec; got {} different specs", + new_data_files_by_spec_.size()); + } + return base().PartitionSpecById(new_data_files_by_spec_.begin()->first); +} + +std::vector> MergingSnapshotUpdate::AddedDataFiles() const { + std::vector> result; + for (const auto& [spec_id, files] : new_data_files_by_spec_) { + for (const auto& file : files) { + result.push_back(file); + } + } + return result; +} + +Status MergingSnapshotUpdate::AddManifest(ManifestFile manifest) { + if (manifest.content != ManifestContent::kData) { + return InvalidArgument("Cannot append delete manifest: {}", manifest.manifest_path); + } + if (can_inherit_snapshot_id() && manifest.added_snapshot_id == kInvalidSnapshotId) { + if (manifest.first_row_id.has_value()) { + return InvalidArgument("Cannot append manifest with assigned first row ID: {}", + manifest.manifest_path); + } + appended_manifests_summary_.AddedManifest(manifest); + append_manifests_.push_back(std::move(manifest)); + } else { + ICEBERG_ASSIGN_OR_RAISE(auto copied, CopyManifest(manifest)); + rewritten_append_manifests_.push_back(std::move(copied)); + } + return {}; +} + +Result MergingSnapshotUpdate::CopyManifest(const ManifestFile& manifest) { + const TableMetadata& current = base(); + ICEBERG_ASSIGN_OR_RAISE(auto schema, SnapshotUtil::SchemaFor(current, target_branch())); + ICEBERG_ASSIGN_OR_RAISE(auto spec, + current.PartitionSpecById(manifest.partition_spec_id)); + std::string path = ManifestPath(); + return CopyAppendManifest(manifest, ctx_->table->io(), schema, spec, SnapshotId(), path, + current.format_version, &appended_manifests_summary_); +} + +// ------------------------------------------------------------------------- +// State queries +// ------------------------------------------------------------------------- + +bool MergingSnapshotUpdate::AddsDataFiles() const { + return !new_data_files_by_spec_.empty(); +} + +bool MergingSnapshotUpdate::AddsDeleteFiles() const { + return !v2_deletes_.empty() || !dvs_by_referenced_file_.empty(); +} + +bool MergingSnapshotUpdate::DeletesDataFiles() const { + return data_filter_manager_->ContainsDeletes(); +} + +bool MergingSnapshotUpdate::DeletesDeleteFiles() const { + return delete_filter_manager_->ContainsDeletes(); +} + +Status MergingSnapshotUpdate::ManagersReady() const { + ICEBERG_CHECK(data_filter_manager_ != nullptr, + "Data filter manager is not initialized"); + ICEBERG_CHECK(delete_filter_manager_ != nullptr, + "Delete filter manager is not initialized"); + ICEBERG_CHECK(data_merge_manager_ != nullptr, "Data merge manager is not initialized"); + ICEBERG_CHECK(delete_merge_manager_ != nullptr, + "Delete merge manager is not initialized"); + return {}; +} + +// ------------------------------------------------------------------------- +// Apply pipeline +// ------------------------------------------------------------------------- + +ManifestWriterFactory MergingSnapshotUpdate::MakeWriterFactory( + const std::shared_ptr& schema) { + return + [this, schema](int32_t spec_id, + ManifestContent content) -> Result> { + const TableMetadata& meta = base(); + ICEBERG_ASSIGN_OR_RAISE(auto spec, meta.PartitionSpecById(spec_id)); + std::string path = ManifestPath(); + return ManifestWriter::MakeWriter(meta.format_version, SnapshotId(), + std::move(path), ctx_->table->io(), + std::move(spec), schema, content); + }; +} + +Result> MergingSnapshotUpdate::WriteNewDataManifests() { + // If new files were staged after the cache was populated (commit retry), invalidate. + if (has_new_data_files_ && !cached_new_data_manifests_.empty()) { + for (const auto& m : cached_new_data_manifests_) { + std::ignore = DeleteFile(m.manifest_path); + } + cached_new_data_manifests_.clear(); + } + + if (!cached_new_data_manifests_.empty()) { + return cached_new_data_manifests_; + } + + std::vector result; + for (const auto& [spec_id, data_files] : new_data_files_by_spec_) { + ICEBERG_ASSIGN_OR_RAISE(auto spec, base().PartitionSpecById(spec_id)); + ICEBERG_ASSIGN_OR_RAISE( + auto written, + WriteDataManifests(data_files.as_span(), spec, new_data_files_data_seq_number_)); + result.insert(result.end(), std::make_move_iterator(written.begin()), + std::make_move_iterator(written.end())); + } + + cached_new_data_manifests_ = result; + has_new_data_files_ = false; + return result; +} + +Result> +MergingSnapshotUpdate::MergeDVs() const { + std::vector result; + result.reserve(dvs_by_referenced_file_.size()); + + for (const auto& entry : dvs_by_referenced_file_.entries()) { + const auto& referenced_file = entry.referenced_file; + const auto& dvs = entry.files; + if (dvs.empty()) { + continue; + } + if (dvs.size() > 1) { + // TODO(Guotao): Merge duplicate DVs for one referenced data file once C++ + // has DVUtil/Puffin DV rewriting; Java merges them before writing manifests. + return NotImplemented( + "Merging multiple deletion vectors is not supported yet for referenced " + "data file: {}", + referenced_file); + } + + result.push_back(dvs.front()); + } + + return result; +} + +Result> MergingSnapshotUpdate::WriteNewDeleteManifests() { + // If new files were staged after the cache was populated (commit retry), invalidate. + if (has_new_delete_files_ && !cached_new_delete_manifests_.empty()) { + for (const auto& m : cached_new_delete_manifests_) { + std::ignore = DeleteFile(m.manifest_path); + } + cached_new_delete_manifests_.clear(); + added_delete_files_summary_.Clear(); + } + + if (!cached_new_delete_manifests_.empty()) { + return cached_new_delete_manifests_; + } + + ICEBERG_ASSIGN_OR_RAISE(auto merged_dvs, MergeDVs()); + + std::vector new_delete_files; + new_delete_files.reserve(merged_dvs.size() + v2_deletes_.size()); + new_delete_files.insert(new_delete_files.end(), merged_dvs.begin(), merged_dvs.end()); + + DeleteFileSet v2_delete_set; + for (const auto& pending_file : v2_deletes_) { + if (v2_delete_set.insert(pending_file.file).second) { + new_delete_files.push_back(pending_file); + } + } + + // Group delete files by partition spec ID, mirroring Java newDeleteFilesAsManifests(). + std::unordered_map> delete_files_by_spec; + for (const auto& pending_file : new_delete_files) { + delete_files_by_spec[pending_file.file->partition_spec_id.value()].push_back( + pending_file); + } + + std::vector result; + added_delete_files_summary_.Clear(); + for (auto& [spec_id, delete_files] : delete_files_by_spec) { + ICEBERG_ASSIGN_OR_RAISE(auto spec, base().PartitionSpecById(spec_id)); + std::vector delete_entries; + delete_entries.reserve(delete_files.size()); + for (const auto& pending_file : delete_files) { + ICEBERG_RETURN_UNEXPECTED( + added_delete_files_summary_.AddedFile(*spec, *pending_file.file)); + delete_entries.push_back(ContentFileWithSequenceNumber{ + .file = pending_file.file, + .data_sequence_number = pending_file.data_sequence_number, + }); + } + ICEBERG_ASSIGN_OR_RAISE(auto written, WriteDeleteManifests(delete_entries, spec)); + result.insert(result.end(), std::make_move_iterator(written.begin()), + std::make_move_iterator(written.end())); + } + + cached_new_delete_manifests_ = result; + has_new_delete_files_ = false; + return result; +} + +Result> MergingSnapshotUpdate::Apply( + const TableMetadata& metadata_to_update, const std::shared_ptr& snapshot) { + ICEBERG_RETURN_UNEXPECTED(ManagersReady()); + + // Re-validate buffered delete files against the current format version. A format + // upgrade between staging and commit could make previously-valid files invalid. + for (const auto& pending_file : v2_deletes_) { + ICEBERG_RETURN_UNEXPECTED( + ValidateNewDeleteFile(metadata_to_update, *pending_file.file)); + } + for (const auto& entry : dvs_by_referenced_file_.entries()) { + for (const auto& pending_file : entry.files) { + ICEBERG_RETURN_UNEXPECTED( + ValidateNewDeleteFile(metadata_to_update, *pending_file.file)); + } + } + + ICEBERG_ASSIGN_OR_RAISE(auto target_schema, + SnapshotUtil::SchemaFor(metadata_to_update, target_branch())); + auto writer_factory = MakeWriterFactory(target_schema); + + // Step 1: Filter data manifests. + ICEBERG_ASSIGN_OR_RAISE(auto filtered_data, data_filter_manager_->FilterManifests( + target_schema, metadata_to_update, + snapshot, writer_factory)); + + // Step 2: Compute min data sequence number; set up delete filter cleanup. + // Skip unassigned manifests written in this Apply() call. + int64_t min_data_seq = metadata_to_update.last_sequence_number; + for (const auto& manifest : filtered_data) { + if (manifest.min_sequence_number != kUnassignedSequenceNumber) { + min_data_seq = std::min(min_data_seq, manifest.min_sequence_number); + } + } + ICEBERG_RETURN_UNEXPECTED( + delete_filter_manager_->DropDeleteFilesOlderThan(min_data_seq)); + delete_filter_manager_->RemoveDanglingDeletesFor( + data_filter_manager_->FilesToBeDeleted()); + + // Step 3: Filter delete manifests. + ICEBERG_ASSIGN_OR_RAISE(auto filtered_deletes, delete_filter_manager_->FilterManifests( + target_schema, metadata_to_update, + snapshot, writer_factory)); + + TableMetadataCache metadata_cache(&metadata_to_update); + ICEBERG_ASSIGN_OR_RAISE(auto specs_by_id, metadata_cache.GetPartitionSpecsById()); + ICEBERG_ASSIGN_OR_RAISE( + auto data_filter_summary, + data_filter_manager_->BuildSummary(filtered_data, specs_by_id.get())); + ICEBERG_ASSIGN_OR_RAISE( + auto delete_filter_summary, + delete_filter_manager_->BuildSummary(filtered_deletes, specs_by_id.get())); + + // Drop manifests with no live files - they carry no data and should not be merged + // into the new snapshot. Manifests written by the current snapshot are always kept + // regardless of live-file counts; the merge stage handles any that are empty. + int64_t snapshot_id = SnapshotId(); + auto should_keep = [snapshot_id](const ManifestFile& m) { + return m.has_added_files() || m.has_existing_files() || + m.added_snapshot_id == snapshot_id; + }; + + // Step 4: Write (or retrieve cached) new data manifests. + ICEBERG_ASSIGN_OR_RAISE(auto written_data_manifests, WriteNewDataManifests()); + + // Incorporate append manifests (from AddManifest), stamping each with the + // current snapshot ID. append_manifests_ are used directly (inherit path); + // rewritten_append_manifests_ were already copied with the snapshot ID. + std::vector new_data_manifests = std::move(written_data_manifests); + for (const auto& src : append_manifests_) { + ManifestFile m = src; + m.added_snapshot_id = snapshot_id; + new_data_manifests.push_back(std::move(m)); + } + for (const auto& src : rewritten_append_manifests_) { + ManifestFile m = src; + m.added_snapshot_id = snapshot_id; + new_data_manifests.push_back(std::move(m)); + } + + // Step 5: Write (or retrieve cached) new delete manifests. + ICEBERG_ASSIGN_OR_RAISE(auto new_delete_manifests, WriteNewDeleteManifests()); + + std::erase_if(new_data_manifests, + [&](const ManifestFile& m) { return !should_keep(m); }); + std::erase_if(filtered_data, [&](const ManifestFile& m) { return !should_keep(m); }); + std::erase_if(new_delete_manifests, + [&](const ManifestFile& m) { return !should_keep(m); }); + std::erase_if(filtered_deletes, [&](const ManifestFile& m) { return !should_keep(m); }); + + // Rebuild summary from stable sub-builders so that commit retries don't double-count. + summary_builder().Clear(); + summary_builder().Merge(added_data_files_summary_); + summary_builder().Merge(added_delete_files_summary_); + summary_builder().Merge(appended_manifests_summary_); + for (const auto& [property, value] : custom_summary_properties_) { + summary_builder().Set(property, value); + } + summary_builder().Merge(data_filter_summary); + summary_builder().Merge(delete_filter_summary); + + // Step 6: Merge data manifests. + ICEBERG_ASSIGN_OR_RAISE(auto merged_data, data_merge_manager_->MergeManifests( + filtered_data, new_data_manifests, + metadata_to_update, writer_factory)); + + // Step 7: Merge delete manifests. + ICEBERG_ASSIGN_OR_RAISE(auto merged_deletes, delete_merge_manager_->MergeManifests( + filtered_deletes, new_delete_manifests, + metadata_to_update, writer_factory)); + + std::vector result; + result.reserve(merged_data.size() + merged_deletes.size()); + result.insert(result.end(), std::make_move_iterator(merged_data.begin()), + std::make_move_iterator(merged_data.end())); + result.insert(result.end(), std::make_move_iterator(merged_deletes.begin()), + std::make_move_iterator(merged_deletes.end())); + + // Manifest count summary: unassigned manifests count as neither created nor kept. + int32_t manifests_created = 0; + int32_t manifests_kept = 0; + for (const auto& m : result) { + if (m.added_snapshot_id == snapshot_id) { + ++manifests_created; + } else if (m.added_snapshot_id != kInvalidSnapshotId) { + ++manifests_kept; + } + } + int32_t replaced_manifests_count = data_filter_manager_->ReplacedManifestsCount() + + delete_filter_manager_->ReplacedManifestsCount() + + data_merge_manager_->ReplacedManifestsCount() + + delete_merge_manager_->ReplacedManifestsCount(); + summary_builder().Set(SnapshotSummaryFields::kManifestsCreated, + std::to_string(manifests_created)); + summary_builder().Set(SnapshotSummaryFields::kManifestsKept, + std::to_string(manifests_kept)); + summary_builder().Set(SnapshotSummaryFields::kManifestsReplaced, + std::to_string(replaced_manifests_count)); + + return result; +} + +Status MergingSnapshotUpdate::CleanUncommitted( + const std::unordered_set& committed) { + ICEBERG_RETURN_UNEXPECTED(ManagersReady()); + ICEBERG_RETURN_UNEXPECTED(data_merge_manager_->CleanUncommitted(committed)); + ICEBERG_RETURN_UNEXPECTED(data_filter_manager_->CleanUncommitted(committed)); + ICEBERG_RETURN_UNEXPECTED(delete_merge_manager_->CleanUncommitted(committed)); + ICEBERG_RETURN_UNEXPECTED(delete_filter_manager_->CleanUncommitted(committed)); + ICEBERG_RETURN_UNEXPECTED(CleanUncommittedAppends(committed)); + return {}; +} + +Status MergingSnapshotUpdate::CleanUncommittedAppends( + const std::unordered_set& committed) { + ICEBERG_RETURN_UNEXPECTED( + DeleteUncommitted(cached_new_data_manifests_, committed, /*clear=*/true)); + ICEBERG_RETURN_UNEXPECTED( + DeleteUncommitted(cached_new_delete_manifests_, committed, /*clear=*/true)); + // rewritten_append_manifests_ are always owned by the table. + ICEBERG_RETURN_UNEXPECTED( + DeleteUncommitted(rewritten_append_manifests_, committed, /*clear=*/false)); + + // append_manifests_ are only owned by the table if the commit succeeded. + if (!committed.empty()) { + ICEBERG_RETURN_UNEXPECTED( + DeleteUncommitted(append_manifests_, committed, /*clear=*/false)); + } + + has_new_data_files_ = false; + has_new_delete_files_ = false; + return {}; +} + +Status MergingSnapshotUpdate::DeleteUncommitted( + std::vector& manifests, + const std::unordered_set& committed, bool clear) { + for (const auto& manifest : manifests) { + if (!committed.contains(manifest.manifest_path)) { + std::ignore = DeleteFile(manifest.manifest_path); + } + } + if (clear) { + manifests.clear(); + } + return {}; +} + +std::unordered_map MergingSnapshotUpdate::Summary() { + summary_builder().SetPartitionSummaryLimit( + base().properties.Get(TableProperties::kWritePartitionSummaryLimit)); + return summary_builder().Build(); +} + +// ------------------------------------------------------------------------- +// Conflict-detection helpers +// ------------------------------------------------------------------------- + +Status MergingSnapshotUpdate::ValidateAddedDataFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + std::shared_ptr data_filter, const std::shared_ptr& parent, + std::shared_ptr io, bool case_sensitive) { + ICEBERG_ASSIGN_OR_RAISE( + auto conflict_entries, + MatchingAddedDataFiles(metadata, starting_snapshot_id, data_filter, + /*partition_set=*/nullptr, parent, io, case_sensitive)); + auto conflict_paths = DataFileLocations(conflict_entries); + if (conflict_paths.has_value()) { + return ValidationFailed( + "Found conflicting files that can contain records matching {}: {}", + data_filter != nullptr ? data_filter->ToString() : "any expression", + conflict_paths.value()); + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateDataFilesExist( + const TableMetadata& metadata, std::optional starting_snapshot_id, + const std::unordered_set& file_paths, bool skip_deletes, + std::shared_ptr conflict_detection_filter, + const std::shared_ptr& parent, std::shared_ptr io, + bool /*case_sensitive*/) { + if (parent == nullptr || file_paths.empty()) { + return {}; + } + + std::span matching_operations = + skip_deletes + ? std::span(kValidateDataFilesExistSkipDeleteOperations) + : std::span(kValidateDataFilesExistOperations); + ICEBERG_ASSIGN_OR_RAISE( + auto history, ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + matching_operations, ManifestContent::kData, io)); + + ICEBERG_ASSIGN_OR_RAISE(auto group, MakeValidationManifestGroup( + metadata, io, std::move(history.manifests))); + group->IgnoreExisting(); + group->FilterManifestEntries([&history, &file_paths](const ManifestEntry& entry) { + return entry.status != ManifestStatus::kAdded && entry.snapshot_id.has_value() && + history.snapshot_ids.contains(entry.snapshot_id.value()) && + entry.data_file != nullptr && file_paths.contains(entry.data_file->file_path); + }); + if (conflict_detection_filter != nullptr) { + group->FilterData(std::move(conflict_detection_filter)); + } + + ICEBERG_ASSIGN_OR_RAISE(auto entries, group->Entries()); + auto deleted_paths = DataFileLocations(entries); + if (deleted_paths.has_value()) { + return ValidationFailed("Cannot commit, missing data files: {}", + deleted_paths.value()); + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + const DataFileSet& replaced_files, const std::shared_ptr& parent, + std::shared_ptr io, bool ignore_equality_deletes) { + if (parent == nullptr || replaced_files.empty() || metadata.format_version < 2) { + return {}; + } + + ICEBERG_ASSIGN_OR_RAISE(auto deletes, + AddedDeleteFiles(metadata, starting_snapshot_id, + /*data_filter=*/nullptr, + /*partition_set=*/nullptr, parent, io)); + + ICEBERG_ASSIGN_OR_RAISE(auto starting_sequence_number, + StartingSequenceNumber(metadata, starting_snapshot_id)); + + for (const auto& data_file : replaced_files) { + ICEBERG_ASSIGN_OR_RAISE(auto delete_files, + deletes->ForDataFile(starting_sequence_number, *data_file)); + if (ignore_equality_deletes) { + // Only fail on position deletes — equality deletes at higher sequence numbers + // still apply to the rewritten files and are not a conflict. + for (const auto& df : delete_files) { + if (df->content == DataFile::Content::kPositionDeletes) { + return ValidationFailed( + "Cannot commit, found new position delete for replaced data file: {}", + data_file->file_path); + } + } + } else { + if (!delete_files.empty()) { + return ValidationFailed( + "Cannot commit, found new delete for replaced data file: {}", + data_file->file_path); + } + } + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateAddedDataFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + const PartitionSet& partition_set, const std::shared_ptr& parent, + std::shared_ptr io) { + ICEBERG_ASSIGN_OR_RAISE( + auto conflict_entries, + MatchingAddedDataFiles(metadata, starting_snapshot_id, + /*data_filter=*/nullptr, &partition_set, parent, io, + /*case_sensitive=*/true)); + auto conflict_paths = DataFileLocations(conflict_entries); + if (conflict_paths.has_value()) { + return ValidationFailed( + "Found conflicting files that can contain records matching validated " + "partitions: {}", + conflict_paths.value()); + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + std::shared_ptr data_filter, const DataFileSet& replaced_files, + const std::shared_ptr& parent, std::shared_ptr io, + bool case_sensitive) { + if (parent == nullptr || replaced_files.empty() || metadata.format_version < 2) { + return {}; + } + + ICEBERG_ASSIGN_OR_RAISE( + auto deletes, + AddedDeleteFiles(metadata, starting_snapshot_id, std::move(data_filter), + /*partition_set=*/nullptr, parent, io, case_sensitive)); + + ICEBERG_ASSIGN_OR_RAISE(auto starting_sequence_number, + StartingSequenceNumber(metadata, starting_snapshot_id)); + + for (const auto& data_file : replaced_files) { + ICEBERG_ASSIGN_OR_RAISE(auto delete_files, + deletes->ForDataFile(starting_sequence_number, *data_file)); + if (!delete_files.empty()) { + return ValidationFailed( + "Cannot commit, found new delete for replaced data file: {}", + data_file->file_path); + } + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateNoNewDeleteFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + std::shared_ptr data_filter, const std::shared_ptr& parent, + std::shared_ptr io, bool case_sensitive) { + std::string data_filter_text = + data_filter != nullptr ? data_filter->ToString() : "any expression"; + ICEBERG_ASSIGN_OR_RAISE( + auto deletes, + AddedDeleteFiles(metadata, starting_snapshot_id, std::move(data_filter), + /*partition_set=*/nullptr, parent, io, case_sensitive)); + auto referenced_delete_files = deletes->ReferencedDeleteFiles(); + auto delete_paths = DeleteFileLocations(referenced_delete_files); + if (delete_paths.has_value()) { + return ValidationFailed( + "Found new conflicting delete files that can apply to records matching {}: {}", + data_filter_text, delete_paths.value()); + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateNoNewDeleteFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + const PartitionSet& partition_set, const std::shared_ptr& parent, + std::shared_ptr io) { + ICEBERG_ASSIGN_OR_RAISE( + auto deletes, + AddedDeleteFiles(metadata, starting_snapshot_id, + /*data_filter=*/nullptr, + std::make_shared(partition_set), parent, io)); + auto referenced_delete_files = deletes->ReferencedDeleteFiles(); + auto delete_paths = DeleteFileLocations(referenced_delete_files); + if (delete_paths.has_value()) { + return ValidationFailed( + "Found new conflicting delete files that can apply to records matching " + "validated partitions: {}", + delete_paths.value()); + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateDeletedDataFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + std::shared_ptr data_filter, const std::shared_ptr& parent, + std::shared_ptr io, bool case_sensitive) { + ICEBERG_ASSIGN_OR_RAISE( + auto conflict_entries, + MatchingDeletedDataFiles(metadata, starting_snapshot_id, data_filter, + /*partition_set=*/nullptr, parent, io, case_sensitive)); + auto conflict_paths = DataFileLocations(conflict_entries); + if (conflict_paths.has_value()) { + return ValidationFailed( + "Found conflicting deleted files that can contain records matching {}: {}", + data_filter != nullptr ? data_filter->ToString() : "any expression", + conflict_paths.value()); + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateDeletedDataFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + const PartitionSet& partition_set, const std::shared_ptr& parent, + std::shared_ptr io) { + ICEBERG_ASSIGN_OR_RAISE( + auto conflict_entries, + MatchingDeletedDataFiles(metadata, starting_snapshot_id, + /*data_filter=*/nullptr, &partition_set, parent, io, + /*case_sensitive=*/true)); + auto conflict_paths = DataFileLocations(conflict_entries); + if (conflict_paths.has_value()) { + return ValidationFailed( + "Found conflicting deleted files that can apply to records matching " + "validated partitions: {}", + conflict_paths.value()); + } + return {}; +} + +Result> MergingSnapshotUpdate::AddedDeleteFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + std::shared_ptr data_filter, std::shared_ptr partition_set, + const std::shared_ptr& parent, std::shared_ptr io, + bool case_sensitive) { + if (parent == nullptr || metadata.format_version < 2) { + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto specs_by_id, PartitionSpecsByIdMap(metadata)); + ICEBERG_ASSIGN_OR_RAISE( + auto builder, + DeleteFileIndex::BuilderFor(io, std::move(schema), std::move(specs_by_id), + /*delete_manifests=*/{})); + return builder.Build(); + } + + ICEBERG_ASSIGN_OR_RAISE( + auto history, ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + kValidateAddedDeleteFilesOperations, + ManifestContent::kDeletes, io)); + + ICEBERG_ASSIGN_OR_RAISE(auto starting_sequence_number, + StartingSequenceNumber(metadata, starting_snapshot_id)); + return BuildDeleteFileIndex(metadata, io, std::move(history.manifests), + starting_sequence_number, std::move(data_filter), + std::move(partition_set), case_sensitive); +} + +Status MergingSnapshotUpdate::ValidateAddedDVs( + const TableMetadata& metadata, std::optional starting_snapshot_id, + std::shared_ptr conflict_detection_filter, + const std::unordered_set& referenced_data_files, + const std::shared_ptr& parent, std::shared_ptr io, + bool case_sensitive) { + if (parent == nullptr || referenced_data_files.empty()) { + return {}; + } + + ICEBERG_ASSIGN_OR_RAISE( + auto history, + ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + kValidateAddedDVsOperations, ManifestContent::kDeletes, io)); + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + + ICEBERG_ASSIGN_OR_RAISE(auto matching_manifests, + FilterManifestsByPartition(metadata, conflict_detection_filter, + history.manifests, case_sensitive)); + for (const auto& manifest : matching_manifests) { + if (!manifest.has_added_files()) { + continue; + } + ICEBERG_RETURN_UNEXPECTED(ValidateAddedDVsInManifest( + metadata, manifest, conflict_detection_filter, history.snapshot_ids, + referenced_data_files, io, schema, case_sensitive)); + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateAddedDVs( + const TableMetadata& metadata, std::optional starting_snapshot_id, + std::shared_ptr conflict_filter, const std::shared_ptr& parent, + std::shared_ptr io) const { + if (parent == nullptr) { + return {}; + } + + std::unordered_set referenced_data_files; + for (const auto& entry : dvs_by_referenced_file_.entries()) { + referenced_data_files.insert(entry.referenced_file); + } + if (referenced_data_files.empty()) { + return {}; + } + return ValidateAddedDVs(metadata, starting_snapshot_id, std::move(conflict_filter), + referenced_data_files, parent, std::move(io), case_sensitive_); +} + +} // namespace iceberg diff --git a/src/iceberg/update/merging_snapshot_update.h b/src/iceberg/update/merging_snapshot_update.h new file mode 100644 index 000000000..5d4e128e9 --- /dev/null +++ b/src/iceberg/update/merging_snapshot_update.h @@ -0,0 +1,385 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/update/merging_snapshot_update.h + +#include +#include +#include +#include +#include +#include +#include + +#include "iceberg/delete_file_index.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/manifest/manifest_filter_manager.h" +#include "iceberg/manifest/manifest_merge_manager.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" +#include "iceberg/update/snapshot_update.h" +#include "iceberg/util/data_file_set.h" + +namespace iceberg { + +/// \brief Abstract base class for all merge-based snapshot write operations. +/// +/// Provides the complete filter → write → merge pipeline that all merge-based +/// operations (MergeAppend, OverwriteFiles, RowDelta, ReplacePartitions, +/// RewriteFiles) share. Subclasses only need to implement `operation()` and +/// call the protected primitive API to describe what changes to make. +/// +/// The Apply() pipeline: +/// 1. Filter data manifests (via data_filter_manager_) +/// 2. Compute min data sequence number and set up delete filter cleanup +/// 3. Filter delete manifests (via delete_filter_manager_) +/// 4. Write new data manifests (cached for commit retry) +/// 5. Write new delete manifests (cached for commit retry) +/// 6. Merge data manifests (via data_merge_manager_) +/// 7. Merge delete manifests (via delete_merge_manager_) +/// +/// TODO(Guotao): Java MergingSnapshotProducer overrides updateEvent() to return a +/// CreateSnapshotEvent(tableName, operation, snapshotId, sequenceNumber, summary) +/// for commit listeners. The C++ update framework does not yet have an event +/// notification mechanism, so this is intentionally not implemented here. Add it +/// once an equivalent CreateSnapshotEvent / listener facility exists. +class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { + public: + ~MergingSnapshotUpdate() override = default; + + // SnapshotUpdate overrides + Result> Apply( + const TableMetadata& metadata_to_update, + const std::shared_ptr& snapshot) override; + + Status CleanUncommitted(const std::unordered_set& committed) override; + + std::unordered_map Summary() override; + + protected: + /// \brief Constructor; reads merge configuration from table properties. + explicit MergingSnapshotUpdate(std::string table_name, + std::shared_ptr ctx); + + /// \brief Stage a data file to be added to the table. + Status AddDataFile(std::shared_ptr file); + + /// \brief Stage a delete file to be added to the table. + Status AddDeleteFile(std::shared_ptr file); + + /// \brief Validate a delete file against the table format version rules. + /// + /// - Format v1: deletes are not supported. + /// - Format v2: position deletes must NOT be deletion vectors (DVs). + /// - Format v3+: position deletes MUST be deletion vectors (DVs). + Status ValidateNewDeleteFile(const TableMetadata& metadata, const DataFile& file); + + /// \brief Stage a delete file with an explicit data sequence number. + /// + Status AddDeleteFile(std::shared_ptr file, int64_t data_sequence_number); + + /// \brief Add all files in a pre-existing data manifest to the new snapshot. + /// + /// The manifest must contain DATA content. If snapshot ID inheritance is + /// enabled and the manifest has no snapshot ID assigned, it is used directly; + /// otherwise it is copied with the current snapshot ID. + Status AddManifest(ManifestFile manifest); + + /// \brief Register a data file (by object) to be deleted from the table. + Status DeleteDataFile(std::shared_ptr file); + + /// \brief Register a delete file (by object) to be removed from the table. + Status DeleteDeleteFile(std::shared_ptr file); + + /// \brief Register a data file path to be deleted from the table. + /// + /// \note Only applies to data files. To remove delete files, use DeleteDeleteFile(). + Status DeleteByPath(std::string_view path); + + /// \brief Register an expression to delete matching rows. + /// + /// Both data and delete filter managers receive the expression: delete files that + /// match the row filter can also be removed because those rows will be deleted. + Status DeleteByRowFilter(std::shared_ptr expr); + + /// \brief Register a partition to be dropped. + /// + /// Both data and delete filter managers receive the partition drop, since dropping + /// data in a partition also drops all delete files in that partition. + Status DropPartition(int32_t spec_id, PartitionValues partition); + + /// \brief Fail if any registered delete path is not found in any manifest. + void FailMissingDeletePaths(); + + /// \brief Fail if any manifest entry matches a delete condition. + void FailAnyDelete(); + + /// \brief Override the data sequence number assigned to all newly-added data files. + void SetNewDataFilesDataSequenceNumber(int64_t sequence_number); + + /// \brief Set case sensitivity for row filter and expression evaluation. + void CaseSensitive(bool case_sensitive); + + /// \brief Returns true if case-sensitive matching is enabled (default: true). + bool IsCaseSensitive() const { return case_sensitive_; } + + /// \brief Returns true if any data files have been staged for addition. + bool AddsDataFiles() const; + + /// \brief Returns true if any delete files have been staged for addition. + bool AddsDeleteFiles() const; + + /// \brief Returns true if any data files have been registered for deletion. + bool DeletesDataFiles() const; + + /// \brief Returns true if any delete files have been registered for removal. + bool DeletesDeleteFiles() const; + + /// \brief Returns the row-filter expression set via DeleteByRowFilter, or nullptr. + const std::shared_ptr& RowFilter() const { return delete_expression_; } + + /// \brief Returns the single partition spec for all staged data files. + /// + /// Precondition: exactly one partition spec ID must be represented among staged + /// data files. + Result> DataSpec() const; + + /// \brief Returns all data files staged for addition. + std::vector> AddedDataFiles() const; + + /// \brief Return an error if any snapshot after starting_snapshot_id, or from + /// the beginning if unset, added a data file matching the given filter expression. + static Status ValidateAddedDataFiles(const TableMetadata& metadata, + std::optional starting_snapshot_id, + std::shared_ptr filter, + const std::shared_ptr& parent, + std::shared_ptr io, + bool case_sensitive = true); + + /// \brief Return an error if any snapshot after starting_snapshot_id, or from + /// the beginning if unset, added a data file in any partition of the given partition + /// set. + static Status ValidateAddedDataFiles(const TableMetadata& metadata, + std::optional starting_snapshot_id, + const PartitionSet& partition_set, + const std::shared_ptr& parent, + std::shared_ptr io); + + /// \brief Return an error if any snapshot after starting_snapshot_id, or from + /// the beginning if unset, removed a file whose path is in file_paths (and + /// skip_deletes is false). + static Status ValidateDataFilesExist( + const TableMetadata& metadata, std::optional starting_snapshot_id, + const std::unordered_set& file_paths, bool skip_deletes, + std::shared_ptr filter, const std::shared_ptr& parent, + std::shared_ptr io, bool case_sensitive = true); + + /// \brief Return an error if any snapshot after starting_snapshot_id, or from + /// the beginning if unset, added a delete file that covers a file in replaced_files. + /// + /// Whether equality deletes are checked is derived automatically from whether + /// a custom data sequence number was set via SetNewDataFilesDataSequenceNumber(): + /// if set, equality deletes are ignored because they still apply to the rewritten + /// files and are not a conflict. + /// + /// Subclasses should prefer this overload over the static one. + Status ValidateNoNewDeletesForDataFiles(const TableMetadata& metadata, + std::optional starting_snapshot_id, + const DataFileSet& replaced_files, + const std::shared_ptr& parent, + std::shared_ptr io) const { + const bool ignore_equality_deletes = new_data_files_data_seq_number_.has_value(); + return ValidateNoNewDeletesForDataFiles(metadata, starting_snapshot_id, + replaced_files, parent, io, + ignore_equality_deletes); + } + + /// \brief Return an error if any snapshot after starting_snapshot_id, or from + /// the beginning if unset, added a delete file that covers a file in replaced_files. + /// + /// \param ignore_equality_deletes If true, only position deletes are checked. + /// Set to true when replaced data files have the same sequence number as the + /// new files (e.g. RewriteFiles), so equality deletes at higher sequence numbers + /// still apply and are not a conflict. + static Status ValidateNoNewDeletesForDataFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + const DataFileSet& replaced_files, const std::shared_ptr& parent, + std::shared_ptr io, bool ignore_equality_deletes = false); + + /// \brief Return an error if any snapshot after starting_snapshot_id, or from + /// the beginning if unset, added a delete file matching the data filter that covers a + /// file in replaced_files. + static Status ValidateNoNewDeletesForDataFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + std::shared_ptr data_filter, const DataFileSet& replaced_files, + const std::shared_ptr& parent, std::shared_ptr io, + bool case_sensitive = true); + + /// \brief Return an error if any snapshot after starting_snapshot_id, or from + /// the beginning if unset, added a delete file matching the given row filter. + /// + static Status ValidateNoNewDeleteFiles(const TableMetadata& metadata, + std::optional starting_snapshot_id, + std::shared_ptr data_filter, + const std::shared_ptr& parent, + std::shared_ptr io, + bool case_sensitive = true); + + /// \brief Return an error if any snapshot after starting_snapshot_id, or from + /// the beginning if unset, added a delete file matching any partition in the given + /// partition set. + static Status ValidateNoNewDeleteFiles(const TableMetadata& metadata, + std::optional starting_snapshot_id, + const PartitionSet& partition_set, + const std::shared_ptr& parent, + std::shared_ptr io); + + /// \brief Return an error if any snapshot after starting_snapshot_id, or from + /// the beginning if unset, deleted a data file matching the given row filter. + static Status ValidateDeletedDataFiles(const TableMetadata& metadata, + std::optional starting_snapshot_id, + std::shared_ptr data_filter, + const std::shared_ptr& parent, + std::shared_ptr io, + bool case_sensitive = true); + + /// \brief Return an error if any snapshot after starting_snapshot_id, or from + /// the beginning if unset, deleted a data file in any partition of the given partition + /// set. + static Status ValidateDeletedDataFiles(const TableMetadata& metadata, + std::optional starting_snapshot_id, + const PartitionSet& partition_set, + const std::shared_ptr& parent, + std::shared_ptr io); + + /// \brief Build a DeleteFileIndex of delete files added since starting_snapshot_id. + static Result> AddedDeleteFiles( + const TableMetadata& metadata, std::optional starting_snapshot_id, + std::shared_ptr data_filter, + std::shared_ptr partition_set, + const std::shared_ptr& parent, std::shared_ptr io, + bool case_sensitive = true); + + /// \brief Return an error if any snapshot after starting_snapshot_id, or from + /// the beginning if unset, added a deletion vector that conflicts with DVs being + /// written. + static Status ValidateAddedDVs( + const TableMetadata& metadata, std::optional starting_snapshot_id, + std::shared_ptr conflict_filter, + const std::unordered_set& referenced_data_files, + const std::shared_ptr& parent, std::shared_ptr io, + bool case_sensitive = true); + + private: + struct PendingDeleteFile { + std::shared_ptr file; + std::optional data_sequence_number; + }; + + /// \brief Ordered map from referenced data file path to pending DVs. + /// + /// Mirrors Java's LinkedHashMap-backed dvsByReferencedFile: lookup is by + /// referenced data file, and iteration preserves the first-seen key order. + struct PendingDeleteFilesByReferencedFile { + struct Entry { + std::string referenced_file; + std::vector files; + }; + + void Add(std::string referenced_file, PendingDeleteFile file); + bool empty() const { return entries_.empty(); } + size_t size() const { return entries_.size(); } + const std::vector& entries() const { return entries_; } + + private: + std::vector entries_; + std::unordered_map index_by_referenced_file_; + }; + + ManifestWriterFactory MakeWriterFactory(const std::shared_ptr& schema); + + /// \brief Copy a manifest with the current snapshot ID, for use when snapshot + /// ID inheritance is not possible. + Result CopyManifest(const ManifestFile& manifest); + + Status AddDeleteFile(std::shared_ptr file, + std::optional data_sequence_number); + + Status ValidateAddedDVs(const TableMetadata& metadata, + std::optional starting_snapshot_id, + std::shared_ptr conflict_filter, + const std::shared_ptr& parent, + std::shared_ptr io) const; + + Status ManagersReady() const; + + void SetSummaryProperty(const std::string& property, const std::string& value) override; + + Result> MergeDVs() const; + + /// \brief Write new data manifests for staged data files; caches the result. + Result> WriteNewDataManifests(); + + /// \brief Write new delete manifests for staged delete files; caches the result. + Result> WriteNewDeleteManifests(); + + Status CleanUncommittedAppends(const std::unordered_set& committed); + + Status DeleteUncommitted(std::vector& manifests, + const std::unordered_set& committed, bool clear); + + // Used for commit event notifications and diagnostic log messages. + std::string table_name_; + std::shared_ptr delete_expression_; + bool case_sensitive_ = true; + + // Stable sub-builders for added files — accumulated across retries and merged + // into summary_builder_ at the start of each Apply() call. + SnapshotSummaryBuilder added_data_files_summary_; + SnapshotSummaryBuilder added_delete_files_summary_; + SnapshotSummaryBuilder appended_manifests_summary_; + std::unordered_map custom_summary_properties_; + + std::unique_ptr data_filter_manager_; + std::unique_ptr delete_filter_manager_; + std::unique_ptr data_merge_manager_; + std::unique_ptr delete_merge_manager_; + + std::unordered_map new_data_files_by_spec_; + std::vector v2_deletes_; + PendingDeleteFilesByReferencedFile dvs_by_referenced_file_; + std::optional new_data_files_data_seq_number_; + + // Manifests passed via AddManifest(): inherit path (no copy needed) and + // rewrite path (must be copied with the current snapshot ID). + std::vector append_manifests_; + std::vector rewritten_append_manifests_; + + // Set to true when new files are staged after the cache was populated, so the + // cache is invalidated and re-written on the next Apply() call (commit retry). + bool has_new_data_files_ = false; + bool has_new_delete_files_ = false; + + std::vector cached_new_data_manifests_; + std::vector cached_new_delete_manifests_; +}; + +} // namespace iceberg diff --git a/src/iceberg/update/meson.build b/src/iceberg/update/meson.build index 6acb007a1..6405f603f 100644 --- a/src/iceberg/update/meson.build +++ b/src/iceberg/update/meson.build @@ -19,6 +19,7 @@ install_headers( [ 'expire_snapshots.h', 'fast_append.h', + 'merging_snapshot_update.h', 'pending_update.h', 'set_snapshot.h', 'snapshot_manager.h', diff --git a/src/iceberg/update/snapshot_update.cc b/src/iceberg/update/snapshot_update.cc index a59ebdc72..bb5376fa8 100644 --- a/src/iceberg/update/snapshot_update.cc +++ b/src/iceberg/update/snapshot_update.cc @@ -41,28 +41,34 @@ namespace iceberg { namespace { -// The Java impl skips updating total if parsing fails. Here we choose to be strict. Status UpdateTotal(std::unordered_map& summary, const std::unordered_map& previous_summary, const std::string& total_property, const std::string& added_property, const std::string& deleted_property) { auto total_it = previous_summary.find(total_property); if (total_it != previous_summary.end()) { - ICEBERG_ASSIGN_OR_RAISE(auto new_total, - StringUtils::ParseNumber(total_it->second)); + auto parsed_total = StringUtils::ParseNumber(total_it->second); + if (!parsed_total.has_value()) { + return {}; + } + int64_t new_total = parsed_total.value(); auto added_it = summary.find(added_property); if (new_total >= 0 && added_it != summary.end()) { - ICEBERG_ASSIGN_OR_RAISE(auto added_value, - StringUtils::ParseNumber(added_it->second)); - new_total += added_value; + auto parsed_added = StringUtils::ParseNumber(added_it->second); + if (!parsed_added.has_value()) { + return {}; + } + new_total += parsed_added.value(); } auto deleted_it = summary.find(deleted_property); if (new_total >= 0 && deleted_it != summary.end()) { - ICEBERG_ASSIGN_OR_RAISE(auto deleted_value, - StringUtils::ParseNumber(deleted_it->second)); - new_total -= deleted_value; + auto parsed_deleted = StringUtils::ParseNumber(deleted_it->second); + if (!parsed_deleted.has_value()) { + return {}; + } + new_total -= parsed_deleted.value(); } if (new_total >= 0) { @@ -163,6 +169,11 @@ SnapshotUpdate::SnapshotUpdate(std::shared_ptr ctx) target_manifest_size_bytes_( base().properties.Get(TableProperties::kManifestTargetSizeBytes)) {} +void SnapshotUpdate::SetSummaryProperty(const std::string& property, + const std::string& value) { + summary_.Set(property, value); +} + // TODO(xxx): write manifests in parallel Result> SnapshotUpdate::WriteDataManifests( std::span> files, @@ -178,8 +189,7 @@ Result> SnapshotUpdate::WriteDataManifests( snapshot_id = SnapshotId()]() -> Result> { return ManifestWriter::MakeWriter( base().format_version, snapshot_id, ManifestPath(), ctx_->table->io(), - std::move(spec), std::move(schema), ManifestContent::kData, - /*first_row_id=*/base().next_row_id); + std::move(spec), std::move(schema), ManifestContent::kData); }, target_manifest_size_bytes_); @@ -192,7 +202,7 @@ Result> SnapshotUpdate::WriteDataManifests( // TODO(xxx): write manifests in parallel Result> SnapshotUpdate::WriteDeleteManifests( - std::span> files, + std::span files, const std::shared_ptr& spec) { if (files.empty()) { return std::vector{}; @@ -208,10 +218,9 @@ Result> SnapshotUpdate::WriteDeleteManifests( }, target_manifest_size_bytes_); - for (const auto& file : files) { - // FIXME: Java impl wrap it with `PendingDeleteFile` and deals with - // file->data_sequence_number - ICEBERG_RETURN_UNEXPECTED(rolling_writer.WriteAddedEntry(file)); + for (const auto& entry : files) { + ICEBERG_RETURN_UNEXPECTED( + rolling_writer.WriteAddedEntry(entry.file, entry.data_sequence_number)); } ICEBERG_RETURN_UNEXPECTED(rolling_writer.Close()); return rolling_writer.ToManifestFiles(); @@ -232,7 +241,7 @@ Result SnapshotUpdate::Apply() { std::ignore = DeleteFile(manifest_list); } manifest_lists_.clear(); - CleanUncommitted(std::unordered_set{}); + ICEBERG_RETURN_UNEXPECTED(CleanUncommitted(std::unordered_set{})); staged_snapshot_ = nullptr; summary_.Clear(); @@ -245,9 +254,7 @@ Result SnapshotUpdate::Apply() { std::optional parent_snapshot_id = parent_snapshot ? std::make_optional(parent_snapshot->snapshot_id) : std::nullopt; - if (parent_snapshot) { - ICEBERG_RETURN_UNEXPECTED(Validate(base(), parent_snapshot)); - } + ICEBERG_RETURN_UNEXPECTED(Validate(base(), parent_snapshot)); ICEBERG_ASSIGN_OR_RAISE(auto manifests, Apply(base(), parent_snapshot)); for (auto& manifest : manifests) { @@ -314,7 +321,7 @@ Status SnapshotUpdate::Finalize(Result commit_result) { if (commit_result.error().kind == ErrorKind::kCommitStateUnknown) { return {}; } - CleanAll(); + std::ignore = CleanAll(); return {}; } @@ -322,11 +329,14 @@ Status SnapshotUpdate::Finalize(Result commit_result) { ICEBERG_CHECK(staged_snapshot_ != nullptr, "Staged snapshot is null during finalize after commit"); auto cached_snapshot = SnapshotCache(staged_snapshot_.get()); - ICEBERG_ASSIGN_OR_RAISE(auto manifests, cached_snapshot.Manifests(ctx_->table->io())); - CleanUncommitted(manifests | std::views::transform([](const auto& manifest) { - return manifest.manifest_path; - }) | - std::ranges::to>()); + if (auto manifests = cached_snapshot.Manifests(ctx_->table->io()); + manifests.has_value()) { + std::ignore = CleanUncommitted(manifests.value() | + std::views::transform([](const auto& manifest) { + return manifest.manifest_path; + }) | + std::ranges::to>()); + } } // Also clean up unused manifest lists created by multiple attempts @@ -389,23 +399,20 @@ Result> SnapshotUpdate::ComputeSumm return summary; } -void SnapshotUpdate::CleanAll() { +Status SnapshotUpdate::CleanAll() { for (const auto& manifest_list : manifest_lists_) { std::ignore = DeleteFile(manifest_list); } manifest_lists_.clear(); - CleanUncommitted(std::unordered_set{}); + std::ignore = CleanUncommitted(std::unordered_set{}); + return {}; } Status SnapshotUpdate::DeleteFile(const std::string& path) { - static const auto kDefaultDeleteFunc = [this](const std::string& path) { - return this->ctx_->table->io()->DeleteFile(path); - }; if (delete_func_) { return delete_func_(path); - } else { - return kDefaultDeleteFunc(path); } + return ctx_->table->io()->DeleteFile(path); } std::string SnapshotUpdate::ManifestListPath() { diff --git a/src/iceberg/update/snapshot_update.h b/src/iceberg/update/snapshot_update.h index f48e5f44d..03a74e788 100644 --- a/src/iceberg/update/snapshot_update.h +++ b/src/iceberg/update/snapshot_update.h @@ -105,7 +105,7 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { /// \param value The property value /// \return Reference to this for method chaining auto& Set(this auto& self, const std::string& property, const std::string& value) { - self.summary_.Set(property, value); + static_cast(self).SetSummaryProperty(property, value); return self; } @@ -122,6 +122,11 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { Status Finalize(Result commit_result) override; protected: + struct ContentFileWithSequenceNumber { + std::shared_ptr file; + std::optional data_sequence_number; + }; + explicit SnapshotUpdate(std::shared_ptr ctx); /// \brief Write data manifests for the given data files @@ -135,13 +140,8 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { const std::shared_ptr& spec, std::optional data_sequence_number = std::nullopt); - /// \brief Write delete manifests for the given delete files - /// - /// \param files Delete files to write - /// \param spec The partition spec to use - /// \return A vector of manifest files Result> WriteDeleteManifests( - std::span> files, + std::span files, const std::shared_ptr& spec); const std::string& target_branch() const { return target_branch_; } @@ -161,7 +161,7 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { /// actually committed. /// /// \param committed A set of manifest paths that were actually committed - virtual void CleanUncommitted(const std::unordered_set& committed) = 0; + virtual Status CleanUncommitted(const std::unordered_set& committed) = 0; /// \brief A string that describes the action that produced the new snapshot. /// @@ -193,6 +193,12 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { /// \return A map of summary properties virtual std::unordered_map Summary() = 0; + /// \brief Set a summary property. + /// + /// Implementations may override this to retain custom properties across + /// retry-safe summary rebuilds. + virtual void SetSummaryProperty(const std::string& property, const std::string& value); + /// \brief Check if cleanup should happen after commit /// /// \return True if cleanup should happen after commit @@ -217,7 +223,7 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { const TableMetadata& previous); /// \brief Clean up all uncommitted files - void CleanAll(); + Status CleanAll(); protected: SnapshotSummaryBuilder summary_; diff --git a/src/iceberg/util/data_file_set.h b/src/iceberg/util/data_file_set.h index 741b34e56..93abdfff7 100644 --- a/src/iceberg/util/data_file_set.h +++ b/src/iceberg/util/data_file_set.h @@ -20,12 +20,15 @@ #pragma once /// \file iceberg/util/data_file_set.h -/// A set of DataFile pointers with insertion order preserved and deduplicated by file -/// path. +/// Sets of DataFile pointers with insertion order preserved and Iceberg-compatible +/// file identity deduplication. +#include #include #include +#include #include +#include #include #include #include @@ -46,6 +49,16 @@ class ICEBERG_EXPORT DataFileSet { using difference_type = typename std::vector::difference_type; DataFileSet() = default; + DataFileSet(const DataFileSet& other) : elements_(other.elements_) { RebuildIndex(); } + DataFileSet& operator=(const DataFileSet& other) { + if (this != &other) { + elements_ = other.elements_; + RebuildIndex(); + } + return *this; + } + DataFileSet(DataFileSet&&) noexcept = default; + DataFileSet& operator=(DataFileSet&&) noexcept = default; /// \brief Insert a data file into the set. /// \param file The data file to insert @@ -58,6 +71,16 @@ class ICEBERG_EXPORT DataFileSet { return InsertImpl(std::move(file)); } + /// \brief Returns whether an equivalent data file exists in the set. + bool contains(const DataFile& file) const { + return index_by_path_.contains(file.file_path); + } + + /// \brief Returns whether an equivalent data file exists in the set. + bool contains(const value_type& file) const { + return file != nullptr && contains(*file); + } + /// \brief Get the number of elements in the set. size_t size() const { return elements_.size(); } @@ -100,9 +123,146 @@ class ICEBERG_EXPORT DataFileSet { return {std::prev(elements_.end()), true}; } + void RebuildIndex() { + index_by_path_.clear(); + for (size_t i = 0; i < elements_.size(); ++i) { + if (elements_[i] != nullptr) { + index_by_path_.try_emplace(elements_[i]->file_path, i); + } + } + } + // Vector to preserve insertion order std::vector elements_; std::unordered_map index_by_path_; }; +/// \brief A set of delete-file pointers deduplicated by delete-file identity. +/// +/// Delete files, especially deletion vectors, are identified by location plus +/// content offset and content size. This mirrors Java's DeleteFileSet behavior. +class ICEBERG_EXPORT DeleteFileSet { + public: + using value_type = std::shared_ptr; + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; + using difference_type = typename std::vector::difference_type; + + DeleteFileSet() = default; + DeleteFileSet(const DeleteFileSet& other) : elements_(other.elements_) { + RebuildIndex(); + } + DeleteFileSet& operator=(const DeleteFileSet& other) { + if (this != &other) { + elements_ = other.elements_; + RebuildIndex(); + } + return *this; + } + DeleteFileSet(DeleteFileSet&&) noexcept = default; + DeleteFileSet& operator=(DeleteFileSet&&) noexcept = default; + + /// \brief Insert a delete file into the set. + /// \param file The delete file to insert + /// \return A pair with an iterator to the inserted element (or the existing one) and + /// a bool indicating whether insertion took place + std::pair insert(const value_type& file) { return InsertImpl(file); } + + /// \brief Insert a delete file into the set (move version). + std::pair insert(value_type&& file) { + return InsertImpl(std::move(file)); + } + + /// \brief Returns whether an equivalent delete file exists in the set. + bool contains(const DataFile& file) const { + return index_by_file_.contains(DeleteFileKey(file)); + } + + /// \brief Returns whether an equivalent delete file exists in the set. + bool contains(const value_type& file) const { + return file != nullptr && contains(*file); + } + + /// \brief Get the number of elements in the set. + size_t size() const { return elements_.size(); } + + /// \brief Check if the set is empty. + bool empty() const { return elements_.empty(); } + + /// \brief Clear all elements from the set. + void clear() { + elements_.clear(); + index_by_file_.clear(); + } + + /// \brief Get iterator to the beginning. + iterator begin() { return elements_.begin(); } + const_iterator begin() const { return elements_.begin(); } + const_iterator cbegin() const { return elements_.cbegin(); } + + /// \brief Get iterator to the end. + iterator end() { return elements_.end(); } + const_iterator end() const { return elements_.end(); } + const_iterator cend() const { return elements_.cend(); } + + /// \brief Get a non-owning view of the delete files in insertion order. + std::span as_span() const { return elements_; } + + private: + struct DeleteFileKey { + explicit DeleteFileKey(const DataFile& file) + : path(file.file_path), + content_offset(file.content_offset), + content_size_in_bytes(file.content_size_in_bytes) {} + + std::string path; + std::optional content_offset; + std::optional content_size_in_bytes; + + bool operator==(const DeleteFileKey& other) const = default; + }; + + struct DeleteFileKeyHash { + size_t operator()(const DeleteFileKey& key) const { + size_t hash = std::hash{}(key.path); + auto combine = [&hash](const auto& value) { + size_t value_hash = value.has_value() ? std::hash{}(*value) : 0; + hash ^= value_hash + 0x9e3779b9 + (hash << 6) + (hash >> 2); + }; + combine(key.content_offset); + combine(key.content_size_in_bytes); + return hash; + } + }; + + std::pair InsertImpl(value_type file) { + if (!file) { + return {elements_.end(), false}; + } + + auto [index_iter, inserted] = + index_by_file_.try_emplace(DeleteFileKey(*file), elements_.size()); + if (!inserted) { + auto pos = static_cast(index_iter->second); + return {elements_.begin() + pos, false}; + } + + elements_.push_back(std::move(file)); + return {std::prev(elements_.end()), true}; + } + + void RebuildIndex() { + index_by_file_.clear(); + for (size_t i = 0; i < elements_.size(); ++i) { + if (elements_[i] != nullptr) { + index_by_file_.try_emplace(DeleteFileKey(*elements_[i]), i); + } + } + } + + // Vector to preserve insertion order. + std::vector elements_; + std::unordered_map index_by_file_; +}; + } // namespace iceberg