Skip to content

Commit b9a3e0b

Browse files
committed
Fix Annoy tests to exercise real ANN behavior.
Revert the small-dataset production fallback and update Annoy tests to use larger fixtures that reliably drive the ANN path while preserving expected matching assertions.
1 parent 42c4be8 commit b9a3e0b

2 files changed

Lines changed: 56 additions & 52 deletions

File tree

cumulusci/tasks/bulkdata/select_utils.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -332,17 +332,6 @@ def annoy_post_process(
332332
threshold: T.Union[float, None],
333333
) -> T.Tuple[T.List[dict], list]:
334334
"""Processes the query results for the similarity selection strategy using Annoy algorithm for large number of records"""
335-
# This helper is primarily used for large datasets. For tiny datasets, reuse the
336-
# deterministic Levenshtein path to avoid ANN approximation noise in tests.
337-
if len(load_records) * len(query_records) < 1000:
338-
return levenshtein_post_process(
339-
load_records,
340-
query_records,
341-
all_fields,
342-
similarity_weights,
343-
threshold,
344-
)
345-
346335
# Add warning when threshold is 0
347336
if threshold is not None and threshold == 0:
348337
logger.warning(

cumulusci/tasks/bulkdata/tests/test_select_utils.py

Lines changed: 56 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -614,14 +614,52 @@ def test_vectorize_records_mixed_numerical_boolean_categorical():
614614
), "Query vectors column count mismatch"
615615

616616

617+
def _build_large_annoy_fixture(extra_records=35):
    """Build load/query fixtures big enough to exercise Annoy on a real index.

    Both lists start with two hand-written rows that the tests assert on:
    load row ``Alice`` has an identical query row (``q1``), while load row
    ``Bob`` and query row ``q2``/``Charlie`` have no counterpart. They are
    followed by ``extra_records`` generated pairs in which the query row is
    the load row prefixed with a query id.

    Args:
        extra_records: Number of generated exact-match pairs to append.
            The default keeps the fixture at 37 x 37 rows.

    Returns:
        A ``(load_records, query_records)`` tuple. Load rows are
        ``[name, role]``; query rows are ``[id, name, role]``.

    Raises:
        AssertionError: If the resulting fixture is too small, i.e. the
            product of the two list lengths does not exceed 1000.
    """
    load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
    query_records = [["q1", "Alice", "Engineer"], ["q2", "Charlie", "Artist"]]

    # Add many exact-match records so tests exercise realistic ANN usage.
    for i in range(extra_records):
        name = f"Employee-{i}"
        # Only seven distinct roles, so generated rows share role values.
        role = f"Role-{i % 7}"
        load_records.append([name, role])
        query_records.append([f"q-extra-{i}", name, role])

    # Size guard: fail loudly if someone shrinks the fixture below the
    # 1000-comparison mark these tests were written against.
    comparisons = len(load_records) * len(query_records)
    assert comparisons > 1000, (
        f"fixture too small: {comparisons} load x query comparisons"
    )
    return load_records, query_records
631+
632+
633+
def _build_large_annoy_fixture_polymorphic(extra_records=35):
    """Build the four-column variant of the large Annoy fixture.

    Load rows carry four values ``[name, role, contact_name, contact_id]``;
    query rows carry ``[id, name, role, contact_name]``, so the trailing
    load value has no query-side counterpart. Both lists start with two
    hand-written rows that the tests assert on: load row ``Alice`` matches
    query row ``q1`` on its first three values, while load row ``Bob`` and
    query row ``q2``/``Charlie`` have no counterpart. They are followed by
    ``extra_records`` generated pairs that agree on name, role and contact
    name.

    Args:
        extra_records: Number of generated matching pairs to append.
            The default keeps the fixture at 37 x 37 rows.

    Returns:
        A ``(load_records, query_records)`` tuple.

    Raises:
        AssertionError: If the resulting fixture is too small, i.e. the
            product of the two list lengths does not exceed 1000.
    """
    load_records = [
        ["Alice", "Engineer", "Alice_Contact", "abcd1234"],
        ["Bob", "Doctor", "Bob_Contact", "qwer1234"],
    ]
    query_records = [
        ["q1", "Alice", "Engineer", "Alice_Contact"],
        ["q2", "Charlie", "Artist", "Charlie_Contact"],
    ]

    for i in range(extra_records):
        name = f"Employee-{i}"
        # Only seven distinct roles, so generated rows share role values.
        role = f"Role-{i % 7}"
        contact_name = f"Contact-{i}"
        # Zero-padded so every generated id has the same width.
        contact_id = f"id-{i:04d}"
        load_records.append([name, role, contact_name, contact_id])
        query_records.append([f"q-extra-{i}", name, role, contact_name])

    # Size guard: fail loudly if someone shrinks the fixture below the
    # 1000-comparison mark these tests were written against.
    comparisons = len(load_records) * len(query_records)
    assert comparisons > 1000, (
        f"fixture too small: {comparisons} load x query comparisons"
    )
    return load_records, query_records
654+
655+
617656
@pytest.mark.skipif(
618657
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
619658
reason="requires optional dependencies for annoy",
620659
)
621660
def test_annoy_post_process():
622661
# Test data
623-
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
624-
query_records = [["q1", "Alice", "Engineer"], ["q2", "Charlie", "Artist"]]
662+
load_records, query_records = _build_large_annoy_fixture()
625663
weights = [1.0, 1.0, 1.0] # Example weights
626664

627665
closest_records, insert_records = annoy_post_process(
@@ -632,15 +670,11 @@ def test_annoy_post_process():
632670
threshold=None,
633671
)
634672

635-
# Assert the closest records
636-
assert (
637-
len(closest_records) == 2
638-
) # We expect two results (one for each query record)
639-
assert (
640-
closest_records[0]["id"] == "q1"
641-
) # The first query record should match the first load record
673+
# Assert the closest records for the first (explicit) fixtures.
674+
assert len(closest_records) == len(load_records)
675+
assert closest_records[0]["id"] == "q1"
642676

643-
# No errors expected
677+
# No records should be marked for insert without a threshold.
644678
assert not insert_records
645679

646680

@@ -650,8 +684,7 @@ def test_annoy_post_process():
650684
)
651685
def test_annoy_post_process__insert_records():
652686
# Test data
653-
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
654-
query_records = [["q1", "Alice", "Engineer"], ["q2", "Charlie", "Artist"]]
687+
load_records, query_records = _build_large_annoy_fixture()
655688
weights = [1.0, 1.0, 1.0] # Example weights
656689
threshold = 0.3
657690

@@ -663,16 +696,11 @@ def test_annoy_post_process__insert_records():
663696
threshold=threshold,
664697
)
665698

666-
# Assert the closest records
667-
assert len(closest_records) == 2 # We expect two results (one record and one None)
668-
assert (
669-
closest_records[0]["id"] == "q1"
670-
) # The first query record should match the first load record
671-
assert closest_records[1] is None # The second query record should be None
672-
assert insert_records[0] == [
673-
"Bob",
674-
"Doctor",
675-
] # The first insert record should match the second load record
699+
# Assert expected behavior for the first two (explicit) fixtures.
700+
assert len(closest_records) == len(load_records)
701+
assert closest_records[0]["id"] == "q1"
702+
assert closest_records[1] is None
703+
assert ["Bob", "Doctor"] in insert_records
676704

677705

678706
def test_annoy_post_process__no_query_records():
@@ -709,14 +737,7 @@ def test_annoy_post_process__no_query_records():
709737
)
710738
def test_annoy_post_process__insert_records_with_polymorphic_fields():
711739
# Test data
712-
load_records = [
713-
["Alice", "Engineer", "Alice_Contact", "abcd1234"],
714-
["Bob", "Doctor", "Bob_Contact", "qwer1234"],
715-
]
716-
query_records = [
717-
["q1", "Alice", "Engineer", "Alice_Contact"],
718-
["q2", "Charlie", "Artist", "Charlie_Contact"],
719-
]
740+
load_records, query_records = _build_large_annoy_fixture_polymorphic()
720741
weights = [1.0, 1.0, 1.0, 1.0] # Example weights
721742
threshold = 0.3
722743
all_fields = ["Name", "Occupation", "Contact.Name", "ContactId"]
@@ -729,17 +750,11 @@ def test_annoy_post_process__insert_records_with_polymorphic_fields():
729750
threshold=threshold,
730751
)
731752

732-
# Assert the closest records
733-
assert len(closest_records) == 2 # We expect two results (one record and one None)
734-
assert (
735-
closest_records[0]["id"] == "q1"
736-
) # The first query record should match the first load record
737-
assert closest_records[1] is None # The second query record should be None
738-
assert insert_records[0] == [
739-
"Bob",
740-
"Doctor",
741-
"qwer1234",
742-
] # The first insert record should match the second load record
753+
# Assert expected behavior for the first two (explicit) fixtures.
754+
assert len(closest_records) == len(load_records)
755+
assert closest_records[0]["id"] == "q1"
756+
assert closest_records[1] is None
757+
assert ["Bob", "Doctor", "qwer1234"] in insert_records
743758

744759

745760
@pytest.mark.skipif(

0 commit comments

Comments
 (0)