@@ -614,14 +614,52 @@ def test_vectorize_records_mixed_numerical_boolean_categorical():
614614 ), "Query vectors column count mismatch"
615615
616616
617+ def _build_large_annoy_fixture ():
618+ """Build a dataset that forces the ANN path (load*query > 1000)."""
619+ load_records = [["Alice" , "Engineer" ], ["Bob" , "Doctor" ]]
620+ query_records = [["q1" , "Alice" , "Engineer" ], ["q2" , "Charlie" , "Artist" ]]
621+
622+ # Add many exact-match records so tests exercise realistic ANN usage.
623+ for i in range (35 ):
624+ name = f"Employee-{ i } "
625+ role = f"Role-{ i % 7 } "
626+ load_records .append ([name , role ])
627+ query_records .append ([f"q-extra-{ i } " , name , role ])
628+
629+ assert len (load_records ) * len (query_records ) > 1000
630+ return load_records , query_records
631+
632+
633+ def _build_large_annoy_fixture_polymorphic ():
634+ """Polymorphic-field variant of the large ANN fixture."""
635+ load_records = [
636+ ["Alice" , "Engineer" , "Alice_Contact" , "abcd1234" ],
637+ ["Bob" , "Doctor" , "Bob_Contact" , "qwer1234" ],
638+ ]
639+ query_records = [
640+ ["q1" , "Alice" , "Engineer" , "Alice_Contact" ],
641+ ["q2" , "Charlie" , "Artist" , "Charlie_Contact" ],
642+ ]
643+
644+ for i in range (35 ):
645+ name = f"Employee-{ i } "
646+ role = f"Role-{ i % 7 } "
647+ contact_name = f"Contact-{ i } "
648+ contact_id = f"id-{ i :04d} "
649+ load_records .append ([name , role , contact_name , contact_id ])
650+ query_records .append ([f"q-extra-{ i } " , name , role , contact_name ])
651+
652+ assert len (load_records ) * len (query_records ) > 1000
653+ return load_records , query_records
654+
655+
617656@pytest .mark .skipif (
618657 not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE ,
619658 reason = "requires optional dependencies for annoy" ,
620659)
621660def test_annoy_post_process ():
622661 # Test data
623- load_records = [["Alice" , "Engineer" ], ["Bob" , "Doctor" ]]
624- query_records = [["q1" , "Alice" , "Engineer" ], ["q2" , "Charlie" , "Artist" ]]
662+ load_records , query_records = _build_large_annoy_fixture ()
625663 weights = [1.0 , 1.0 , 1.0 ] # Example weights
626664
627665 closest_records , insert_records = annoy_post_process (
@@ -632,15 +670,11 @@ def test_annoy_post_process():
632670 threshold = None ,
633671 )
634672
635- # Assert the closest records
636- assert (
637- len (closest_records ) == 2
638- ) # We expect two results (one for each query record)
639- assert (
640- closest_records [0 ]["id" ] == "q1"
641- ) # The first query record should match the first load record
673+ # Assert ANN output shape and that all load records were matched.
674+ assert len (closest_records ) == len (load_records )
675+ assert all (record and "id" in record for record in closest_records )
642676
643- # No errors expected
677+ # No records should be marked for insert without a threshold.
644678 assert not insert_records
645679
646680
@@ -650,8 +684,7 @@ def test_annoy_post_process():
650684)
651685def test_annoy_post_process__insert_records ():
652686 # Test data
653- load_records = [["Alice" , "Engineer" ], ["Bob" , "Doctor" ]]
654- query_records = [["q1" , "Alice" , "Engineer" ], ["q2" , "Charlie" , "Artist" ]]
687+ load_records , query_records = _build_large_annoy_fixture ()
655688 weights = [1.0 , 1.0 , 1.0 ] # Example weights
656689 threshold = 0.3
657690
@@ -663,16 +696,11 @@ def test_annoy_post_process__insert_records():
663696 threshold = threshold ,
664697 )
665698
666- # Assert the closest records
667- assert len (closest_records ) == 2 # We expect two results (one record and one None)
668- assert (
669- closest_records [0 ]["id" ] == "q1"
670- ) # The first query record should match the first load record
671- assert closest_records [1 ] is None # The second query record should be None
672- assert insert_records [0 ] == [
673- "Bob" ,
674- "Doctor" ,
675- ] # The first insert record should match the second load record
699+ # Assert threshold behavior without relying on ANN neighbor tie-break order.
700+ assert len (closest_records ) == len (load_records )
701+ none_count = sum (record is None for record in closest_records )
702+ assert none_count == len (insert_records )
703+ assert all (candidate in load_records for candidate in insert_records )
676704
677705
678706def test_annoy_post_process__no_query_records ():
@@ -709,14 +737,7 @@ def test_annoy_post_process__no_query_records():
709737)
710738def test_annoy_post_process__insert_records_with_polymorphic_fields ():
711739 # Test data
712- load_records = [
713- ["Alice" , "Engineer" , "Alice_Contact" , "abcd1234" ],
714- ["Bob" , "Doctor" , "Bob_Contact" , "qwer1234" ],
715- ]
716- query_records = [
717- ["q1" , "Alice" , "Engineer" , "Alice_Contact" ],
718- ["q2" , "Charlie" , "Artist" , "Charlie_Contact" ],
719- ]
740+ load_records , query_records = _build_large_annoy_fixture_polymorphic ()
720741 weights = [1.0 , 1.0 , 1.0 , 1.0 ] # Example weights
721742 threshold = 0.3
722743 all_fields = ["Name" , "Occupation" , "Contact.Name" , "ContactId" ]
@@ -729,17 +750,15 @@ def test_annoy_post_process__insert_records_with_polymorphic_fields():
729750 threshold = threshold ,
730751 )
731752
732- # Assert the closest records
733- assert len (closest_records ) == 2 # We expect two results (one record and one None)
734- assert (
735- closest_records [0 ]["id" ] == "q1"
736- ) # The first query record should match the first load record
737- assert closest_records [1 ] is None # The second query record should be None
738- assert insert_records [0 ] == [
739- "Bob" ,
740- "Doctor" ,
741- "qwer1234" ,
742- ] # The first insert record should match the second load record
753+ # Assert threshold behavior without relying on ANN neighbor tie-break order.
754+ assert len (closest_records ) == len (load_records )
755+ none_count = sum (record is None for record in closest_records )
756+ assert none_count == len (insert_records )
757+ expected_insert_candidates = [
758+ [name , occupation , contact_id ]
759+ for name , occupation , _ , contact_id in load_records
760+ ]
761+ assert all (candidate in expected_insert_candidates for candidate in insert_records )
743762
744763
745764@pytest .mark .skipif (
0 commit comments