1111 find_closest_record ,
1212 levenshtein_distance ,
1313 reorder_records ,
14- replace_empty_strings_with_missing ,
1514 split_and_filter_fields ,
1615 vectorize_records ,
1716)
@@ -485,43 +484,9 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match():
485484 assert "Records must be same size as fields (weights)." in str (e .value )
486485
487486
488- def test_replace_empty_strings_with_missing ():
489- # Case 1: Normal case with some empty strings
490- records = [
491- ["Alice" , "" , "New York" ],
492- ["Bob" , "Engineer" , "" ],
493- ["" , "Teacher" , "Chicago" ],
494- ]
495- expected = [
496- ["Alice" , "missing" , "New York" ],
497- ["Bob" , "Engineer" , "missing" ],
498- ["missing" , "Teacher" , "Chicago" ],
499- ]
500- assert replace_empty_strings_with_missing (records ) == expected
501-
502- # Case 2: No empty strings, so the output should be the same as input
503- records = [["Alice" , "Manager" , "New York" ], ["Bob" , "Engineer" , "San Francisco" ]]
504- expected = [["Alice" , "Manager" , "New York" ], ["Bob" , "Engineer" , "San Francisco" ]]
505- assert replace_empty_strings_with_missing (records ) == expected
506-
507- # Case 3: List with all empty strings
508- records = [["" , "" , "" ], ["" , "" , "" ]]
509- expected = [["missing" , "missing" , "missing" ], ["missing" , "missing" , "missing" ]]
510- assert replace_empty_strings_with_missing (records ) == expected
511-
512- # Case 4: Empty list (should return an empty list)
513- records = []
514- expected = []
515- assert replace_empty_strings_with_missing (records ) == expected
516-
517- # Case 5: List with some empty sublists
518- records = [[], ["Alice" , "" ], []]
519- expected = [[], ["Alice" , "missing" ], []]
520- assert replace_empty_strings_with_missing (records ) == expected
521-
522-
523487def test_all_numeric_columns ():
524- df = pd .DataFrame ({"A" : [1 , 2 , 3 ], "B" : [4.5 , 5.5 , 6.5 ]})
488+ df_db = pd .DataFrame ({"A" : [1 , 2 , 3 ], "B" : [4.5 , 5.5 , 6.5 ]})
489+ df_query = pd .DataFrame ({"A" : [4 , 5 , "" ], "B" : [4.5 , 5.5 , 6.5 ]})
525490 weights = [0.1 , 0.2 ]
526491 expected_output = (
527492 ["A" , "B" ], # numerical_features
@@ -531,11 +496,31 @@ def test_all_numeric_columns():
531496 [], # boolean_weights
532497 [], # categorical_weights
533498 )
534- assert determine_field_types (df , weights ) == expected_output
499+ assert determine_field_types (df_db , df_query , weights ) == expected_output
500+
501+
502+ def test_numeric_columns__one_non_numeric ():
503+ df_db = pd .DataFrame ({"A" : [1 , 2 , 3 ], "B" : [4.5 , 5.5 , 6.5 ]})
504+ df_query = pd .DataFrame ({"A" : [4 , 5 , 6 ], "B" : ["abcd" , 5.5 , 6.5 ]})
505+ weights = [0.1 , 0.2 ]
506+ expected_output = (
507+ ["A" ], # numerical_features
508+ [], # boolean_features
509+ [], # categorical_features
510+ [0.1 ], # numerical_weights
511+ [], # boolean_weights
512+ [], # categorical_weights
513+ )
514+ assert determine_field_types (df_db , df_query , weights ) == expected_output
535515
536516
537517def test_all_boolean_columns ():
538- df = pd .DataFrame ({"A" : ["true" , "false" , "true" ], "B" : ["false" , "true" , "false" ]})
518+ df_db = pd .DataFrame (
519+ {"A" : ["true" , "false" , "true" ], "B" : ["false" , "true" , "false" ]}
520+ )
521+ df_query = pd .DataFrame (
522+ {"A" : ["true" , "false" , "true" ], "B" : ["false" , "true" , "false" ]}
523+ )
539524 weights = [0.3 , 0.4 ]
540525 expected_output = (
541526 [], # numerical_features
@@ -545,13 +530,16 @@ def test_all_boolean_columns():
545530 [0.3 , 0.4 ], # boolean_weights
546531 [], # categorical_weights
547532 )
548- assert determine_field_types (df , weights ) == expected_output
533+ assert determine_field_types (df_db , df_query , weights ) == expected_output
549534
550535
551536def test_all_categorical_columns ():
552- df = pd .DataFrame (
537+ df_db = pd .DataFrame (
553538 {"A" : ["apple" , "banana" , "cherry" ], "B" : ["dog" , "cat" , "mouse" ]}
554539 )
540+ df_query = pd .DataFrame (
541+ {"A" : ["banana" , "apple" , "cherry" ], "B" : ["cat" , "dog" , "mouse" ]}
542+ )
555543 weights = [0.5 , 0.6 ]
556544 expected_output = (
557545 [], # numerical_features
@@ -561,17 +549,24 @@ def test_all_categorical_columns():
561549 [], # boolean_weights
562550 [0.5 , 0.6 ], # categorical_weights
563551 )
564- assert determine_field_types (df , weights ) == expected_output
552+ assert determine_field_types (df_db , df_query , weights ) == expected_output
565553
566554
567555def test_mixed_types ():
568- df = pd .DataFrame (
556+ df_db = pd .DataFrame (
569557 {
570558 "A" : [1 , 2 , 3 ],
571559 "B" : ["true" , "false" , "true" ],
572560 "C" : ["apple" , "banana" , "cherry" ],
573561 }
574562 )
563+ df_query = pd .DataFrame (
564+ {
565+ "A" : [1 , 3 , "" ],
566+ "B" : ["true" , "true" , "true" ],
567+ "C" : ["apple" , "" , 3 ],
568+ }
569+ )
575570 weights = [0.7 , 0.8 , 0.9 ]
576571 expected_output = (
577572 ["A" ], # numerical_features
@@ -581,7 +576,7 @@ def test_mixed_types():
581576 [0.8 ], # boolean_weights
582577 [0.9 ], # categorical_weights
583578 )
584- assert determine_field_types (df , weights ) == expected_output
579+ assert determine_field_types (df_db , df_query , weights ) == expected_output
585580
586581
587582def test_vectorize_records_mixed_numerical_boolean_categorical ():
0 commit comments