Skip to content

Commit 44e80b2

Browse files
committed
Fix for numerical feature category bug
1 parent 6c439f2 commit 44e80b2

3 files changed

Lines changed: 62 additions & 68 deletions

File tree

cumulusci/tasks/bulkdata/select_utils.py

Lines changed: 19 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -352,9 +352,6 @@ def annoy_post_process(
352352
insertion_candidates = load_shaped_records
353353
return selected_records, insertion_candidates
354354

355-
query_records = replace_empty_strings_with_missing(query_records)
356-
select_shaped_records = replace_empty_strings_with_missing(select_shaped_records)
357-
358355
hash_features = 100
359356
num_trees = 10
360357

@@ -589,7 +586,7 @@ def add_limit_offset_to_user_filter(
589586
return f" {filter_clause}"
590587

591588

592-
def determine_field_types(df, weights):
589+
def determine_field_types(df_db, df_query, weights):
593590
numerical_features = []
594591
boolean_features = []
595592
categorical_features = []
@@ -598,23 +595,35 @@ def determine_field_types(df, weights):
598595
boolean_weights = []
599596
categorical_weights = []
600597

601-
for col, weight in zip(df.columns, weights):
598+
for col, weight in zip(df_db.columns, weights):
602599
# Check if the column can be converted to numeric
603600
try:
604-
# Attempt to convert to numeric
605-
df[col] = pd.to_numeric(df[col], errors="raise")
601+
temp_df_db = pd.to_numeric(df_db[col], errors="raise")
602+
temp_df_query = pd.to_numeric(df_query[col], errors="raise")
603+
# Replace empty values with 0 for numerical features
604+
df_db[col] = temp_df_db.fillna(0).replace("", 0)
605+
df_query[col] = temp_df_query.fillna(0).replace("", 0)
606606
numerical_features.append(col)
607607
numerical_weights.append(weight)
608608
except ValueError:
609609
# Check for boolean values
610-
if df[col].str.lower().isin(["true", "false"]).all():
610+
if (
611+
df_db[col].str.lower().isin(["true", "false"]).all()
612+
and df_query[col].str.lower().isin(["true", "false"]).all()
613+
):
611614
# Map to actual boolean values
612-
df[col] = df[col].str.lower().map({"true": True, "false": False})
615+
df_db[col] = df_db[col].str.lower().map({"true": True, "false": False})
616+
df_query[col] = (
617+
df_query[col].str.lower().map({"true": True, "false": False})
618+
)
613619
boolean_features.append(col)
614620
boolean_weights.append(weight)
615621
else:
616622
categorical_features.append(col)
617623
categorical_weights.append(weight)
624+
# Replace empty values with 'missing' for categorical features
625+
df_db[col] = df_db[col].replace("", "missing")
626+
df_query[col] = df_query[col].replace("", "missing")
618627

619628
return (
620629
numerical_features,
@@ -640,14 +649,7 @@ def vectorize_records(db_records, query_records, hash_features, weights):
640649
numerical_weights,
641650
boolean_weights,
642651
categorical_weights,
643-
) = determine_field_types(df_db, weights)
644-
645-
# Modify query dataframe boolean columns to True or False
646-
for col in df_query.columns:
647-
if df_query[col].str.lower().isin(["true", "false"]).all():
648-
df_query[col] = (
649-
df_query[col].str.lower().map({"true": True, "false": False})
650-
)
652+
) = determine_field_types(df_db, df_query, weights)
651653

652654
# Fit StandardScaler on the numerical features of the database records
653655
scaler = StandardScaler()
@@ -705,13 +707,6 @@ def vectorize_records(db_records, query_records, hash_features, weights):
705707
return final_db_vectors, final_query_vectors
706708

707709

708-
def replace_empty_strings_with_missing(records):
709-
return [
710-
[(field if field != "" else "missing") for field in record]
711-
for record in records
712-
]
713-
714-
715710
def split_and_filter_fields(fields: T.List[str]) -> T.Tuple[T.List[str], T.List[str]]:
716711
# List to store non-lookup fields (load fields)
717712
load_fields = []

cumulusci/tasks/bulkdata/step.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -478,9 +478,11 @@ def select_records(self, records):
478478
)
479479

480480
# Execute the main select query using Bulk API
481+
self.logger.info("Retrieving records from org...")
481482
select_query_records = self._execute_select_query(
482483
select_query=select_query, query_fields=query_fields
483484
)
485+
self.logger.info(f"Retrieved {len(select_query_records)} from org")
484486

485487
query_records.extend(select_query_records)
486488
# Post-process the query results
@@ -895,7 +897,9 @@ def select_records(self, records):
895897
)
896898

897899
# Execute the query and gather the records
900+
self.logger.info("Retrieving records from org...")
898901
query_records = self._execute_soql_query(select_query, query_fields)
902+
self.logger.info(f"Retrieved {len(query_records)} from org")
899903

900904
# Post-process the query results for this batch
901905
(

cumulusci/tasks/bulkdata/tests/test_select_utils.py

Lines changed: 39 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
find_closest_record,
1212
levenshtein_distance,
1313
reorder_records,
14-
replace_empty_strings_with_missing,
1514
split_and_filter_fields,
1615
vectorize_records,
1716
)
@@ -485,43 +484,9 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match():
485484
assert "Records must be same size as fields (weights)." in str(e.value)
486485

487486

488-
def test_replace_empty_strings_with_missing():
489-
# Case 1: Normal case with some empty strings
490-
records = [
491-
["Alice", "", "New York"],
492-
["Bob", "Engineer", ""],
493-
["", "Teacher", "Chicago"],
494-
]
495-
expected = [
496-
["Alice", "missing", "New York"],
497-
["Bob", "Engineer", "missing"],
498-
["missing", "Teacher", "Chicago"],
499-
]
500-
assert replace_empty_strings_with_missing(records) == expected
501-
502-
# Case 2: No empty strings, so the output should be the same as input
503-
records = [["Alice", "Manager", "New York"], ["Bob", "Engineer", "San Francisco"]]
504-
expected = [["Alice", "Manager", "New York"], ["Bob", "Engineer", "San Francisco"]]
505-
assert replace_empty_strings_with_missing(records) == expected
506-
507-
# Case 3: List with all empty strings
508-
records = [["", "", ""], ["", "", ""]]
509-
expected = [["missing", "missing", "missing"], ["missing", "missing", "missing"]]
510-
assert replace_empty_strings_with_missing(records) == expected
511-
512-
# Case 4: Empty list (should return an empty list)
513-
records = []
514-
expected = []
515-
assert replace_empty_strings_with_missing(records) == expected
516-
517-
# Case 5: List with some empty sublists
518-
records = [[], ["Alice", ""], []]
519-
expected = [[], ["Alice", "missing"], []]
520-
assert replace_empty_strings_with_missing(records) == expected
521-
522-
523487
def test_all_numeric_columns():
524-
df = pd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]})
488+
df_db = pd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]})
489+
df_query = pd.DataFrame({"A": [4, 5, ""], "B": [4.5, 5.5, 6.5]})
525490
weights = [0.1, 0.2]
526491
expected_output = (
527492
["A", "B"], # numerical_features
@@ -531,11 +496,31 @@ def test_all_numeric_columns():
531496
[], # boolean_weights
532497
[], # categorical_weights
533498
)
534-
assert determine_field_types(df, weights) == expected_output
499+
assert determine_field_types(df_db, df_query, weights) == expected_output
500+
501+
502+
def test_numeric_columns__one_non_numeric():
503+
df_db = pd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]})
504+
df_query = pd.DataFrame({"A": [4, 5, 6], "B": ["abcd", 5.5, 6.5]})
505+
weights = [0.1, 0.2]
506+
expected_output = (
507+
["A"], # numerical_features
508+
[], # boolean_features
509+
[], # categorical_features
510+
[0.1], # numerical_weights
511+
[], # boolean_weights
512+
[], # categorical_weights
513+
)
514+
assert determine_field_types(df_db, df_query, weights) == expected_output
535515

536516

537517
def test_all_boolean_columns():
538-
df = pd.DataFrame({"A": ["true", "false", "true"], "B": ["false", "true", "false"]})
518+
df_db = pd.DataFrame(
519+
{"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
520+
)
521+
df_query = pd.DataFrame(
522+
{"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
523+
)
539524
weights = [0.3, 0.4]
540525
expected_output = (
541526
[], # numerical_features
@@ -545,13 +530,16 @@ def test_all_boolean_columns():
545530
[0.3, 0.4], # boolean_weights
546531
[], # categorical_weights
547532
)
548-
assert determine_field_types(df, weights) == expected_output
533+
assert determine_field_types(df_db, df_query, weights) == expected_output
549534

550535

551536
def test_all_categorical_columns():
552-
df = pd.DataFrame(
537+
df_db = pd.DataFrame(
553538
{"A": ["apple", "banana", "cherry"], "B": ["dog", "cat", "mouse"]}
554539
)
540+
df_query = pd.DataFrame(
541+
{"A": ["banana", "apple", "cherry"], "B": ["cat", "dog", "mouse"]}
542+
)
555543
weights = [0.5, 0.6]
556544
expected_output = (
557545
[], # numerical_features
@@ -561,17 +549,24 @@ def test_all_categorical_columns():
561549
[], # boolean_weights
562550
[0.5, 0.6], # categorical_weights
563551
)
564-
assert determine_field_types(df, weights) == expected_output
552+
assert determine_field_types(df_db, df_query, weights) == expected_output
565553

566554

567555
def test_mixed_types():
568-
df = pd.DataFrame(
556+
df_db = pd.DataFrame(
569557
{
570558
"A": [1, 2, 3],
571559
"B": ["true", "false", "true"],
572560
"C": ["apple", "banana", "cherry"],
573561
}
574562
)
563+
df_query = pd.DataFrame(
564+
{
565+
"A": [1, 3, ""],
566+
"B": ["true", "true", "true"],
567+
"C": ["apple", "", 3],
568+
}
569+
)
575570
weights = [0.7, 0.8, 0.9]
576571
expected_output = (
577572
["A"], # numerical_features
@@ -581,7 +576,7 @@ def test_mixed_types():
581576
[0.8], # boolean_weights
582577
[0.9], # categorical_weights
583578
)
584-
assert determine_field_types(df, weights) == expected_output
579+
assert determine_field_types(df_db, df_query, weights) == expected_output
585580

586581

587582
def test_vectorize_records_mixed_numerical_boolean_categorical():

0 commit comments

Comments
 (0)