Fix numerical feature categorization bug (determine field types from both database and query records)

mjawadtp · mjawadtp · commit 44e80b256d6d · 2024-11-28T16:03:03.000+05:30
diff --git a/cumulusci/tasks/bulkdata/select_utils.py b/cumulusci/tasks/bulkdata/select_utils.py
@@ -352,9 +352,6 @@ def annoy_post_process(
         insertion_candidates = load_shaped_records
         return selected_records, insertion_candidates
 
-    query_records = replace_empty_strings_with_missing(query_records)
-    select_shaped_records = replace_empty_strings_with_missing(select_shaped_records)
-
     hash_features = 100
     num_trees = 10
 
@@ -589,7 +586,7 @@ def add_limit_offset_to_user_filter(
     return f" {filter_clause}"
 
 
-def determine_field_types(df, weights):
+def determine_field_types(df_db, df_query, weights):
     numerical_features = []
     boolean_features = []
     categorical_features = []
@@ -598,23 +595,35 @@ def determine_field_types(df, weights):
     boolean_weights = []
     categorical_weights = []
 
-    for col, weight in zip(df.columns, weights):
+    for col, weight in zip(df_db.columns, weights):
         # Check if the column can be converted to numeric
         try:
-            # Attempt to convert to numeric
-            df[col] = pd.to_numeric(df[col], errors="raise")
+            temp_df_db = pd.to_numeric(df_db[col], errors="raise")
+            temp_df_query = pd.to_numeric(df_query[col], errors="raise")
+            # Replace empty values with 0 for numerical features
+            df_db[col] = temp_df_db.fillna(0).replace("", 0)
+            df_query[col] = temp_df_query.fillna(0).replace("", 0)
             numerical_features.append(col)
             numerical_weights.append(weight)
         except ValueError:
             # Check for boolean values
-            if df[col].str.lower().isin(["true", "false"]).all():
+            if (
+                df_db[col].str.lower().isin(["true", "false"]).all()
+                and df_query[col].str.lower().isin(["true", "false"]).all()
+            ):
                 # Map to actual boolean values
-                df[col] = df[col].str.lower().map({"true": True, "false": False})
+                df_db[col] = df_db[col].str.lower().map({"true": True, "false": False})
+                df_query[col] = (
+                    df_query[col].str.lower().map({"true": True, "false": False})
+                )
                 boolean_features.append(col)
                 boolean_weights.append(weight)
             else:
                 categorical_features.append(col)
                 categorical_weights.append(weight)
+                # Replace empty values with 'missing' for categorical features
+                df_db[col] = df_db[col].replace("", "missing")
+                df_query[col] = df_query[col].replace("", "missing")
 
     return (
         numerical_features,
@@ -640,14 +649,7 @@ def vectorize_records(db_records, query_records, hash_features, weights):
         numerical_weights,
         boolean_weights,
         categorical_weights,
-    ) = determine_field_types(df_db, weights)
-
-    # Modify query dataframe boolean columns to True or False
-    for col in df_query.columns:
-        if df_query[col].str.lower().isin(["true", "false"]).all():
-            df_query[col] = (
-                df_query[col].str.lower().map({"true": True, "false": False})
-            )
+    ) = determine_field_types(df_db, df_query, weights)
 
     # Fit StandardScaler on the numerical features of the database records
     scaler = StandardScaler()
@@ -705,13 +707,6 @@ def vectorize_records(db_records, query_records, hash_features, weights):
     return final_db_vectors, final_query_vectors
 
 
-def replace_empty_strings_with_missing(records):
-    return [
-        [(field if field != "" else "missing") for field in record]
-        for record in records
-    ]
-
-
 def split_and_filter_fields(fields: T.List[str]) -> T.Tuple[T.List[str], T.List[str]]:
     # List to store non-lookup fields (load fields)
     load_fields = []
diff --git a/cumulusci/tasks/bulkdata/step.py b/cumulusci/tasks/bulkdata/step.py
@@ -478,9 +478,11 @@ def select_records(self, records):
         )
 
         # Execute the main select query using Bulk API
+        self.logger.info("Retrieving records from org...")
         select_query_records = self._execute_select_query(
             select_query=select_query, query_fields=query_fields
         )
+        self.logger.info(f"Retrieved {len(select_query_records)} from org")
 
         query_records.extend(select_query_records)
         # Post-process the query results
@@ -895,7 +897,9 @@ def select_records(self, records):
         )
 
         # Execute the query and gather the records
+        self.logger.info("Retrieving records from org...")
         query_records = self._execute_soql_query(select_query, query_fields)
+        self.logger.info(f"Retrieved {len(query_records)} from org")
 
         # Post-process the query results for this batch
         (
diff --git a/cumulusci/tasks/bulkdata/tests/test_select_utils.py b/cumulusci/tasks/bulkdata/tests/test_select_utils.py
@@ -11,7 +11,6 @@
     find_closest_record,
     levenshtein_distance,
     reorder_records,
-    replace_empty_strings_with_missing,
     split_and_filter_fields,
     vectorize_records,
 )
@@ -485,43 +484,9 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match():
     assert "Records must be same size as fields (weights)." in str(e.value)
 
 
-def test_replace_empty_strings_with_missing():
-    # Case 1: Normal case with some empty strings
-    records = [
-        ["Alice", "", "New York"],
-        ["Bob", "Engineer", ""],
-        ["", "Teacher", "Chicago"],
-    ]
-    expected = [
-        ["Alice", "missing", "New York"],
-        ["Bob", "Engineer", "missing"],
-        ["missing", "Teacher", "Chicago"],
-    ]
-    assert replace_empty_strings_with_missing(records) == expected
-
-    # Case 2: No empty strings, so the output should be the same as input
-    records = [["Alice", "Manager", "New York"], ["Bob", "Engineer", "San Francisco"]]
-    expected = [["Alice", "Manager", "New York"], ["Bob", "Engineer", "San Francisco"]]
-    assert replace_empty_strings_with_missing(records) == expected
-
-    # Case 3: List with all empty strings
-    records = [["", "", ""], ["", "", ""]]
-    expected = [["missing", "missing", "missing"], ["missing", "missing", "missing"]]
-    assert replace_empty_strings_with_missing(records) == expected
-
-    # Case 4: Empty list (should return an empty list)
-    records = []
-    expected = []
-    assert replace_empty_strings_with_missing(records) == expected
-
-    # Case 5: List with some empty sublists
-    records = [[], ["Alice", ""], []]
-    expected = [[], ["Alice", "missing"], []]
-    assert replace_empty_strings_with_missing(records) == expected
-
-
 def test_all_numeric_columns():
-    df = pd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]})
+    df_db = pd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]})
+    df_query = pd.DataFrame({"A": [4, 5, ""], "B": [4.5, 5.5, 6.5]})
     weights = [0.1, 0.2]
     expected_output = (
         ["A", "B"],  # numerical_features
@@ -531,11 +496,31 @@ def test_all_numeric_columns():
         [],  # boolean_weights
         [],  # categorical_weights
     )
-    assert determine_field_types(df, weights) == expected_output
+    assert determine_field_types(df_db, df_query, weights) == expected_output
+
+
+def test_numeric_columns__one_non_numeric():
+    df_db = pd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]})
+    df_query = pd.DataFrame({"A": [4, 5, 6], "B": ["abcd", 5.5, 6.5]})
+    weights = [0.1, 0.2]
+    expected_output = (
+        ["A"],  # numerical_features
+        [],  # boolean_features
+        [],  # categorical_features
+        [0.1],  # numerical_weights
+        [],  # boolean_weights
+        [],  # categorical_weights
+    )
+    assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
 def test_all_boolean_columns():
-    df = pd.DataFrame({"A": ["true", "false", "true"], "B": ["false", "true", "false"]})
+    df_db = pd.DataFrame(
+        {"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
+    )
+    df_query = pd.DataFrame(
+        {"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
+    )
     weights = [0.3, 0.4]
     expected_output = (
         [],  # numerical_features
@@ -545,13 +530,16 @@ def test_all_boolean_columns():
         [0.3, 0.4],  # boolean_weights
         [],  # categorical_weights
     )
-    assert determine_field_types(df, weights) == expected_output
+    assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
 def test_all_categorical_columns():
-    df = pd.DataFrame(
+    df_db = pd.DataFrame(
         {"A": ["apple", "banana", "cherry"], "B": ["dog", "cat", "mouse"]}
     )
+    df_query = pd.DataFrame(
+        {"A": ["banana", "apple", "cherry"], "B": ["cat", "dog", "mouse"]}
+    )
     weights = [0.5, 0.6]
     expected_output = (
         [],  # numerical_features
@@ -561,17 +549,24 @@ def test_all_categorical_columns():
         [],  # boolean_weights
         [0.5, 0.6],  # categorical_weights
     )
-    assert determine_field_types(df, weights) == expected_output
+    assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
 def test_mixed_types():
-    df = pd.DataFrame(
+    df_db = pd.DataFrame(
         {
             "A": [1, 2, 3],
             "B": ["true", "false", "true"],
             "C": ["apple", "banana", "cherry"],
         }
     )
+    df_query = pd.DataFrame(
+        {
+            "A": [1, 3, ""],
+            "B": ["true", "true", "true"],
+            "C": ["apple", "", 3],
+        }
+    )
     weights = [0.7, 0.8, 0.9]
     expected_output = (
         ["A"],  # numerical_features
@@ -581,7 +576,7 @@ def test_mixed_types():
         [0.8],  # boolean_weights
         [0.9],  # categorical_weights
     )
-    assert determine_field_types(df, weights) == expected_output
+    assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
 def test_vectorize_records_mixed_numerical_boolean_categorical():

Original file line number	Diff line number	Diff line change
`@@ -478,9 +478,11 @@ def select_records(self, records):`
`478`	`478`	`)`
`479`	`479`
`480`	`480`	`# Execute the main select query using Bulk API`
	`481`	`+ self.logger.info("Retrieving records from org...")`
`481`	`482`	`select_query_records = self._execute_select_query(`
`482`	`483`	`select_query=select_query, query_fields=query_fields`
`483`	`484`	`)`
	`485`	`+ self.logger.info(f"Retrieved {len(select_query_records)} from org")`
`484`	`486`
`485`	`487`	`query_records.extend(select_query_records)`
`486`	`488`	`# Post-process the query results`
`@@ -895,7 +897,9 @@ def select_records(self, records):`
`895`	`897`	`)`
`896`	`898`
`897`	`899`	`# Execute the query and gather the records`
	`900`	`+ self.logger.info("Retrieving records from org...")`
`898`	`901`	`query_records = self._execute_soql_query(select_query, query_fields)`
	`902`	`+ self.logger.info(f"Retrieved {len(query_records)} from org")`
`899`	`903`
`900`	`904`	`# Post-process the query results for this batch`
`901`	`905`	`(`