Skip to content

Commit 89a5b5d

Browse files
@W-17427085: Set ANNOY related dependencies to be optional (#3858)
Changes: - Removed `"annoy", "numpy", "pandas", "scikit-learn"` from the dependencies in `pyproject.toml` and added them as optional dependencies - Created the flag `OPTIONAL_DEPENDENCIES_AVAILABLE` in `select_utils.py` to indicate whether the ANNOY-related dependencies are present. If these optional dependencies are not available, Levenshtein-distance-based selection will still apply even for high volumes of records (i.e. `complexity_constant >= 1000`). - Skipped the pytests in `test_select_utils.py` that depend on `pandas` and the ANNOY-related optional dependencies - Added a warning message about non-zero similarity scores when using ANNOY (for high volumes of records), and updated the docs accordingly - Added an additional workflow to run all unit tests with all optional dependencies installed
1 parent 534210c commit 89a5b5d

5 files changed

Lines changed: 111 additions & 11 deletions

File tree

.github/workflows/feature_test.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,30 @@ jobs:
6363
- name: Run Pytest
6464
run: uv run pytest --cov-report= --cov=cumulusci
6565

66+
unit_tests_opt_deps:
67+
name: "Unit tests with optional dependencies: ${{ matrix.os }}-${{ matrix.python-version }}"
68+
runs-on: ${{ matrix.os }}
69+
strategy:
70+
fail-fast: false
71+
matrix:
72+
os: [macos-latest, SFDO-Tooling-Ubuntu, SFDO-Tooling-Windows]
73+
python-version: ["3.11", "3.12", "3.13"]
74+
steps:
75+
- uses: actions/checkout@v4
76+
- name: Set up Python
77+
uses: actions/setup-python@v4
78+
with:
79+
python-version: "${{ matrix.python-version }}"
80+
- name: Set up uv
81+
uses: SFDO-Tooling/setup-uv@main
82+
with:
83+
version: "0.5.0"
84+
enable-cache: true
85+
- name: Install dependencies
86+
run: uv sync --all-extras -p ${{ matrix.python-version }}
87+
- name: Run Pytest
88+
run: uv run pytest --cov-report= --cov=cumulusci
89+
6690
robot_api:
6791
name: "Robot: No browser"
6892
runs-on: SFDO-Tooling-Ubuntu

cumulusci/tasks/bulkdata/select_utils.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,37 @@
1+
import logging
12
import random
23
import re
34
import typing as T
45
from enum import Enum
56

6-
import numpy as np
7-
import pandas as pd
8-
from annoy import AnnoyIndex
97
from pydantic import Field, root_validator, validator
10-
from sklearn.feature_extraction.text import HashingVectorizer
11-
from sklearn.preprocessing import StandardScaler
128

139
from cumulusci.core.enums import StrEnum
1410
from cumulusci.tasks.bulkdata.extract_dataset_utils.hardcoded_default_declarations import (
1511
DEFAULT_DECLARATIONS,
1612
)
1713
from cumulusci.tasks.bulkdata.utils import CaseInsensitiveDict
14+
from cumulusci.utils import get_cci_upgrade_command
1815
from cumulusci.utils.yaml.model_parser import CCIDictModel
1916

17+
logger = logging.getLogger(__name__)
18+
try:
19+
import numpy as np
20+
import pandas as pd
21+
from annoy import AnnoyIndex
22+
from sklearn.feature_extraction.text import HashingVectorizer
23+
from sklearn.preprocessing import StandardScaler
24+
25+
OPTIONAL_DEPENDENCIES_AVAILABLE = True
26+
except ImportError:
27+
logger.warning(
28+
f"Optional dependencies are missing. "
29+
"Handling high volumes of records for the 'select' functionality will be significantly slower, "
30+
"as optimizations for this feature are currently disabled. "
31+
f"To enable optimized performance, install all required dependencies using: {get_cci_upgrade_command()}[select]\n"
32+
)
33+
OPTIONAL_DEPENDENCIES_AVAILABLE = False
34+
2035

2136
class SelectStrategy(StrEnum):
2237
"""Enum defining the different selection strategies requested."""
@@ -308,7 +323,7 @@ def similarity_post_process(
308323
select_records = []
309324
insert_records = []
310325

311-
if complexity_constant < 1000:
326+
if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
312327
select_records, insert_records = levenshtein_post_process(
313328
load_records, query_records, fields, weights, threshold
314329
)
@@ -328,6 +343,12 @@ def annoy_post_process(
328343
threshold: T.Union[float, None],
329344
) -> T.Tuple[T.List[dict], list]:
330345
"""Processes the query results for the similarity selection strategy using Annoy algorithm for large number of records"""
346+
# Add warning when threshold is 0
347+
if threshold is not None and threshold == 0:
348+
logger.warning(
349+
"Warning: A threshold of 0 may miss exact matches in high volumes. Use a small value like 0.1 for better accuracy."
350+
)
351+
331352
selected_records = []
332353
insertion_candidates = []
333354

cumulusci/tasks/bulkdata/tests/test_select_utils.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
import pandas as pd
21
import pytest
32

43
from cumulusci.tasks.bulkdata.select_utils import (
4+
OPTIONAL_DEPENDENCIES_AVAILABLE,
55
SelectOperationExecutor,
66
SelectStrategy,
77
add_limit_offset_to_user_filter,
@@ -15,6 +15,14 @@
1515
vectorize_records,
1616
)
1717

18+
# Check for pandas availability
19+
try:
20+
import pandas as pd
21+
22+
PANDAS_AVAILABLE = True
23+
except ImportError:
24+
PANDAS_AVAILABLE = False
25+
1826

1927
# Test Cases for standard_generate_query
2028
def test_standard_generate_query_with_default_record_declaration():
@@ -511,6 +519,10 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match():
511519
assert "Records must be same size as fields (weights)." in str(e.value)
512520

513521

522+
@pytest.mark.skipif(
523+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
524+
reason="requires optional dependencies for annoy",
525+
)
514526
def test_all_numeric_columns():
515527
df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", " 5.5", "6.5"]})
516528
df_query = pd.DataFrame({"A": ["4", "5", ""], "B": ["4.5", "5.5", "6.5"]})
@@ -526,6 +538,10 @@ def test_all_numeric_columns():
526538
assert determine_field_types(df_db, df_query, weights) == expected_output
527539

528540

541+
@pytest.mark.skipif(
542+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
543+
reason="requires optional dependencies for annoy",
544+
)
529545
def test_numeric_columns__one_non_numeric():
530546
df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", "5.5", "6.5"]})
531547
df_query = pd.DataFrame({"A": ["4", "5", "6"], "B": ["abcd", "5.5", "6.5"]})
@@ -541,6 +557,10 @@ def test_numeric_columns__one_non_numeric():
541557
assert determine_field_types(df_db, df_query, weights) == expected_output
542558

543559

560+
@pytest.mark.skipif(
561+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
562+
reason="requires optional dependencies for annoy",
563+
)
544564
def test_all_boolean_columns():
545565
df_db = pd.DataFrame(
546566
{"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
@@ -560,6 +580,10 @@ def test_all_boolean_columns():
560580
assert determine_field_types(df_db, df_query, weights) == expected_output
561581

562582

583+
@pytest.mark.skipif(
584+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
585+
reason="requires optional dependencies for annoy",
586+
)
563587
def test_all_categorical_columns():
564588
df_db = pd.DataFrame(
565589
{"A": ["apple", "banana", "cherry"], "B": ["dog", "cat", "mouse"]}
@@ -579,6 +603,10 @@ def test_all_categorical_columns():
579603
assert determine_field_types(df_db, df_query, weights) == expected_output
580604

581605

606+
@pytest.mark.skipif(
607+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
608+
reason="requires optional dependencies for annoy",
609+
)
582610
def test_mixed_types():
583611
df_db = pd.DataFrame(
584612
{
@@ -606,6 +634,10 @@ def test_mixed_types():
606634
assert determine_field_types(df_db, df_query, weights) == expected_output
607635

608636

637+
@pytest.mark.skipif(
638+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
639+
reason="requires optional dependencies for annoy",
640+
)
609641
def test_vectorize_records_mixed_numerical_boolean_categorical():
610642
# Test data with mixed types: numerical and categorical only
611643
db_records = [["1.0", "true", "apple"], ["2.0", "false", "banana"]]
@@ -633,6 +665,10 @@ def test_vectorize_records_mixed_numerical_boolean_categorical():
633665
), "Query vectors column count mismatch"
634666

635667

668+
@pytest.mark.skipif(
669+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
670+
reason="requires optional dependencies for annoy",
671+
)
636672
def test_annoy_post_process():
637673
# Test data
638674
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
@@ -659,6 +695,10 @@ def test_annoy_post_process():
659695
assert not insert_records
660696

661697

698+
@pytest.mark.skipif(
699+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
700+
reason="requires optional dependencies for annoy",
701+
)
662702
def test_annoy_post_process__insert_records():
663703
# Test data
664704
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
@@ -714,6 +754,10 @@ def test_annoy_post_process__no_query_records():
714754
] # The first insert record should match the second load record
715755

716756

757+
@pytest.mark.skipif(
758+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
759+
reason="requires optional dependencies for annoy",
760+
)
717761
def test_annoy_post_process__insert_records_with_polymorphic_fields():
718762
# Test data
719763
load_records = [
@@ -749,6 +793,10 @@ def test_annoy_post_process__insert_records_with_polymorphic_fields():
749793
] # The first insert record should match the second load record
750794

751795

796+
@pytest.mark.skipif(
797+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
798+
reason="requires optional dependencies for annoy",
799+
)
752800
def test_single_record_match_annoy_post_process():
753801
# Mock data where only the first query record matches the first load record
754802
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]

docs/data.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,9 @@ This parameter is **optional**; if not specified, no threshold will be applied a
352352

353353
This feature is particularly useful during version upgrades, where records that closely match can be selected, while those that do not match sufficiently can be inserted into the target org.
354354

355+
**Important Note:**
356+
For high volumes of records, an approximation algorithm is applied to improve performance. In such cases, setting a threshold of `0` may not guarantee the selection of exact matches, as the algorithm can assign a small non-zero similarity score to exact matches. To ensure accurate selection, it is recommended to set the threshold to a small value slightly greater than `0`, such as `0.1`. This ensures both precision and efficiency in the selection process.
357+
355358
---
356359

357360
#### Example

pyproject.toml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ classifiers = [
2323
"Programming Language :: Python :: 3.13",
2424
]
2525
dependencies = [
26-
"annoy",
2726
"click>=8.1",
2827
"cryptography",
2928
"python-dateutil",
@@ -35,8 +34,6 @@ dependencies = [
3534
"defusedxml",
3635
"lxml",
3736
"MarkupSafe",
38-
"numpy",
39-
"pandas",
4037
"psutil",
4138
"pydantic<2",
4239
"PyJWT",
@@ -53,7 +50,6 @@ dependencies = [
5350
"rst2ansi>=0.1.5",
5451
"salesforce-bulk",
5552
"sarge",
56-
"scikit-learn",
5753
"selenium<4",
5854
"simple-salesforce==1.11.4",
5955
"snowfakery>=4.0.0",
@@ -88,6 +84,14 @@ lint = [
8884
"pre-commit>=3.5.0",
8985
]
9086

87+
[project.optional-dependencies]
88+
select = [
89+
"annoy",
90+
"numpy",
91+
"pandas",
92+
"scikit-learn",
93+
]
94+
9195
[project.scripts]
9296
cci = "cumulusci.cli.cci:main"
9397
snowfakery = "snowfakery.cli:main"

0 commit comments

Comments
 (0)