Skip to content

Commit 89a5b5d

Browse files
@W-17427085: Set ANNOY related dependencies to be optional (#3858)
Changes: - Removed `"annoy", "numpy", "pandas", "scikit-learn"` from the dependencies in `pyproject.toml` and added them as optional dependencies - Created the flag `OPTIONAL_DEPENDENCIES_AVAILABLE` in `select_utils.py` to indicate whether the ANNOY-related dependencies are present. If these optional dependencies are not available, Levenshtein-distance-based selection will still apply even for high volumes of records (i.e. `complexity_constant >= 1000`). - Skipped the pytests in `test_select_utils.py` that depend on `pandas` and the ANNOY-related optional dependencies - Added a warning message about non-zero similarity scores when using ANNOY (for high volumes of records), and updated the docs accordingly - Added an additional workflow to run all unit tests with all optional dependencies installed
1 parent 534210c commit 89a5b5d

5 files changed

Lines changed: 111 additions & 11 deletions

File tree

.github/workflows/feature_test.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,30 @@ jobs:
6363
- name: Run Pytest
6464
run: uv run pytest --cov-report= --cov=cumulusci
6565

66+
unit_tests_opt_deps:
67+
name: "Unit tests with optional dependencies: ${{ matrix.os }}-${{ matrix.python-version }}"
68+
runs-on: ${{ matrix.os }}
69+
strategy:
70+
fail-fast: false
71+
matrix:
72+
os: [macos-latest, SFDO-Tooling-Ubuntu, SFDO-Tooling-Windows]
73+
python-version: ["3.11", "3.12", "3.13"]
74+
steps:
75+
- uses: actions/checkout@v4
76+
- name: Set up Python
77+
uses: actions/setup-python@v4
78+
with:
79+
python-version: "${{ matrix.python-version }}"
80+
- name: Set up uv
81+
uses: SFDO-Tooling/setup-uv@main
82+
with:
83+
version: "0.5.0"
84+
enable-cache: true
85+
- name: Install dependencies
86+
run: uv sync --all-extras -p ${{ matrix.python-version }}
87+
- name: Run Pytest
88+
run: uv run pytest --cov-report= --cov=cumulusci
89+
6690
robot_api:
6791
name: "Robot: No browser"
6892
runs-on: SFDO-Tooling-Ubuntu

cumulusci/tasks/bulkdata/select_utils.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,37 @@
1+
import logging
12
import random
23
import re
34
import typing as T
45
from enum import Enum
56

6-
import numpy as np
7-
import pandas as pd
8-
from annoy import AnnoyIndex
97
from pydantic import Field, root_validator, validator
10-
from sklearn.feature_extraction.text import HashingVectorizer
11-
from sklearn.preprocessing import StandardScaler
128

139
from cumulusci.core.enums import StrEnum
1410
from cumulusci.tasks.bulkdata.extract_dataset_utils.hardcoded_default_declarations import (
1511
DEFAULT_DECLARATIONS,
1612
)
1713
from cumulusci.tasks.bulkdata.utils import CaseInsensitiveDict
14+
from cumulusci.utils import get_cci_upgrade_command
1815
from cumulusci.utils.yaml.model_parser import CCIDictModel
1916

17+
logger = logging.getLogger(__name__)
18+
try:
19+
import numpy as np
20+
import pandas as pd
21+
from annoy import AnnoyIndex
22+
from sklearn.feature_extraction.text import HashingVectorizer
23+
from sklearn.preprocessing import StandardScaler
24+
25+
OPTIONAL_DEPENDENCIES_AVAILABLE = True
26+
except ImportError:
27+
logger.warning(
28+
f"Optional dependencies are missing. "
29+
"Handling high volumes of records for the 'select' functionality will be significantly slower, "
30+
"as optimizations for this feature are currently disabled. "
31+
f"To enable optimized performance, install all required dependencies using: {get_cci_upgrade_command()}[select]\n"
32+
)
33+
OPTIONAL_DEPENDENCIES_AVAILABLE = False
34+
2035

2136
class SelectStrategy(StrEnum):
2237
"""Enum defining the different selection strategies requested."""
@@ -308,7 +323,7 @@ def similarity_post_process(
308323
select_records = []
309324
insert_records = []
310325

311-
if complexity_constant < 1000:
326+
if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
312327
select_records, insert_records = levenshtein_post_process(
313328
load_records, query_records, fields, weights, threshold
314329
)
@@ -328,6 +343,12 @@ def annoy_post_process(
328343
threshold: T.Union[float, None],
329344
) -> T.Tuple[T.List[dict], list]:
330345
"""Processes the query results for the similarity selection strategy using Annoy algorithm for large number of records"""
346+
# Add warning when threshold is 0
347+
if threshold is not None and threshold == 0:
348+
logger.warning(
349+
"Warning: A threshold of 0 may miss exact matches in high volumes. Use a small value like 0.1 for better accuracy."
350+
)
351+
331352
selected_records = []
332353
insertion_candidates = []
333354

cumulusci/tasks/bulkdata/tests/test_select_utils.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
import pandas as pd
21
import pytest
32

43
from cumulusci.tasks.bulkdata.select_utils import (
4+
OPTIONAL_DEPENDENCIES_AVAILABLE,
55
SelectOperationExecutor,
66
SelectStrategy,
77
add_limit_offset_to_user_filter,
@@ -15,6 +15,14 @@
1515
vectorize_records,
1616
)
1717

18+
# Check for pandas availability
19+
try:
20+
import pandas as pd
21+
22+
PANDAS_AVAILABLE = True
23+
except ImportError:
24+
PANDAS_AVAILABLE = False
25+
1826

1927
# Test Cases for standard_generate_query
2028
def test_standard_generate_query_with_default_record_declaration():
@@ -511,6 +519,10 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match():
511519
assert "Records must be same size as fields (weights)." in str(e.value)
512520

513521

522+
@pytest.mark.skipif(
523+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
524+
reason="requires optional dependencies for annoy",
525+
)
514526
def test_all_numeric_columns():
515527
df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", " 5.5", "6.5"]})
516528
df_query = pd.DataFrame({"A": ["4", "5", ""], "B": ["4.5", "5.5", "6.5"]})
@@ -526,6 +538,10 @@ def test_all_numeric_columns():
526538
assert determine_field_types(df_db, df_query, weights) == expected_output
527539

528540

541+
@pytest.mark.skipif(
542+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
543+
reason="requires optional dependencies for annoy",
544+
)
529545
def test_numeric_columns__one_non_numeric():
530546
df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", "5.5", "6.5"]})
531547
df_query = pd.DataFrame({"A": ["4", "5", "6"], "B": ["abcd", "5.5", "6.5"]})
@@ -541,6 +557,10 @@ def test_numeric_columns__one_non_numeric():
541557
assert determine_field_types(df_db, df_query, weights) == expected_output
542558

543559

560+
@pytest.mark.skipif(
561+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
562+
reason="requires optional dependencies for annoy",
563+
)
544564
def test_all_boolean_columns():
545565
df_db = pd.DataFrame(
546566
{"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
@@ -560,6 +580,10 @@ def test_all_boolean_columns():
560580
assert determine_field_types(df_db, df_query, weights) == expected_output
561581

562582

583+
@pytest.mark.skipif(
584+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
585+
reason="requires optional dependencies for annoy",
586+
)
563587
def test_all_categorical_columns():
564588
df_db = pd.DataFrame(
565589
{"A": ["apple", "banana", "cherry"], "B": ["dog", "cat", "mouse"]}
@@ -579,6 +603,10 @@ def test_all_categorical_columns():
579603
assert determine_field_types(df_db, df_query, weights) == expected_output
580604

581605

606+
@pytest.mark.skipif(
607+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
608+
reason="requires optional dependencies for annoy",
609+
)
582610
def test_mixed_types():
583611
df_db = pd.DataFrame(
584612
{
@@ -606,6 +634,10 @@ def test_mixed_types():
606634
assert determine_field_types(df_db, df_query, weights) == expected_output
607635

608636

637+
@pytest.mark.skipif(
638+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
639+
reason="requires optional dependencies for annoy",
640+
)
609641
def test_vectorize_records_mixed_numerical_boolean_categorical():
610642
# Test data with mixed types: numerical and categorical only
611643
db_records = [["1.0", "true", "apple"], ["2.0", "false", "banana"]]
@@ -633,6 +665,10 @@ def test_vectorize_records_mixed_numerical_boolean_categorical():
633665
), "Query vectors column count mismatch"
634666

635667

668+
@pytest.mark.skipif(
669+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
670+
reason="requires optional dependencies for annoy",
671+
)
636672
def test_annoy_post_process():
637673
# Test data
638674
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
@@ -659,6 +695,10 @@ def test_annoy_post_process():
659695
assert not insert_records
660696

661697

698+
@pytest.mark.skipif(
699+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
700+
reason="requires optional dependencies for annoy",
701+
)
662702
def test_annoy_post_process__insert_records():
663703
# Test data
664704
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
@@ -714,6 +754,10 @@ def test_annoy_post_process__no_query_records():
714754
] # The first insert record should match the second load record
715755

716756

757+
@pytest.mark.skipif(
758+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
759+
reason="requires optional dependencies for annoy",
760+
)
717761
def test_annoy_post_process__insert_records_with_polymorphic_fields():
718762
# Test data
719763
load_records = [
@@ -749,6 +793,10 @@ def test_annoy_post_process__insert_records_with_polymorphic_fields():
749793
] # The first insert record should match the second load record
750794

751795

796+
@pytest.mark.skipif(
797+
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
798+
reason="requires optional dependencies for annoy",
799+
)
752800
def test_single_record_match_annoy_post_process():
753801
# Mock data where only the first query record matches the first load record
754802
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]

docs/data.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,9 @@ This parameter is **optional**; if not specified, no threshold will be applied a
352352

353353
This feature is particularly useful during version upgrades, where records that closely match can be selected, while those that do not match sufficiently can be inserted into the target org.
354354

355+
**Important Note:**
356+
For high volumes of records, an approximation algorithm is applied to improve performance. In such cases, setting a threshold of `0` may not guarantee the selection of exact matches, as the algorithm can assign a small non-zero similarity score to exact matches. To ensure accurate selection, it is recommended to set the threshold to a small value slightly greater than `0`, such as `0.1`. This ensures both precision and efficiency in the selection process.
357+
355358
---
356359

357360
#### Example

pyproject.toml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ classifiers = [
2323
"Programming Language :: Python :: 3.13",
2424
]
2525
dependencies = [
26-
"annoy",
2726
"click>=8.1",
2827
"cryptography",
2928
"python-dateutil",
@@ -35,8 +34,6 @@ dependencies = [
3534
"defusedxml",
3635
"lxml",
3736
"MarkupSafe",
38-
"numpy",
39-
"pandas",
4037
"psutil",
4138
"pydantic<2",
4239
"PyJWT",
@@ -53,7 +50,6 @@ dependencies = [
5350
"rst2ansi>=0.1.5",
5451
"salesforce-bulk",
5552
"sarge",
56-
"scikit-learn",
5753
"selenium<4",
5854
"simple-salesforce==1.11.4",
5955
"snowfakery>=4.0.0",
@@ -88,6 +84,14 @@ lint = [
8884
"pre-commit>=3.5.0",
8985
]
9086

87+
[project.optional-dependencies]
88+
select = [
89+
"annoy",
90+
"numpy",
91+
"pandas",
92+
"scikit-learn",
93+
]
94+
9195
[project.scripts]
9296
cci = "cumulusci.cli.cci:main"
9397
snowfakery = "snowfakery.cli:main"

0 commit comments

Comments
 (0)