SFDO-Tooling
diff --git a/.github/workflows/feature_test.yml
Lines changed: 24 additions & 0 deletions b/.github/workflows/feature_test.yml
Lines changed: 24 additions & 0 deletions
diff --git a/.readthedocs.yml
Lines changed: 3 additions & 1 deletion b/.readthedocs.yml
Lines changed: 3 additions & 1 deletion
diff --git a/AUTHORS.rst
Lines changed: 1 addition & 0 deletions b/AUTHORS.rst
Lines changed: 1 addition & 0 deletions
diff --git a/cumulusci/__about__.py
Lines changed: 1 addition & 1 deletion b/cumulusci/__about__.py
Lines changed: 1 addition & 1 deletion
diff --git a/cumulusci/tasks/bulkdata/mapping_parser.py
Lines changed: 4 additions & 1 deletion b/cumulusci/tasks/bulkdata/mapping_parser.py
Lines changed: 4 additions & 1 deletion
diff --git a/cumulusci/tasks/bulkdata/select_utils.py
Lines changed: 30 additions & 20 deletions b/cumulusci/tasks/bulkdata/select_utils.py
Lines changed: 30 additions & 20 deletions
diff --git a/cumulusci/tasks/bulkdata/snowfakery.py
Lines changed: 4 additions & 2 deletions b/cumulusci/tasks/bulkdata/snowfakery.py
Lines changed: 4 additions & 2 deletions
diff --git a/cumulusci/tasks/bulkdata/step.py
Lines changed: 2 additions & 3 deletions b/cumulusci/tasks/bulkdata/step.py
Lines changed: 2 additions & 3 deletions
diff --git a/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_random_strategy.yaml
Lines changed: 2 additions & 2 deletions b/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_random_strategy.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_select_and_insert_strategy.yaml
Lines changed: 1 addition & 1 deletion b/cumulusci/tasks/bulkdata/tests/cassettes/TestSelect.test_select_similarity_select_and_insert_strategy.yaml
Lines changed: 1 addition & 1 deletion
@@ -63,6 +63,30 @@ jobs:
             - name: Run Pytest
               run: uv run pytest --cov-report= --cov=cumulusci
 
+    unit_tests_opt_deps:  # pytest run with every optional extra installed (see `uv sync --all-extras` below)
+        name: "Unit tests with optional dependencies: ${{ matrix.os }}-${{ matrix.python-version }}"
+        runs-on: ${{ matrix.os }}
+        strategy:
+            fail-fast: false  # keep the remaining matrix cells running when one cell fails
+            matrix:
+                os: [macos-latest, SFDO-Tooling-Ubuntu, SFDO-Tooling-Windows]
+                python-version: ["3.11", "3.12", "3.13"]  # quoted so each version stays a string
+        steps:
+            - uses: actions/checkout@v4
+            - name: Set up Python
+              uses: actions/setup-python@v4
+              with:
+                  python-version: "${{ matrix.python-version }}"
+            - name: Set up uv
+              uses: SFDO-Tooling/setup-uv@main
+              with:
+                  version: "0.5.0"
+                  enable-cache: true
+            - name: Install dependencies  # --all-extras is what distinguishes this job: optional extras are installed too
+              run: uv sync --all-extras -p ${{ matrix.python-version }}
+            - name: Run Pytest
+              run: uv run pytest --cov-report= --cov=cumulusci
+
     robot_api:
         name: "Robot: No browser"
         runs-on: SFDO-Tooling-Ubuntu
 
@@ -14,7 +14,9 @@ build:
         - asdf plugin add uv
         - asdf install uv latest
         - asdf global uv latest
-        - uv sync --only-group docs --frozen
+        - uv sync --group docs --frozen
+        - uv run cci task doc --write
+        - uv run cci flow doc > docs/flows.rst
         - uv run -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs $READTHEDOCS_OUTPUT/html
 
 # Build documentation in the docs/ directory with Sphinx
 
@@ -38,3 +38,4 @@ For example:
 * Gustavo Tandeciarz (dcinzona)
 * Chandler Anderson (zenibako)
 * Ben French (BenjaminFrench)
+* Rupert Barrow (rupertbarrow)
@@ -1 +1 @@
-__version__ = "4.0.1"
+__version__ = "4.2.0"
@@ -338,7 +338,10 @@ def _get_required_permission_types(
         self, operation: DataOperationType
     ) -> T.Tuple[str]:
         """Return a tuple of the permission types required to execute an operation"""
-        if operation is DataOperationType.QUERY:
+        if (
+            operation is DataOperationType.QUERY
+            or self.action is DataOperationType.SELECT
+        ):
             return ("queryable",)
         if (
             operation is DataOperationType.INSERT
 
@@ -1,22 +1,34 @@
+import logging
 import random
 import re
 import typing as T
 from enum import Enum
 
-import numpy as np
-import pandas as pd
-from annoy import AnnoyIndex
 from pydantic import Field, root_validator, validator
-from sklearn.feature_extraction.text import HashingVectorizer
-from sklearn.preprocessing import StandardScaler
 
 from cumulusci.core.enums import StrEnum
-from cumulusci.tasks.bulkdata.extract_dataset_utils.hardcoded_default_declarations import (
-    DEFAULT_DECLARATIONS,
-)
 from cumulusci.tasks.bulkdata.utils import CaseInsensitiveDict
+from cumulusci.utils import get_cci_upgrade_command
 from cumulusci.utils.yaml.model_parser import CCIDictModel
 
+logger = logging.getLogger(__name__)
+try:
+    import numpy as np
+    import pandas as pd
+    from annoy import AnnoyIndex
+    from sklearn.feature_extraction.text import HashingVectorizer
+    from sklearn.preprocessing import StandardScaler
+
+    OPTIONAL_DEPENDENCIES_AVAILABLE = True
+except ImportError:
+    logger.warning(
+        f"Optional dependencies are missing. "
+        "Handling high volumes of records for the 'select' functionality will be significantly slower, "
+        "as optimizations for this feature are currently disabled. "
+        f"To enable optimized performance, install all required dependencies using: {get_cci_upgrade_command()}[select]\n"
+    )
+    OPTIONAL_DEPENDENCIES_AVAILABLE = False
+
 
 class SelectStrategy(StrEnum):
     """Enum defining the different selection strategies requested."""
@@ -173,10 +185,6 @@ def standard_generate_query(
             filter_clause=user_filter, limit_clause=limit, offset_clause=offset
         )
     else:
-        # Get the WHERE clause from DEFAULT_DECLARATIONS if available
-        declaration = DEFAULT_DECLARATIONS.get(sobject)
-        if declaration:
-            query += f" WHERE {declaration.where}"
         query += f" LIMIT {limit}" if limit else ""
         query += f" OFFSET {offset}" if offset else ""
     return query, ["Id"]
@@ -266,10 +274,6 @@ def similarity_generate_query(
             filter_clause=user_filter, limit_clause=limit, offset_clause=offset
         )
     else:
-        # Get the WHERE clause from DEFAULT_DECLARATIONS if available
-        declaration = DEFAULT_DECLARATIONS.get(sobject)
-        if declaration:
-            query += f" WHERE {declaration.where}"
         query += f" LIMIT {limit}" if limit else ""
         query += f" OFFSET {offset}" if offset else ""
 
@@ -292,7 +296,7 @@ def similarity_post_process(
 ]:
     """Processes the query results for the similarity selection strategy"""
     # Handle case where query returns 0 records
-    if not query_records and not threshold:
+    if not query_records and threshold is None:
         error_message = f"No records found for {sobject} in the target org."
         return [], [], error_message
 
@@ -308,7 +312,7 @@ def similarity_post_process(
     select_records = []
     insert_records = []
 
-    if complexity_constant < 1000:
+    if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
         select_records, insert_records = levenshtein_post_process(
             load_records, query_records, fields, weights, threshold
         )
@@ -328,6 +332,12 @@ def annoy_post_process(
     threshold: T.Union[float, None],
 ) -> T.Tuple[T.List[dict], list]:
     """Processes the query results for the similarity selection strategy using Annoy algorithm for large number of records"""
+    # Add warning when threshold is 0
+    if threshold is not None and threshold == 0:
+        logger.warning(
+            "Warning: A threshold of 0 may miss exact matches in high volumes. Use a small value like 0.1 for better accuracy."
+        )
+
     selected_records = []
     insertion_candidates = []
 
@@ -397,7 +407,7 @@ def annoy_post_process(
             # Retrieve the corresponding record from the database
             record = query_record_data[neighbor_index]
             closest_record_id = record_to_id_map[tuple(record)]
-            if threshold and (neighbor_distances[idx] >= threshold):
+            if threshold is not None and (neighbor_distances[idx] >= threshold):
                 selected_records.append(None)
                 insertion_candidates.append(load_shaped_records[i])
             else:
@@ -445,7 +455,7 @@ def levenshtein_post_process(
             select_record, target_records, similarity_weights
         )
 
-        if distance_threshold and match_distance > distance_threshold:
+        if distance_threshold is not None and match_distance > distance_threshold:
             # Append load record for insertion if distance exceeds threshold
             insertion_candidates.append(load_record)
             selected_records.append(None)
 
@@ -583,8 +583,10 @@ def _generate_and_load_initial_batch(self, working_directory: Path):
             self.sets_finished_while_generating_template = num_records
 
         new_template_dir = data_loader_new_directory_name(template_dir, self.run_until)
-        shutil.move(template_dir, new_template_dir)
-        template_dir = new_template_dir
+        # rename only if new_template_dir does not match template_dir
+        if template_dir.resolve() != new_template_dir.resolve():
+            shutil.move(template_dir, new_template_dir)
+            template_dir = new_template_dir
 
         # don't send data tables to child processes. All they
         # care about are ID->OID mappings
 
@@ -9,6 +9,7 @@
 from contextlib import contextmanager
 from itertools import tee
 from typing import Any, Dict, List, NamedTuple, Optional, Union
+from urllib.parse import quote
 
 import requests
 import salesforce_bulk
@@ -955,9 +956,7 @@ def _determine_limit_clause(self, total_num_records):
     def _execute_soql_query(self, select_query, query_fields):
         """Executes the SOQL query and returns the flattened records."""
         query_records = []
-        response = self.sf.restful(
-            requests.utils.requote_uri(f"query/?q={select_query}"), method="GET"
-        )
+        response = self.sf.restful(f"query/?q={quote(select_query)}", method="GET")
         query_records.extend(self._flatten_response_records(response, query_fields))
 
         while not response["done"]:
 
@@ -48,7 +48,7 @@ interactions:
 
     - request:
           method: GET
-          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id,%20Name,%20Description,%20Phone,%20AccountNumber%20FROM%20Account%20WHERE%20Name%20!=%20'Sample%20Account%20for%20Entitlements'
+          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id,%20Name,%20Description,%20Phone,%20AccountNumber%20FROM%20Account
           body: null
           headers: *id004
       response:
@@ -125,7 +125,7 @@ interactions:
 
     - request:
           method: GET
-          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id%20FROM%20Account%20WHERE%20Name%20!=%20'Sample%20Account%20for%20Entitlements'%20LIMIT%205
+          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id%20FROM%20Account%20LIMIT%205
           body: null
           headers: *id004
       response:
 
@@ -225,7 +225,7 @@ interactions:
 
     - request:
           method: GET
-          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id,%20TYPEOF%20Who%20WHEN%20Contact%20THEN%20LastName,%20Email%20WHEN%20Lead%20THEN%20LastName,%20Company%20ELSE%20Id%20END,%20TYPEOF%20What%20WHEN%20Account%20THEN%20Name,%20Description,%20Phone,%20AccountNumber%20ELSE%20Id%20END,%20Subject,%20DurationInMinutes,%20ActivityDateTime%20FROM%20Event
+          uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id%2C%20TYPEOF%20Who%20WHEN%20Contact%20THEN%20LastName%2C%20Email%20WHEN%20Lead%20THEN%20LastName%2C%20Company%20ELSE%20Id%20END%2C%20TYPEOF%20What%20WHEN%20Account%20THEN%20Name%2C%20Description%2C%20Phone%2C%20AccountNumber%20ELSE%20Id%20END%2C%20Subject%2C%20DurationInMinutes%2C%20ActivityDateTime%20FROM%20Event
           body: null
           headers: *id004
       response:
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "4.0.1"`
	`1`	`+__version__ = "4.2.0"`