1+ import logging
12import random
23import re
34import typing as T
45from enum import Enum
56
6- import numpy as np
7- import pandas as pd
8- from annoy import AnnoyIndex
97from pydantic import Field , root_validator , validator
10- from sklearn .feature_extraction .text import HashingVectorizer
11- from sklearn .preprocessing import StandardScaler
128
139from cumulusci .core .enums import StrEnum
14- from cumulusci .tasks .bulkdata .extract_dataset_utils .hardcoded_default_declarations import (
15- DEFAULT_DECLARATIONS ,
16- )
1710from cumulusci .tasks .bulkdata .utils import CaseInsensitiveDict
11+ from cumulusci .utils import get_cci_upgrade_command
1812from cumulusci .utils .yaml .model_parser import CCIDictModel
1913
logger = logging.getLogger(__name__)

# The packages below are optional extras. OPTIONAL_DEPENDENCIES_AVAILABLE
# records whether all of them could be imported; similarity_post_process
# checks the flag and uses levenshtein_post_process when it is False.
try:
    import numpy as np
    import pandas as pd
    from annoy import AnnoyIndex
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.preprocessing import StandardScaler
except ImportError:
    OPTIONAL_DEPENDENCIES_AVAILABLE = False
    # Warn once, at import time, and tell the user how to install the extras.
    # The command is passed as a lazy %-argument rather than baked into an
    # f-string, which is the logging module's recommended calling style.
    logger.warning(
        "Optional dependencies are missing. "
        "Handling high volumes of records for the 'select' functionality will be significantly slower, "
        "as optimizations for this feature are currently disabled. "
        "To enable optimized performance, install all required dependencies using: %s[select]\n",
        get_cci_upgrade_command(),
    )
else:
    # Only reached when every optional import above succeeded.
    OPTIONAL_DEPENDENCIES_AVAILABLE = True

31+
2032
2133class SelectStrategy (StrEnum ):
2234 """Enum defining the different selection strategies requested."""
@@ -173,10 +185,6 @@ def standard_generate_query(
173185 filter_clause = user_filter , limit_clause = limit , offset_clause = offset
174186 )
175187 else :
176- # Get the WHERE clause from DEFAULT_DECLARATIONS if available
177- declaration = DEFAULT_DECLARATIONS .get (sobject )
178- if declaration :
179- query += f" WHERE { declaration .where } "
180188 query += f" LIMIT { limit } " if limit else ""
181189 query += f" OFFSET { offset } " if offset else ""
182190 return query , ["Id" ]
@@ -266,10 +274,6 @@ def similarity_generate_query(
266274 filter_clause = user_filter , limit_clause = limit , offset_clause = offset
267275 )
268276 else :
269- # Get the WHERE clause from DEFAULT_DECLARATIONS if available
270- declaration = DEFAULT_DECLARATIONS .get (sobject )
271- if declaration :
272- query += f" WHERE { declaration .where } "
273277 query += f" LIMIT { limit } " if limit else ""
274278 query += f" OFFSET { offset } " if offset else ""
275279
@@ -292,7 +296,7 @@ def similarity_post_process(
292296]:
293297 """Processes the query results for the similarity selection strategy"""
294298 # Handle case where query returns 0 records
295- if not query_records and not threshold :
299+ if not query_records and threshold is None :
296300 error_message = f"No records found for { sobject } in the target org."
297301 return [], [], error_message
298302
@@ -308,7 +312,7 @@ def similarity_post_process(
308312 select_records = []
309313 insert_records = []
310314
311- if complexity_constant < 1000 :
315+ if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE :
312316 select_records , insert_records = levenshtein_post_process (
313317 load_records , query_records , fields , weights , threshold
314318 )
@@ -328,6 +332,12 @@ def annoy_post_process(
328332 threshold : T .Union [float , None ],
329333) -> T .Tuple [T .List [dict ], list ]:
330334 """Processes the query results for the similarity selection strategy using Annoy algorithm for large number of records"""
335+ # Add warning when threshold is 0
336+ if threshold is not None and threshold == 0 :
337+ logger .warning (
338+ "Warning: A threshold of 0 may miss exact matches in high volumes. Use a small value like 0.1 for better accuracy."
339+ )
340+
331341 selected_records = []
332342 insertion_candidates = []
333343
@@ -397,7 +407,7 @@ def annoy_post_process(
397407 # Retrieve the corresponding record from the database
398408 record = query_record_data [neighbor_index ]
399409 closest_record_id = record_to_id_map [tuple (record )]
400- if threshold and (neighbor_distances [idx ] >= threshold ):
410+ if threshold is not None and (neighbor_distances [idx ] >= threshold ):
401411 selected_records .append (None )
402412 insertion_candidates .append (load_shaped_records [i ])
403413 else :
@@ -445,7 +455,7 @@ def levenshtein_post_process(
445455 select_record , target_records , similarity_weights
446456 )
447457
448- if distance_threshold and match_distance > distance_threshold :
458+ if distance_threshold is not None and match_distance > distance_threshold :
449459 # Append load record for insertion if distance exceeds threshold
450460 insertion_candidates .append (load_record )
451461 selected_records .append (None )
0 commit comments