From 6c2601f7e289521eba8e57e6dcc4f1b543bb67ce Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sun, 31 May 2026 11:50:32 -0400 Subject: [PATCH] . --- datafusion/physical-plan/src/joins/utils.rs | 115 +++++++++++++++----- 1 file changed, 85 insertions(+), 30 deletions(-) diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 9a6d1e5545eb3..8108b7f2db8bf 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -475,7 +475,7 @@ fn estimate_join_cardinality( right_stats: Statistics, on: &JoinOn, ) -> Option { - let (left_col_stats, right_col_stats) = on + let (left_key_stats, right_key_stats) = on .iter() .map(|(left, right)| { match ( @@ -500,12 +500,12 @@ fn estimate_join_cardinality( Statistics { num_rows: left_stats.num_rows, total_byte_size: Precision::Absent, - column_statistics: left_col_stats, + column_statistics: left_key_stats, }, Statistics { num_rows: right_stats.num_rows, total_byte_size: Precision::Absent, - column_statistics: right_col_stats, + column_statistics: right_key_stats, }, )?; @@ -545,38 +545,49 @@ fn estimate_join_cardinality( let is_left = matches!(join_type, JoinType::LeftSemi | JoinType::LeftAnti); let is_anti = matches!(join_type, JoinType::LeftAnti | JoinType::RightAnti); - let ((outer_stats, inner_stats), (outer_col_stats, inner_col_stats)) = - if is_left { - ( - (&left_stats, &right_stats), - (&left_col_stats, &right_col_stats), - ) - } else { - ( - (&right_stats, &left_stats), - (&right_col_stats, &left_col_stats), - ) - }; + let (outer_stats, inner_stats, outer_key_stats, inner_key_stats) = if is_left + { + (&left_stats, &right_stats, &left_key_stats, &right_key_stats) + } else { + (&right_stats, &left_stats, &right_key_stats, &left_key_stats) + }; let outer_rows = *outer_stats.num_rows.get_value()?; - let cardinality = - if estimate_disjoint_inputs(outer_stats, inner_stats).is_some() { - // Disjoint inputs: semi produces 0, anti keeps all rows. - if is_anti { outer_rows } else { 0 } + let outer_join_key_stats = Statistics { + num_rows: outer_stats.num_rows, + total_byte_size: Precision::Absent, + column_statistics: outer_key_stats.clone(), + }; + let inner_join_key_stats = Statistics { + num_rows: inner_stats.num_rows, + total_byte_size: Precision::Absent, + column_statistics: inner_key_stats.clone(), + }; + + let semi_cardinality = + if estimate_disjoint_inputs(&outer_join_key_stats, &inner_join_key_stats) + .is_some() + { + // If join keys are disjoint, no rows will match + Some(0) } else { - match estimate_semi_join_cardinality( + estimate_semi_join_cardinality( &outer_stats.num_rows, &inner_stats.num_rows, - outer_col_stats, - inner_col_stats, - ) { - Some(semi) if is_anti => outer_rows.saturating_sub(semi), - Some(semi) => semi, - None => outer_rows, - } + outer_key_stats, + inner_key_stats, + ) }; + // Semi joins keep the matching rows; anti joins keep the rest. When no + // estimate is available, conservatively assume all outer rows pass. + let cardinality = match (semi_cardinality, is_anti) { + (Some(semi), true) => outer_rows.saturating_sub(semi), + (Some(semi), false) => semi, + (None, _) => outer_rows, + }; + let outer_stats = if is_left { left_stats } else { right_stats }; Some(PartialJoinStatistics { num_rows: cardinality, @@ -759,8 +770,8 @@ fn estimate_disjoint_inputs( fn estimate_semi_join_cardinality( outer_num_rows: &Precision, inner_num_rows: &Precision, - outer_col_stats: &[ColumnStatistics], - inner_col_stats: &[ColumnStatistics], + outer_key_stats: &[ColumnStatistics], + inner_key_stats: &[ColumnStatistics], ) -> Option { let outer_rows = *outer_num_rows.get_value()?; if outer_rows == 0 { @@ -774,7 +785,7 @@ fn estimate_semi_join_cardinality( let mut selectivity = 1.0_f64; let mut has_selectivity_estimate = false; - for (outer_stat, inner_stat) in outer_col_stats.iter().zip(inner_col_stats.iter()) { + for (outer_stat, inner_stat) in outer_key_stats.iter().zip(inner_key_stats.iter()) { let outer_has_stats = outer_stat.distinct_count.get_value().is_some() || (outer_stat.min_value.get_value().is_some() && outer_stat.max_value.get_value().is_some()); @@ -3246,6 +3257,50 @@ mod tests { Ok(()) } + #[test] + fn test_semi_anti_join_disjoint_check_uses_only_join_keys() { + let join_on = vec![( + Arc::new(Column::new("l_key", 0)) as _, + Arc::new(Column::new("r_key", 0)) as _, + )]; + + // Ranges for the join key overlap; ranges for the other column are disjoint + let left_stats = Statistics { + num_rows: Inexact(50), + total_byte_size: Absent, + column_statistics: vec![ + create_column_stats(Inexact(1), Inexact(10), Absent, Absent), + create_column_stats(Inexact(100), Inexact(200), Absent, Absent), + ], + }; + let right_stats = Statistics { + num_rows: Inexact(10), + total_byte_size: Absent, + column_statistics: vec![ + create_column_stats(Inexact(1), Inexact(10), Absent, Absent), + create_column_stats(Inexact(1000), Inexact(2000), Absent, Absent), + ], + }; + + let left_semi = estimate_join_cardinality( + &JoinType::LeftSemi, + left_stats.clone(), + right_stats.clone(), + &join_on, + ) + .map(|c| c.num_rows); + assert_eq!(left_semi, Some(50)); + + let left_anti = estimate_join_cardinality( + &JoinType::LeftAnti, + left_stats, + right_stats, + &join_on, + ) + .map(|c| c.num_rows); + assert_eq!(left_anti, Some(0)); + } + #[test] fn test_calculate_join_output_ordering() -> Result<()> { let left_ordering = LexOrdering::new(vec![