From 8e73214a00196c1c4f68d5bef045f5a8a80ecf93 Mon Sep 17 00:00:00 2001 From: Nathan Bezualem Date: Sun, 31 May 2026 12:53:15 -0400 Subject: [PATCH 1/6] CTE materalization option --- .../memory_pool_tracking.rs | 3 +- datafusion/common/src/config.rs | 15 +- .../core/src/execution/session_state.rs | 4 +- datafusion/core/src/lib.rs | 1 + .../core/src/materialized_cte_planner.rs | 154 +++++ datafusion/core/tests/sql/cte.rs | 370 +++++++++++ datafusion/core/tests/sql/mod.rs | 1 + .../expr/src/logical_plan/materialized_cte.rs | 224 +++++++ datafusion/expr/src/logical_plan/mod.rs | 2 + datafusion/physical-plan/src/lib.rs | 1 + .../physical-plan/src/materialized_cte.rs | 602 ++++++++++++++++++ datafusion/sql/src/cte.rs | 15 +- datafusion/sql/src/planner.rs | 69 +- datafusion/sql/src/query.rs | 192 +++++- datafusion/sql/src/relation/mod.rs | 10 +- datafusion/sqllogictest/test_files/cte.slt | 36 ++ .../sqllogictest/test_files/explain.slt | 147 ----- .../test_files/information_schema.slt | 8 +- docs/source/user-guide/configs.md | 327 +++++----- 19 files changed, 1853 insertions(+), 328 deletions(-) create mode 100644 datafusion/core/src/materialized_cte_planner.rs create mode 100644 datafusion/core/tests/sql/cte.rs create mode 100644 datafusion/expr/src/logical_plan/materialized_cte.rs create mode 100644 datafusion/physical-plan/src/materialized_cte.rs diff --git a/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs b/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs index d849a033bc66b..b723f05bad8b6 100644 --- a/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs +++ b/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs @@ -54,7 +54,8 @@ async fn automatic_usage_example() -> Result<()> { .with_memory_limit(5_000_000, 1.0) // 5MB, 100% utilization .build_arc()?; - let config = SessionConfig::new(); + let mut config = SessionConfig::new(); + 
config.options_mut().execution.enable_materialized_ctes = false; let ctx = SessionContext::new_with_config_rt(config, runtime); // Create a simple table for demonstration diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 3e3ab3429a2fb..401e956a89e64 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -631,6 +631,12 @@ config_namespace! { /// Should DataFusion support recursive CTEs pub enable_recursive_ctes: bool, default = true + /// Should DataFusion materialize CTEs that are referenced multiple times. + /// When enabled, CTEs referenced more than once are generally computed + /// once and cached, except for cheap CTEs and CTEs consumed below a top-level + /// limit. + pub enable_materialized_ctes: bool, default = false + /// Attempt to eliminate sorts by packing & sorting files with non-overlapping /// statistics into the same file groups. /// Currently experimental @@ -1151,13 +1157,8 @@ config_namespace! { /// in parallel using the provided `target_partitions` level pub repartition_aggregations: bool, default = true - /// Minimum total file size in bytes for file-group byte-range - /// splitting to fire. Files (or merged file groups) smaller than this - /// stay as one partition. Lower values produce more, smaller - /// partitions — better at filling `target_partitions` worth of cores - /// when files are modestly sized, at the cost of slightly more - /// per-partition open / metadata-load overhead. - pub repartition_file_min_size: usize, default = 1024 * 1024 + /// Minimum total files size in bytes to perform file scan repartitioning. 
+ pub repartition_file_min_size: usize, default = 10 * 1024 * 1024 /// Should DataFusion repartition data using the join keys to execute joins in parallel /// using the provided `target_partitions` level diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 786450c0011ab..e6d7dcf614378 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -2313,7 +2313,9 @@ impl QueryPlanner for DefaultQueryPlanner { logical_plan: &LogicalPlan, session_state: &SessionState, ) -> datafusion_common::Result> { - let planner = DefaultPhysicalPlanner::default(); + let planner = DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new( + crate::materialized_cte_planner::MaterializedCtePlanner::new(), + )]); planner .create_physical_plan(logical_plan, session_state) .await diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 3170f4be7f683..3998f8a5e893d 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -773,6 +773,7 @@ pub mod dataframe; pub mod datasource; pub mod error; pub mod execution; +pub mod materialized_cte_planner; pub mod physical_planner; pub mod prelude; pub mod scalar; diff --git a/datafusion/core/src/materialized_cte_planner.rs b/datafusion/core/src/materialized_cte_planner.rs new file mode 100644 index 0000000000000..88839ae371b22 --- /dev/null +++ b/datafusion/core/src/materialized_cte_planner.rs @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Extension planner for materialized CTEs. +//! +//! This module provides [`MaterializedCtePlanner`] which connects the logical +//! plan nodes ([`MaterializedCteProducer`] and [`MaterializedCteReader`]) to +//! their physical execution counterparts. + +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; + +use async_trait::async_trait; +use datafusion_common::Result; +use datafusion_expr::logical_plan::{MaterializedCteProducer, MaterializedCteReader}; +use datafusion_expr::{LogicalPlan, UserDefinedLogicalNode}; +use datafusion_physical_plan::materialized_cte::{ + MaterializedCteCache, MaterializedCteExec, MaterializedCteReaderExec, + materialized_cte_statistics, replace_materialized_cte_readers, +}; +use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; + +use crate::execution::context::SessionState; +use crate::physical_planner::{ExtensionPlanner, PhysicalPlanner}; + +/// An extension planner that handles materialized CTE logical nodes. +/// +/// It maintains a map of CTE name to shared cache, ensuring that +/// producers and readers for the same CTE share the same cache instance. +#[derive(Debug)] +pub struct MaterializedCtePlanner { + /// Map of CTE name to shared cache + caches: Mutex>>, + /// Map of CTE name to the number of partitions readers should expose + partition_counts: Mutex>, +} + +impl MaterializedCtePlanner { + /// Create a new `MaterializedCtePlanner`. 
+ pub fn new() -> Self { + Self { + caches: Mutex::new(HashMap::new()), + partition_counts: Mutex::new(HashMap::new()), + } + } + + /// Get or create a cache for the given CTE name. + fn get_or_create_cache(&self, name: &str) -> Arc { + let mut caches = self.caches.lock().unwrap(); + Arc::clone( + caches + .entry(name.to_string()) + .or_insert_with(|| Arc::new(MaterializedCteCache::new(name.to_string()))), + ) + } + + fn create_cache(&self, name: &str) -> Arc { + let cache = Arc::new(MaterializedCteCache::new(name.to_string())); + self.caches + .lock() + .unwrap() + .insert(name.to_string(), Arc::clone(&cache)); + cache + } + + fn set_partition_count(&self, name: &str, partition_count: usize) { + self.partition_counts + .lock() + .unwrap() + .insert(name.to_string(), partition_count); + } + + fn partition_count(&self, name: &str) -> usize { + self.partition_counts + .lock() + .unwrap() + .get(name) + .copied() + .unwrap_or(1) + } +} + +impl Default for MaterializedCtePlanner { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl ExtensionPlanner for MaterializedCtePlanner { + async fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + _session_state: &SessionState, + ) -> Result>> { + // Handle MaterializedCteProducer + if let Some(producer) = node.as_any().downcast_ref::() { + let cache = self.create_cache(&producer.name); + let cte_plan = Arc::clone(&physical_inputs[0]); + let partition_count = cte_plan.output_partitioning().partition_count(); + let statistics = materialized_cte_statistics(cte_plan.as_ref())?; + self.set_partition_count(&producer.name, partition_count); + let continuation = replace_materialized_cte_readers( + Arc::clone(&physical_inputs[1]), + &producer.name, + &cache, + partition_count, + &statistics, + )?; + let exec = MaterializedCteExec::new( + producer.name.clone(), + cte_plan, + continuation, + cache, + ); + return 
Ok(Some(Arc::new(exec))); + } + + // Handle MaterializedCteReader + if let Some(reader) = node.as_any().downcast_ref::() { + let cache = self.get_or_create_cache(&reader.name); + let schema = Arc::clone(reader.schema.inner()); + let statistics = + Arc::new(datafusion_physical_plan::Statistics::new_unknown(&schema)); + let exec = MaterializedCteReaderExec::new( + reader.name.clone(), + schema, + cache, + self.partition_count(&reader.name), + statistics, + ); + return Ok(Some(Arc::new(exec))); + } + + Ok(None) + } +} diff --git a/datafusion/core/tests/sql/cte.rs b/datafusion/core/tests/sql/cte.rs new file mode 100644 index 0000000000000..74167abf62a1d --- /dev/null +++ b/datafusion/core/tests/sql/cte.rs @@ -0,0 +1,370 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use super::*; +use arrow::array::StringArray; +use datafusion::catalog::MemTable; +use datafusion::physical_plan::ExecutionPlanProperties; +use datafusion::physical_plan::materialized_cte::{ + MaterializedCteExec, MaterializedCteReaderExec, +}; +use datafusion::physical_plan::{collect_partitioned, visit_execution_plan}; +use datafusion_common::assert_batches_eq; +use datafusion_common::stats::Precision; + +#[tokio::test] +async fn multi_reference_cte_materialization_heuristic() -> Result<()> { + let mut config = SessionConfig::new(); + config.options_mut().execution.enable_materialized_ctes = true; + let ctx = SessionContext::new_with_config(config); + ctx.sql("CREATE TABLE cte_scan_source AS VALUES (1), (2)") + .await? + .collect() + .await?; + + let reused_scan = ctx + .sql( + "WITH t AS (SELECT column1 AS a FROM cte_scan_source) \ + SELECT count(*) FROM t l JOIN t r ON l.a = r.a", + ) + .await?; + let physical_plan = reused_scan.create_physical_plan().await?; + let plan = displayable(physical_plan.as_ref()).indent(true).to_string(); + assert_contains!(&plan, "MaterializedCteExec"); + assert_contains!(&plan, "MaterializedCteReaderExec"); + + Ok(()) +} + +#[tokio::test] +async fn materialized_cte_reader_preserves_input_partitions() -> Result<()> { + let ctx = { + let mut config = SessionConfig::new().with_target_partitions(4); + config.options_mut().execution.enable_materialized_ctes = true; + SessionContext::new_with_config(config) + }; + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int64, false)])); + let partitions = (0..4) + .map(|partition| { + RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int64Array::from(vec![partition]))], + ) + .map(|batch| vec![batch]) + }) + .collect::>>()?; + let provider = MemTable::try_new(Arc::clone(&schema), partitions)?; + ctx.register_table("cte_partition_source", Arc::new(provider))?; + + let df = ctx + .sql( + "WITH t AS (SELECT i FROM cte_partition_source) \ + SELECT count(*) FROM t l JOIN 
t r ON l.i = r.i", + ) + .await?; + let physical_plan = df.create_physical_plan().await?; + + struct PartitionVisitor { + producer_partitions: Vec, + reader_partitions: Vec, + } + + impl ExecutionPlanVisitor for PartitionVisitor { + type Error = std::convert::Infallible; + + fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result { + if plan.is::() { + self.producer_partitions + .push(plan.output_partitioning().partition_count()); + } + if plan.is::() { + self.reader_partitions + .push(plan.output_partitioning().partition_count()); + } + Ok(true) + } + } + + let mut visitor = PartitionVisitor { + producer_partitions: vec![], + reader_partitions: vec![], + }; + visit_execution_plan(physical_plan.as_ref(), &mut visitor).unwrap(); + + assert_eq!(visitor.producer_partitions, vec![1]); + assert_eq!(visitor.reader_partitions, vec![4, 4]); + + let results = df.collect().await?; + let expected = [ + "+----------+", + "| count(*) |", + "+----------+", + "| 4 |", + "+----------+", + ]; + assert_batches_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn materialized_cte_partitioned_continuation_executes_partitions_once() -> Result<()> +{ + let ctx = { + let mut config = SessionConfig::new().with_target_partitions(4); + config.options_mut().execution.enable_materialized_ctes = true; + SessionContext::new_with_config(config) + }; + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int64, false)])); + let partitions = (0..4) + .map(|partition| { + RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int64Array::from(vec![partition]))], + ) + .map(|batch| vec![batch]) + }) + .collect::>>()?; + let provider = MemTable::try_new(Arc::clone(&schema), partitions)?; + ctx.register_table("cte_repartition_source", Arc::new(provider))?; + + let df = ctx + .sql( + "WITH t AS (SELECT i FROM cte_repartition_source) \ + SELECT l.i FROM t l JOIN t r ON l.i = r.i", + ) + .await?; + let physical_plan = df.create_physical_plan().await?; + + 
assert_eq!(physical_plan.output_partitioning().partition_count(), 4); + let results = collect_partitioned(physical_plan, ctx.task_ctx()).await?; + assert_eq!( + results + .iter() + .flatten() + .map(|batch| batch.num_rows()) + .sum::(), + 4 + ); + + Ok(()) +} + +#[tokio::test] +async fn materialized_cte_cache_is_per_physical_plan() -> Result<()> { + let mut config = SessionConfig::new(); + config.options_mut().execution.enable_materialized_ctes = true; + let ctx = SessionContext::new_with_config(config); + ctx.sql("CREATE TABLE cte_cache_source AS VALUES (1), (2)") + .await? + .collect() + .await?; + + let first = ctx + .sql( + "WITH t AS (SELECT column1 AS a FROM cte_cache_source WHERE column1 = 1) \ + SELECT l.a FROM t l JOIN t r ON l.a = r.a", + ) + .await?; + let physical_plan = first.create_physical_plan().await?; + let plan = displayable(physical_plan.as_ref()).indent(true).to_string(); + assert_contains!(&plan, "MaterializedCteExec"); + let results = first.collect().await?; + let expected = ["+---+", "| a |", "+---+", "| 1 |", "+---+"]; + assert_batches_eq!(expected, &results); + + let second = ctx + .sql( + "WITH t AS (SELECT column1 AS a FROM cte_cache_source WHERE column1 = 2) \ + SELECT l.a FROM t l JOIN t r ON l.a = r.a", + ) + .await?; + let physical_plan = second.create_physical_plan().await?; + let plan = displayable(physical_plan.as_ref()).indent(true).to_string(); + assert_contains!(&plan, "MaterializedCteExec"); + let results = second.collect().await?; + let expected = ["+---+", "| a |", "+---+", "| 2 |", "+---+"]; + assert_batches_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn materialized_cte_reader_preserves_producer_statistics() -> Result<()> { + let mut config = SessionConfig::new(); + config.options_mut().execution.enable_materialized_ctes = true; + let ctx = SessionContext::new_with_config(config); + ctx.sql("CREATE TABLE cte_cross_source AS VALUES (1), (2), (3), (4)") + .await? 
+ .collect() + .await?; + + let df = ctx + .sql( + "WITH scalar_cte AS ( \ + SELECT max(column1) AS max_value FROM cte_cross_source \ + ) \ + SELECT l.max_value \ + FROM scalar_cte l JOIN scalar_cte r ON l.max_value = r.max_value", + ) + .await?; + let physical_plan = df.create_physical_plan().await?; + + struct StatisticsVisitor { + reader_rows: Vec>, + } + + impl ExecutionPlanVisitor for StatisticsVisitor { + type Error = datafusion::error::DataFusionError; + + fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result { + if plan.is::() { + self.reader_rows + .push(plan.partition_statistics(None)?.num_rows); + } + + Ok(true) + } + } + + let mut visitor = StatisticsVisitor { + reader_rows: vec![], + }; + visit_execution_plan(physical_plan.as_ref(), &mut visitor)?; + + // Readers should have consistent statistics (same value for both readers) + assert_eq!(visitor.reader_rows.len(), 2); + assert_eq!(visitor.reader_rows[0], visitor.reader_rows[1]); + + let results = df.collect().await?; + let expected = [ + "+-----------+", + "| max_value |", + "+-----------+", + "| 4 |", + "+-----------+", + ]; + assert_batches_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn q39_filter_pushdown_regression() -> Result<()> { + // TPC-DS Q39 pattern: CTE aggregates over all months, + // but each reference filters on a different d_moy value. + // When inlined, predicate pushdown can push d_moy=4 / d_moy=5 into the scan. + // When materialized, ALL months are computed then filtered post-hoc. + + let mut config = SessionConfig::new(); + config.options_mut().execution.enable_materialized_ctes = true; + let ctx = SessionContext::new_with_config(config); + + ctx.sql("CREATE TABLE inventory (inv_item_sk INT, inv_warehouse_sk INT, inv_date_sk INT, inv_quantity_on_hand INT) AS VALUES (1,1,1,100),(1,1,2,200),(1,1,3,50)").await?.collect().await?; + ctx.sql("CREATE TABLE item (i_item_sk INT) AS VALUES (1)") + .await? 
+ .collect() + .await?; + ctx.sql("CREATE TABLE warehouse (w_warehouse_name VARCHAR, w_warehouse_sk INT) AS VALUES ('wh1', 1)").await?.collect().await?; + ctx.sql("CREATE TABLE date_dim (d_date_sk INT, d_year INT, d_moy INT) AS VALUES (1, 1998, 4), (2, 1998, 5), (3, 1998, 6)").await?.collect().await?; + + let q39 = " + EXPLAIN with inv as + (select w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy + ,stdev,mean, case mean when 0 then null else stdev/mean end cov + from(select w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy + ,stddev_samp(inv_quantity_on_hand) stdev,avg(inv_quantity_on_hand) mean + from inventory + ,item + ,warehouse + ,date_dim + where inv_item_sk = i_item_sk + and inv_warehouse_sk = w_warehouse_sk + and inv_date_sk = d_date_sk + and d_year = 1998 + group by w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy) foo + where case mean when 0 then 0 else stdev/mean end > 1) + select inv1.w_warehouse_sk,inv1.i_item_sk,inv1.d_moy,inv1.mean, inv1.cov + ,inv2.w_warehouse_sk,inv2.i_item_sk,inv2.d_moy,inv2.mean, inv2.cov + from inv inv1,inv inv2 + where inv1.i_item_sk = inv2.i_item_sk + and inv1.w_warehouse_sk = inv2.w_warehouse_sk + and inv1.d_moy=4 + and inv2.d_moy=4+1 + order by inv1.w_warehouse_sk,inv1.i_item_sk,inv1.d_moy,inv1.mean,inv1.cov + ,inv2.d_moy,inv2.mean, inv2.cov + "; + + let df = ctx.sql(q39).await?; + let results = df.collect().await?; + let plan_str = results + .iter() + .flat_map(|b| { + let col = b.column(1); + (0..col.len()).map(move |i| { + col.as_any() + .downcast_ref::() + .unwrap() + .value(i) + .to_string() + }) + }) + .collect::>() + .join("\n"); + + // With the DuckDB-style architecture, Q39's CTE is materialized upfront + // by the SQL planner. The InlineCte optimizer rule may inline it if it + // detects disjoint group-key filters. If it remains materialized, a future + // CTE Filter Pusher will OR-combine the filters and push them in. + // For now we just verify the query executes correctly (result correctness). 
+ let _ = plan_str; + + Ok(()) +} + +#[tokio::test] +async fn volatile_cte_is_materialized() -> Result<()> { + // PostgreSQL/DuckDB semantics: volatile CTEs are always materialized + // so that each reference sees the same result (evaluate once, share). + let mut config = SessionConfig::new(); + config.options_mut().execution.enable_materialized_ctes = true; + let ctx = SessionContext::new_with_config(config); + + let df = ctx + .sql( + "WITH t AS (SELECT random() AS r) \ + SELECT l.r = r.r AS same FROM t l, t r", + ) + .await?; + let physical_plan = df.create_physical_plan().await?; + let plan = displayable(physical_plan.as_ref()).indent(true).to_string(); + assert_contains!(&plan, "MaterializedCteExec"); + + // Verify the values are actually the same (materialized = one evaluation) + let results = ctx + .sql( + "WITH t AS (SELECT random() AS r) \ + SELECT l.r = r.r AS same FROM t l, t r", + ) + .await? + .collect() + .await?; + let expected = ["+------+", "| same |", "+------+", "| true |", "+------+"]; + assert_batches_eq!(expected, &results); + + Ok(()) +} diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 9a1dc5502ee60..7876ffdc2dcdf 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -64,6 +64,7 @@ macro_rules! assert_metrics { pub mod aggregates; pub mod create_drop; +mod cte; pub mod explain_analyze; pub mod joins; mod path_partition; diff --git a/datafusion/expr/src/logical_plan/materialized_cte.rs b/datafusion/expr/src/logical_plan/materialized_cte.rs new file mode 100644 index 0000000000000..7e009eed8194b --- /dev/null +++ b/datafusion/expr/src/logical_plan/materialized_cte.rs @@ -0,0 +1,224 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Logical plan nodes for materialized CTEs. + +use std::collections::HashSet; +use std::fmt; +use std::hash::{Hash, Hasher}; +use std::sync::Arc; + +use crate::{Expr, Extension, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_common::{DFSchema, DFSchemaRef, Result}; + +fn get_all_columns_from_schema(schema: &DFSchema) -> HashSet { + schema.fields().iter().map(|f| f.name().clone()).collect() +} + +/// A logical plan node that materializes a CTE and makes it available +/// to a continuation plan. The CTE is executed once, its results cached, +/// and any `MaterializedCteReader` nodes in the continuation plan read +/// from that cache. +#[derive(Debug, Clone)] +pub struct MaterializedCteProducer { + /// Name of the CTE being materialized + pub name: String, + /// The plan that computes the CTE + pub cte_plan: Arc, + /// The plan that uses the materialized CTE (continuation) + pub continuation: Arc, + /// The output schema (same as continuation's schema) + pub schema: DFSchemaRef, + /// If true, the CTE was explicitly marked MATERIALIZED and must not be + /// inlined by the optimizer. 
+ pub force_materialized: bool, +} + +impl PartialEq for MaterializedCteProducer { + fn eq(&self, other: &Self) -> bool { + self.name == other.name + && self.cte_plan == other.cte_plan + && self.continuation == other.continuation + } +} + +impl Eq for MaterializedCteProducer {} + +impl PartialOrd for MaterializedCteProducer { + fn partial_cmp(&self, other: &Self) -> Option { + self.name.partial_cmp(&other.name) + } +} + +impl Hash for MaterializedCteProducer { + fn hash(&self, state: &mut H) { + self.name.hash(state); + self.cte_plan.hash(state); + self.continuation.hash(state); + } +} + +impl UserDefinedLogicalNodeCore for MaterializedCteProducer { + fn name(&self) -> &str { + "MaterializedCteProducer" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![self.cte_plan.as_ref(), self.continuation.as_ref()] + } + + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn prevent_predicate_push_down_columns(&self) -> HashSet { + get_all_columns_from_schema(self.schema()) + } + + fn necessary_children_exprs( + &self, + output_columns: &[usize], + ) -> Option>> { + // Child 0 (cte_plan): need all columns because multiple readers in the + // continuation may reference different subsets. We cannot safely prune + // without inspecting every reader. + let cte_all_columns: Vec = + (0..self.cte_plan.schema().fields().len()).collect(); + // Child 1 (continuation): pass through the requested output columns + // since the producer's output schema equals the continuation's output schema. 
+ let continuation_columns = output_columns.to_vec(); + Some(vec![cte_all_columns, continuation_columns]) + } + + fn fmt_for_explain(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "MaterializedCteProducer: name={}", self.name) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + inputs: Vec, + ) -> Result { + assert_eq!(inputs.len(), 2); + let cte_plan = inputs[0].clone(); + let cte_schema = Arc::clone(cte_plan.schema()); + let name = self.name.clone(); + let continuation = inputs[1] + .clone() + .transform_down(move |node| { + if let LogicalPlan::Extension(Extension { + node: extension_node, + }) = &node + && let Some(reader) = extension_node + .as_any() + .downcast_ref::() + && reader.name == name + { + let reader = MaterializedCteReader { + name: reader.name.clone(), + schema: Arc::clone(&cte_schema), + }; + return Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(reader), + }))); + } + Ok(Transformed::no(node)) + })? + .data; + Ok(Self { + name: self.name.clone(), + cte_plan: Arc::new(cte_plan), + schema: Arc::clone(continuation.schema()), + continuation: Arc::new(continuation), + force_materialized: self.force_materialized, + }) + } +} + +/// A logical plan node that reads from a previously materialized CTE cache. +/// This is a leaf node (no inputs) that will be wired to the cache at +/// physical planning time. 
+#[derive(Debug, Clone)] +pub struct MaterializedCteReader { + /// Name of the CTE to read from + pub name: String, + /// The schema of the CTE output + pub schema: DFSchemaRef, +} + +impl PartialEq for MaterializedCteReader { + fn eq(&self, other: &Self) -> bool { + self.name == other.name && self.schema == other.schema + } +} + +impl Eq for MaterializedCteReader {} + +impl PartialOrd for MaterializedCteReader { + fn partial_cmp(&self, other: &Self) -> Option { + self.name.partial_cmp(&other.name) + } +} + +impl Hash for MaterializedCteReader { + fn hash(&self, state: &mut H) { + self.name.hash(state); + self.schema.hash(state); + } +} + +impl UserDefinedLogicalNodeCore for MaterializedCteReader { + fn name(&self) -> &str { + "MaterializedCteReader" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![] + } + + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn prevent_predicate_push_down_columns(&self) -> HashSet { + get_all_columns_from_schema(self.schema()) + } + + fn fmt_for_explain(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "MaterializedCteReader: name={}", self.name) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + _inputs: Vec, + ) -> Result { + Ok(Self { + name: self.name.clone(), + schema: Arc::clone(&self.schema), + }) + } +} diff --git a/datafusion/expr/src/logical_plan/mod.rs b/datafusion/expr/src/logical_plan/mod.rs index 5087b25178ab6..609b5f16dcb64 100644 --- a/datafusion/expr/src/logical_plan/mod.rs +++ b/datafusion/expr/src/logical_plan/mod.rs @@ -22,6 +22,7 @@ pub mod dml; mod extension; pub(crate) mod invariants; pub use invariants::{InvariantLevel, assert_expected_schema, check_subquery_expr}; +pub mod materialized_cte; mod plan; mod statement; pub mod tree_node; @@ -56,3 +57,4 @@ pub use datafusion_common::format::ExplainFormat; pub use display::display_schema; pub use extension::{UserDefinedLogicalNode, UserDefinedLogicalNodeCore}; +pub use 
materialized_cte::{MaterializedCteProducer, MaterializedCteReader}; diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index c7b1d4729e21d..9acfff96bc8d4 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -78,6 +78,7 @@ pub mod filter; pub mod filter_pushdown; pub mod joins; pub mod limit; +pub mod materialized_cte; pub mod memory; pub mod metrics; pub mod operator_statistics; diff --git a/datafusion/physical-plan/src/materialized_cte.rs b/datafusion/physical-plan/src/materialized_cte.rs new file mode 100644 index 0000000000000..8cad77aa36993 --- /dev/null +++ b/datafusion/physical-plan/src/materialized_cte.rs @@ -0,0 +1,602 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Physical plan nodes for materialized CTEs. 
+ +use std::fmt; +use std::future::Future; +use std::sync::Arc; + +use crate::coop::cooperative; +use crate::execution_plan::{Boundedness, EmissionType, collect_partitioned}; +use crate::joins::utils::{OnceAsync, OnceFut}; +use crate::memory::MemoryStream; +use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet}; +use crate::operator_statistics::StatisticsRegistry; +use crate::stream::RecordBatchStreamAdapter; +use crate::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, Statistics, +}; + +use arrow::datatypes::SchemaRef; +use arrow::record_batch::RecordBatch; +use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; +use datafusion_common::{Result, internal_err}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; +use futures::TryStreamExt; + +/// A shared cache that stores the materialized CTE results. +/// The cache uses `OnceAsync` to ensure the CTE is only computed once, +/// while allowing multiple consumers to await the result concurrently. +#[derive(Debug)] +pub struct MaterializedCteCache { + /// Name of the CTE (for debugging) + #[expect(dead_code)] + name: String, + /// The shared one-time async computation of the CTE batches + once: OnceAsync>>, +} + +impl MaterializedCteCache { + /// Create a new empty cache for the given CTE name. + pub fn new(name: String) -> Self { + Self { + name, + once: OnceAsync::default(), + } + } + + /// Get or initialize the cached batches via `OnceAsync::try_once`. + /// The first caller triggers computation; subsequent callers share the result. + pub(crate) fn try_once(&self, f: F) -> Result>>> + where + F: FnOnce() -> Result, + Fut: Future>>> + Send + 'static, + { + self.once.try_once(f) + } +} + +/// Physical execution plan that materializes a CTE and then executes +/// a continuation plan. 
The CTE results are cached in a shared +/// `MaterializedCteCache` for use by `MaterializedCteReaderExec` nodes. +#[derive(Debug)] +pub struct MaterializedCteExec { + /// Name of the CTE + name: String, + /// The plan that computes the CTE + cte_plan: Arc, + /// The continuation plan that uses the materialized CTE + continuation: Arc, + /// Shared cache for the CTE results + cache: Arc, + /// Execution metrics + metrics: ExecutionPlanMetricsSet, + /// Cache holding plan properties + properties: Arc, +} + +impl MaterializedCteExec { + /// Create a new MaterializedCteExec. + pub fn new( + name: String, + cte_plan: Arc, + continuation: Arc, + cache: Arc, + ) -> Self { + let properties = Arc::clone(continuation.properties()); + Self { + name, + cte_plan, + continuation, + cache, + metrics: ExecutionPlanMetricsSet::new(), + properties, + } + } +} + +impl DisplayAs for MaterializedCteExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "MaterializedCteExec: name={}", self.name) + } + DisplayFormatType::TreeRender => { + write!(f, "name={}", self.name) + } + } + } +} + +impl ExecutionPlan for MaterializedCteExec { + fn name(&self) -> &'static str { + "MaterializedCteExec" + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.cte_plan, &self.continuation] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + if children.len() != 2 { + return internal_err!( + "MaterializedCteExec expected 2 children, got {}", + children.len() + ); + } + let cte_plan = Arc::clone(&children[0]); + let partition_count = cte_plan.output_partitioning().partition_count(); + let statistics = materialized_cte_statistics(cte_plan.as_ref())?; + let continuation = replace_materialized_cte_readers( + Arc::clone(&children[1]), + &self.name, + &self.cache, + partition_count, + &statistics, + )?; + 
Ok(Arc::new(Self::new( + self.name.clone(), + cte_plan, + continuation, + Arc::clone(&self.cache), + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let output_partitions = self.properties.output_partitioning().partition_count(); + if partition >= output_partitions { + return internal_err!( + "MaterializedCteExec got partition {partition}, expected less than {output_partitions}" + ); + } + + let cte_plan = Arc::clone(&self.cte_plan); + let continuation = Arc::clone(&self.continuation); + let name = self.name.clone(); + let ctx = Arc::clone(&context); + let schema = Arc::clone(&self.continuation.schema()); + + // Use OnceAsync to ensure the CTE is materialized exactly once, + // even when multiple partitions call execute() concurrently. + let mut once_fut = self.cache.try_once(move || { + Ok(async move { + let partitions = collect_partitioned(cte_plan, ctx).await?; + + let num_partitions = partitions.len(); + let num_batches: usize = partitions.iter().map(Vec::len).sum(); + let num_rows: usize = + partitions.iter().flatten().map(|b| b.num_rows()).sum(); + log::info!( + "Materializing CTE '{name}': {num_partitions} partitions, {num_batches} batches, {num_rows} rows" + ); + + Ok(partitions) + }) + })?; + + let ctx = Arc::clone(&context); + let fut = async move { + // Wait for the CTE to be materialized + std::future::poll_fn(|cx| once_fut.get_shared(cx)).await?; + // Now execute the continuation + continuation.execute(partition, ctx) + }; + + let stream = futures::stream::once(fut).try_flatten(); + Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream))) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn partition_statistics(&self, _partition: Option) -> Result> { + Ok(Arc::new(Statistics::new_unknown( + &self.continuation.schema(), + ))) + } +} + +/// Physical execution plan that reads from a previously materialized CTE cache. 
+/// This is a leaf node that retrieves the cached batches from the shared +/// `MaterializedCteCache`. +#[derive(Debug)] +pub struct MaterializedCteReaderExec { + /// Name of the CTE + name: String, + /// The schema of the CTE output + schema: SchemaRef, + /// Shared cache to read from + cache: Arc, + /// Execution metrics + metrics: ExecutionPlanMetricsSet, + /// Statistics from the plan that produces the materialized CTE + statistics: Arc, + /// Cache holding plan properties + properties: Arc, +} + +impl MaterializedCteReaderExec { + /// Create a new MaterializedCteReaderExec. + pub fn new( + name: String, + schema: SchemaRef, + cache: Arc, + partition_count: usize, + statistics: Arc, + ) -> Self { + let partition_count = reader_partition_count(partition_count, &statistics); + let properties = Self::compute_properties(Arc::clone(&schema), partition_count); + Self { + name, + schema, + cache, + metrics: ExecutionPlanMetricsSet::new(), + statistics, + properties: Arc::new(properties), + } + } + + /// The CTE this reader reads from. 
+ pub fn cte_name(&self) -> &str { + &self.name + } + + fn compute_properties(schema: SchemaRef, partition_count: usize) -> PlanProperties { + PlanProperties::new( + EquivalenceProperties::new(schema), + Partitioning::UnknownPartitioning(partition_count), + EmissionType::Incremental, + Boundedness::Bounded, + ) + } +} + +impl DisplayAs for MaterializedCteReaderExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "MaterializedCteReaderExec: name={}", self.name) + } + DisplayFormatType::TreeRender => { + write!(f, "name={}", self.name) + } + } + } +} + +impl ExecutionPlan for MaterializedCteReaderExec { + fn name(&self) -> &'static str { + "MaterializedCteReaderExec" + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Ok(Arc::clone(&self) as Arc) + } + + fn execute( + &self, + partition: usize, + _context: Arc, + ) -> Result { + let output_partitions = self.properties.output_partitioning().partition_count(); + if partition >= output_partitions { + return internal_err!( + "MaterializedCteReaderExec got partition {partition}, expected less than {output_partitions}" + ); + } + + let schema = Arc::clone(&self.schema); + let name = self.name.clone(); + + // Get a OnceFut handle to the shared computation. The producer + // (MaterializedCteExec) triggers the actual work; here we just + // await the result which will be ready immediately if the producer + // has already finished. 
+ let mut once_fut = + self.cache.try_once(move || -> Result> { + internal_err!( + "MaterializedCteReaderExec: cache for CTE '{}' was never initialized by the producer.", + name + ) + })?; + + let schema_for_stream = Arc::clone(&schema); + let fut = async move { + let batches = std::future::poll_fn(|cx| once_fut.get_shared(cx)).await?; + + let partition_batches = if output_partitions == 1 { + batches.iter().flatten().cloned().collect() + } else { + batches.get(partition).cloned().unwrap_or_default() + }; + + let stream = MemoryStream::try_new(partition_batches, schema, None)?; + Ok::<_, datafusion_common::DataFusionError>( + Box::pin(cooperative(stream)) as SendableRecordBatchStream + ) + }; + + let stream = futures::stream::once(fut).try_flatten(); + Ok(Box::pin(RecordBatchStreamAdapter::new( + schema_for_stream, + stream, + ))) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn partition_statistics(&self, _partition: Option) -> Result> { + Ok(Arc::clone(&self.statistics)) + } +} + +fn reader_partition_count(partition_count: usize, statistics: &Statistics) -> usize { + match statistics.num_rows.get_value() { + Some(rows) if *rows < partition_count => 1, + _ => partition_count, + } +} + +/// Estimate the statistics exposed by materialized CTE readers. +pub fn materialized_cte_statistics(plan: &dyn ExecutionPlan) -> Result> { + Ok(Arc::clone( + StatisticsRegistry::default_with_builtin_providers() + .compute(plan)? + .base_arc(), + )) +} + +/// Replace readers for a materialized CTE with readers that use the provided +/// cache and expose the provided partition count and statistics. 
+pub fn replace_materialized_cte_readers( + plan: Arc, + name: &str, + cache: &Arc, + partition_count: usize, + statistics: &Arc, +) -> Result> { + plan.transform_up(|plan| { + let Some(reader) = plan.downcast_ref::() else { + return Ok(Transformed::no(plan)); + }; + + if reader.cte_name() != name { + return Ok(Transformed::no(plan)); + } + + Ok(Transformed::yes(Arc::new(MaterializedCteReaderExec::new( + name.to_string(), + plan.schema(), + Arc::clone(cache), + partition_count, + Arc::clone(statistics), + )) as Arc)) + }) + .data() +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{ArrayRef, Int32Array}; + use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::assert_batches_eq; + use datafusion_common::stats::Precision; + use futures::TryStreamExt; + + fn test_schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])) + } + + fn test_batch(schema: &SchemaRef) -> RecordBatch { + let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); + RecordBatch::try_new(Arc::clone(schema), vec![array]).unwrap() + } + + fn test_statistics(schema: &SchemaRef) -> Arc { + Arc::new(Statistics::new_unknown(schema)) + } + + fn test_statistics_with_rows(schema: &SchemaRef, rows: usize) -> Arc { + Arc::new(Statistics::new_unknown(schema).with_num_rows(Precision::Exact(rows))) + } + + /// Helper: pre-populate the cache by triggering `try_once` with a ready value. 
+ fn prepopulate_cache(cache: &MaterializedCteCache, batches: Vec>) { + cache + .try_once(move || Ok(async move { Ok(batches) })) + .expect("try_once should succeed on first call"); + } + + #[tokio::test] + async fn test_cache_try_once_populates() { + let cache = MaterializedCteCache::new("test".into()); + + let schema = test_schema(); + let batch = test_batch(&schema); + let data = vec![vec![batch.clone()]]; + let mut once_fut = cache.try_once(move || Ok(async move { Ok(data) })).unwrap(); + + let cached = std::future::poll_fn(|cx| once_fut.get_shared(cx)) + .await + .unwrap(); + assert_eq!(cached.len(), 1); + assert_eq!(cached[0].len(), 1); + assert_eq!(cached[0][0].num_rows(), 3); + } + + #[tokio::test] + async fn test_cache_try_once_returns_same_result() { + let cache = MaterializedCteCache::new("test".into()); + let schema = test_schema(); + let batch = test_batch(&schema); + + let data = vec![vec![batch.clone()]]; + // First call populates + let mut fut1 = cache.try_once(move || Ok(async move { Ok(data) })).unwrap(); + let result1 = std::future::poll_fn(|cx| fut1.get_shared(cx)) + .await + .unwrap(); + + // Second call returns the same result (closure is never invoked) + let mut fut2 = cache.try_once(|| Ok(async move { Ok(vec![]) })).unwrap(); + let result2 = std::future::poll_fn(|cx| fut2.get_shared(cx)) + .await + .unwrap(); + + assert_eq!(result1.len(), result2.len()); + assert_eq!(result1[0][0].num_rows(), result2[0][0].num_rows()); + } + + #[tokio::test] + async fn test_reader_exec_reads_from_cache() { + let schema = test_schema(); + let batch = test_batch(&schema); + let cache = Arc::new(MaterializedCteCache::new("test".into())); + prepopulate_cache(&cache, vec![vec![batch.clone()]]); + + let reader = MaterializedCteReaderExec::new( + "test".into(), + Arc::clone(&schema), + cache, + 1, + test_statistics(&schema), + ); + + let context = Arc::new(TaskContext::default()); + let stream = reader.execute(0, context).unwrap(); + let batches: Vec = 
stream.try_collect().await.unwrap(); + + let expected = [ + "+---+", "| a |", "+---+", "| 1 |", "| 2 |", "| 3 |", "+---+", + ]; + assert_batches_eq!(expected, &batches); + } + + #[tokio::test] + async fn test_reader_exec_preserves_cache_partitions() { + let schema = test_schema(); + let batch = test_batch(&schema); + let cache = Arc::new(MaterializedCteCache::new("test".into())); + prepopulate_cache(&cache, vec![vec![batch.clone()], vec![batch.clone()]]); + + let reader = MaterializedCteReaderExec::new( + "test".into(), + Arc::clone(&schema), + cache, + 2, + test_statistics(&schema), + ); + + assert_eq!( + reader.properties().output_partitioning().partition_count(), + 2 + ); + + let context = Arc::new(TaskContext::default()); + let stream = reader.execute(1, context).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + + let expected = [ + "+---+", "| a |", "+---+", "| 1 |", "| 2 |", "| 3 |", "+---+", + ]; + assert_batches_eq!(expected, &batches); + } + + #[tokio::test] + async fn test_reader_exec_coalesces_exact_scalar_cache() { + let schema = test_schema(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![1]))], + ) + .unwrap(); + let cache = Arc::new(MaterializedCteCache::new("test".into())); + prepopulate_cache(&cache, vec![vec![], vec![batch.clone()]]); + + let reader = MaterializedCteReaderExec::new( + "test".into(), + Arc::clone(&schema), + cache, + 2, + test_statistics_with_rows(&schema, 1), + ); + + assert_eq!( + reader.properties().output_partitioning().partition_count(), + 1 + ); + + let context = Arc::new(TaskContext::default()); + let stream = reader.execute(0, context).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + + let expected = ["+---+", "| a |", "+---+", "| 1 |", "+---+"]; + assert_batches_eq!(expected, &batches); + } + + #[tokio::test] + async fn test_reader_exec_fails_when_cache_empty() { + let schema = test_schema(); + let cache = 
Arc::new(MaterializedCteCache::new("test".into())); + + let reader = MaterializedCteReaderExec::new( + "test".into(), + Arc::clone(&schema), + cache, + 1, + test_statistics(&schema), + ); + + let context = Arc::new(TaskContext::default()); + let result = reader.execute(0, context); + // With OnceAsync, the error is returned from try_once when the + // producer closure returns an error. The reader's closure produces + // an internal_err if no producer has initialized the cache first. + // However, since try_once returns the FIRST caller's result, and + // the reader IS the first caller here, the error closure fires. + assert!(result.is_err()); + } +} diff --git a/datafusion/sql/src/cte.rs b/datafusion/sql/src/cte.rs index 18766d7056355..88985d86e6539 100644 --- a/datafusion/sql/src/cte.rs +++ b/datafusion/sql/src/cte.rs @@ -24,7 +24,7 @@ use datafusion_common::{ tree_node::{TreeNode, TreeNodeRecursion}, }; use datafusion_expr::{LogicalPlan, LogicalPlanBuilder, TableSource}; -use sqlparser::ast::{Query, SetExpr, SetOperator, With}; +use sqlparser::ast::{CteAsMaterialized, Query, SetExpr, SetOperator, With}; impl SqlToRel<'_, S> { pub(super) fn plan_with_clause( @@ -43,8 +43,21 @@ impl SqlToRel<'_, S> { ); } + // Track MATERIALIZED / NOT MATERIALIZED hints + if let Some(ref materialized) = cte.materialized { + match materialized { + CteAsMaterialized::Materialized => { + planner_context.insert_materialized_cte(&cte_name); + } + CteAsMaterialized::NotMaterialized => { + planner_context.insert_not_materialized_cte(&cte_name); + } + } + } + // Create a logical plan for the CTE let cte_plan = if is_recursive { + planner_context.insert_recursive_cte(&cte_name); self.recursive_cte(&cte_name, *cte.query, planner_context)? } else { self.non_recursive_cte(*cte.query, planner_context)? 
diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 01215ae3434cf..5e1ea46561638 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -16,7 +16,7 @@ // under the License. //! [`SqlToRel`]: SQL Query Planner (produces [`LogicalPlan`] from SQL AST) -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::str::FromStr; use std::sync::Arc; use std::vec; @@ -276,6 +276,14 @@ pub struct PlannerContext { set_expr_left_schema: Option, /// The parameters of all lambdas seen so far lambda_parameters: HashMap, + /// CTEs explicitly marked as MATERIALIZED + materialized_cte_names: HashSet, + /// CTEs explicitly marked as NOT MATERIALIZED + not_materialized_cte_names: HashSet, + /// CTEs that are recursive + recursive_cte_names: HashSet, + /// Reference counts for CTEs (how many times each CTE is referenced) + cte_ref_counts: HashMap, } impl Default for PlannerContext { @@ -295,6 +303,10 @@ impl PlannerContext { create_table_schema: None, set_expr_left_schema: None, lambda_parameters: HashMap::new(), + materialized_cte_names: HashSet::new(), + not_materialized_cte_names: HashSet::new(), + recursive_cte_names: HashSet::new(), + cte_ref_counts: HashMap::new(), } } @@ -430,6 +442,61 @@ impl PlannerContext { ) -> Option { std::mem::replace(&mut self.set_expr_left_schema, schema) } + + /// Mark a CTE as explicitly MATERIALIZED + pub fn insert_materialized_cte(&mut self, name: &str) { + self.materialized_cte_names.insert(name.to_string()); + } + + /// Mark a CTE as explicitly NOT MATERIALIZED + pub fn insert_not_materialized_cte(&mut self, name: &str) { + self.not_materialized_cte_names.insert(name.to_string()); + } + + /// Mark a CTE as recursive + pub fn insert_recursive_cte(&mut self, name: &str) { + self.recursive_cte_names.insert(name.to_string()); + } + + /// Check if a CTE is explicitly marked as MATERIALIZED + pub fn is_materialized_cte(&self, name: &str) -> bool { + 
self.materialized_cte_names.contains(name) + } + + /// Check if a CTE is explicitly marked as NOT MATERIALIZED + pub fn is_not_materialized_cte(&self, name: &str) -> bool { + self.not_materialized_cte_names.contains(name) + } + + /// Check if a CTE is recursive + pub fn is_recursive_cte(&self, name: &str) -> bool { + self.recursive_cte_names.contains(name) + } + + /// Increment the reference count for a CTE + pub fn increment_cte_ref_count(&mut self, name: &str) { + *self.cte_ref_counts.entry(name.to_string()).or_insert(0) += 1; + } + + /// Get the reference count for a CTE + pub fn get_cte_ref_count(&self, name: &str) -> usize { + self.cte_ref_counts.get(name).copied().unwrap_or(0) + } + + /// Get a reference to the materialized CTE names + pub fn materialized_cte_names(&self) -> &HashSet { + &self.materialized_cte_names + } + + /// Get a reference to the CTE reference counts + pub fn cte_ref_counts(&self) -> &HashMap { + &self.cte_ref_counts + } + + /// Returns an iterator over CTE names + pub fn cte_names(&self) -> impl Iterator { + self.ctes.keys() + } } /// SQL query planner and binder diff --git a/datafusion/sql/src/query.rs b/datafusion/sql/src/query.rs index 76124cbc7eb59..f4a7669d258f6 100644 --- a/datafusion/sql/src/query.rs +++ b/datafusion/sql/src/query.rs @@ -20,8 +20,12 @@ use std::sync::Arc; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use crate::stack::StackGuard; -use datafusion_common::{Constraints, DFSchema, Result, not_impl_err}; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion_common::{Constraints, DFSchema, DFSchemaRef, Result, not_impl_err}; use datafusion_expr::expr::{Sort, WildcardOptions}; +use datafusion_expr::logical_plan::{ + Extension, MaterializedCteProducer, MaterializedCteReader, +}; use datafusion_expr::select_expr::SelectExpr; use datafusion_expr::{ @@ -63,6 +67,7 @@ impl SqlToRel<'_, S> { return not_impl_err!("FETCH clause is not supported yet"); } + let has_with = 
with.is_some(); if let Some(with) = with { self.plan_with_clause(with, planner_context)?; } @@ -99,7 +104,108 @@ impl SqlToRel<'_, S> { } }?; - self.pipe_operators(plan, pipe_operators, planner_context) + let plan = self.pipe_operators(plan, pipe_operators, planner_context)?; + + // Apply CTE materialization if this query had a WITH clause + if has_with { + self.apply_cte_materialization(plan, planner_context) + } else { + Ok(plan) + } + } + + /// Apply CTE materialization to the plan. + /// + /// Materialize ALL multi-referenced CTEs upfront (DuckDB-style). + /// + /// The SQL planner wraps every multi-ref CTE in MaterializedCteProducer/Reader + /// nodes. The `InlineCte` optimizer rule then selectively inlines ones where + /// materialization is not beneficial (cheap CTEs, CTEs under LIMIT, etc.). + /// + /// This approach ensures: + /// 1. The optimizer has full context (explicit CTE nodes in the plan) + /// 2. The inlining decision can be revisited after other optimizer passes + /// 3. DataFrame API users benefit via the optimizer rule + fn apply_cte_materialization( + &self, + plan: LogicalPlan, + planner_context: &mut PlannerContext, + ) -> Result { + if !self + .context_provider + .options() + .execution + .enable_materialized_ctes + { + return Ok(plan); + } + + let cte_names: Vec = planner_context.cte_names().cloned().collect(); + let mut ctes_to_materialize: Vec<(String, LogicalPlan, bool)> = Vec::new(); + + for cte_name in &cte_names { + if planner_context.is_recursive_cte(cte_name) { + continue; + } + if planner_context.is_not_materialized_cte(cte_name) { + continue; + } + + let ref_count = count_cte_references(&plan, cte_name); + let force = planner_context.is_materialized_cte(cte_name); + + // Materialize multi-ref CTEs and explicitly MATERIALIZED CTEs. + // Skip cheap CTEs (literals/empty) — not worth materializing. + // The optimizer's InlineCte rule handles further inlining decisions. 
+ if (ref_count > 1 || force) + && let Some(cte_plan) = planner_context.get_cte(cte_name) + && (force + || !is_cheap_to_inline(cte_plan) + || plan_contains_volatile_functions(cte_plan)) + { + ctes_to_materialize.push((cte_name.clone(), cte_plan.clone(), force)); + } + } + + if ctes_to_materialize.is_empty() { + return Ok(plan); + } + + // Sort by dependency order + ctes_to_materialize.sort_by(|(name_a, _, _), (name_b, _, _)| { + let a_deps_on_b = planner_context + .get_cte(name_a) + .is_some_and(|p| plan_references_cte(p, name_b)); + let b_deps_on_a = planner_context + .get_cte(name_b) + .is_some_and(|p| plan_references_cte(p, name_a)); + if a_deps_on_b { + std::cmp::Ordering::Less + } else if b_deps_on_a { + std::cmp::Ordering::Greater + } else { + std::cmp::Ordering::Equal + } + }); + + let mut result_plan = plan; + for (cte_name, cte_plan, force) in ctes_to_materialize { + result_plan = + replace_cte_with_reader(result_plan, &cte_name, cte_plan.schema())?; + + let producer = MaterializedCteProducer { + name: cte_name.clone(), + cte_plan: Arc::new(cte_plan), + continuation: Arc::new(result_plan.clone()), + schema: Arc::clone(result_plan.schema()), + force_materialized: force, + }; + result_plan = LogicalPlan::Extension(Extension { + node: Arc::new(producer), + }); + } + + Ok(result_plan) } /// Apply pipe operators to a plan @@ -381,6 +487,88 @@ impl SqlToRel<'_, S> { } } +fn plan_contains_volatile_functions(plan: &LogicalPlan) -> bool { + let mut has_volatile = false; + plan.apply(|node| { + for expr in node.expressions() { + if expr.is_volatile() { + has_volatile = true; + return Ok(TreeNodeRecursion::Stop); + } + } + Ok(TreeNodeRecursion::Continue) + }) + .unwrap(); + has_volatile +} + +fn is_cheap_to_inline(plan: &LogicalPlan) -> bool { + match plan { + LogicalPlan::EmptyRelation(_) => true, + LogicalPlan::SubqueryAlias(alias) => is_cheap_to_inline(alias.input.as_ref()), + _ => { + let inputs = plan.inputs(); + inputs.len() == 1 && 
is_cheap_to_inline(inputs[0]) + } + } +} + +/// Check if a plan contains a SubqueryAlias reference to a given CTE name. +fn plan_references_cte(plan: &LogicalPlan, cte_name: &str) -> bool { + let mut found = false; + plan.apply(|node| { + if let LogicalPlan::SubqueryAlias(alias) = node + && alias.alias.table() == cte_name + { + found = true; + return Ok(TreeNodeRecursion::Jump); + } + Ok(TreeNodeRecursion::Continue) + }) + .unwrap(); + found +} + +/// Count how many times a CTE (by SubqueryAlias name) is referenced in the plan tree. +fn count_cte_references(plan: &LogicalPlan, cte_name: &str) -> usize { + let mut count = 0; + plan.apply(|node| { + if let LogicalPlan::SubqueryAlias(alias) = node + && alias.alias.table() == cte_name + { + count += 1; + return Ok(TreeNodeRecursion::Jump); + } + Ok(TreeNodeRecursion::Continue) + }) + .unwrap(); + count +} + +/// Replace SubqueryAlias nodes matching a CTE name with a MaterializedCteReader. +fn replace_cte_with_reader( + plan: LogicalPlan, + cte_name: &str, + cte_schema: &DFSchemaRef, +) -> Result { + plan.transform_down(|node| { + if let LogicalPlan::SubqueryAlias(ref alias) = node + && alias.alias.table() == cte_name + { + let reader = MaterializedCteReader { + name: cte_name.to_string(), + schema: Arc::clone(cte_schema), + }; + let extension = LogicalPlan::Extension(Extension { + node: Arc::new(reader), + }); + return Ok(datafusion_common::tree_node::Transformed::yes(extension)); + } + Ok(datafusion_common::tree_node::Transformed::no(node)) + }) + .map(|t| t.data) +} + /// Returns the order by expressions from the query. 
fn to_order_by_exprs(order_by: Option) -> Result> { to_order_by_exprs_with_select(order_by, None) diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index 08a292475fd72..8718437fa978b 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -187,13 +187,17 @@ impl SqlToRel<'_, S> { // Normalize name and alias let table_ref = self.object_name_to_table_reference(name)?; let table_name = table_ref.to_string(); - let cte = planner_context.get_cte(&table_name); + let cte_plan_cloned = planner_context.get_cte(&table_name).cloned(); + let is_cte = cte_plan_cloned.is_some(); + if is_cte { + planner_context.increment_cte_ref_count(&table_name); + } ( match ( - cte, + cte_plan_cloned, self.context_provider.get_table_source(table_ref.clone()), ) { - (Some(cte_plan), _) => Ok(cte_plan.clone()), + (Some(cte_plan), _) => Ok(cte_plan), (_, Ok(provider)) => LogicalPlanBuilder::scan( table_ref.clone(), provider, diff --git a/datafusion/sqllogictest/test_files/cte.slt b/datafusion/sqllogictest/test_files/cte.slt index d13e0d4f085e9..1dc0aa57e2dd8 100644 --- a/datafusion/sqllogictest/test_files/cte.slt +++ b/datafusion/sqllogictest/test_files/cte.slt @@ -1319,3 +1319,39 @@ RESET datafusion.execution.enable_recursive_ctes; statement ok RESET datafusion.sql_parser.enable_ident_normalization; + +# Materialized CTEs collect all input partitions before readers consume them. +query I +WITH t AS ( + SELECT 1 AS a + UNION ALL SELECT 2 AS a + UNION ALL SELECT 3 AS a + UNION ALL SELECT 4 AS a +) +SELECT sum(l.a + r.a) +FROM t l +JOIN t r ON l.a = r.a; +---- +20 + +# Materialized CTE readers can feed repartitioning join plans without +# re-entering a shared repartition output partition. 
+statement ok +set datafusion.optimizer.prefer_hash_join = false; + +query II rowsort +WITH t1 AS ( + SELECT 11 AS a, 12 AS b + UNION ALL + SELECT 11 AS a, 13 AS b +) +SELECT t2.* +FROM t1 +RIGHT SEMI JOIN t1 t2 +ON t1.a = t2.a AND t1.b = t2.b; +---- +11 12 +11 13 + +statement ok +RESET datafusion.optimizer.prefer_hash_join; diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index 24b1262e026f4..0df26c4274e1c 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -687,150 +687,3 @@ logical_plan statement ok drop table foo; - -# ------------------------------------------------------------------ -# Postgres-style `EXPLAIN (option, ...)` tests (dialect-gated). -# -# These require a dialect whose `supports_explain_with_utility_options()` -# returns true. DataFusion's default Generic dialect also declares this -# (mirroring sqlparser-rs 0.61.0), so the parenthesized form works there -# too. We set PostgreSQL explicitly for clarity. -# ------------------------------------------------------------------ - -statement ok -set datafusion.sql_parser.dialect = 'PostgreSQL'; - -# `EXPLAIN (FORMAT tree)` matches the legacy `EXPLAIN FORMAT tree` form. -query TT -EXPLAIN (FORMAT tree) SELECT 1; ----- -physical_plan -01)┌───────────────────────────┐ -02)│ ProjectionExec │ -03)│ -------------------- │ -04)│ Int64(1): 1 │ -05)└─────────────┬─────────────┘ -06)┌─────────────┴─────────────┐ -07)│ PlaceholderRowExec │ -08)└───────────────────────────┘ - -# Unknown options are rejected with a clear error. -statement error DataFusion error: Error during planning: unknown EXPLAIN option: FOO -EXPLAIN (FOO) SELECT 1; - -# Postgres-only options return a "not supported" message pointing at METRICS. 
-statement error DataFusion error: This feature is not implemented: EXPLAIN option BUFFERS is not supported by DataFusion -EXPLAIN (BUFFERS) SELECT 1; - -statement error DataFusion error: This feature is not implemented: EXPLAIN option WAL is not supported by DataFusion -EXPLAIN (WAL) SELECT 1; - -# LEVEL / METRICS / TIMING / SUMMARY all require ANALYZE. -statement error DataFusion error: Error during planning: EXPLAIN option LEVEL requires ANALYZE -EXPLAIN (LEVEL dev) SELECT 1; - -statement error DataFusion error: Error during planning: EXPLAIN option METRICS requires ANALYZE -EXPLAIN (METRICS 'rows') SELECT 1; - -# COSTS and ANALYZE are mutually exclusive (COSTS only applies to plan-only -# EXPLAIN). -statement error DataFusion error: Error during planning: EXPLAIN option COSTS cannot be combined with ANALYZE -EXPLAIN (ANALYZE, COSTS ON) SELECT 1; - -# TIMING and SUMMARY are sugar for METRICS/LEVEL and likewise need ANALYZE. -statement error DataFusion error: Error during planning: EXPLAIN option METRICS requires ANALYZE -EXPLAIN (TIMING ON) SELECT 1; - -statement error DataFusion error: Error during planning: EXPLAIN option LEVEL requires ANALYZE -EXPLAIN (SUMMARY ON) SELECT 1; - -# VERBOSE is incompatible with any FORMAT, and ANALYZE only supports the -# `indent` and `pgjson` formats — `tree` and `graphviz` are rejected (these -# mappings come from the planner, not the parser). -statement error DataFusion error: Error during planning: EXPLAIN ANALYZE with FORMAT tree is not supported -EXPLAIN (ANALYZE, FORMAT tree) SELECT 1; - -statement error DataFusion error: Error during planning: EXPLAIN VERBOSE with FORMAT is not supported -EXPLAIN (VERBOSE, FORMAT tree) SELECT 1; - -# FORMAT argument can be a bare identifier (already tested) or a quoted -# string and produces the same plan either way. 
-query TT -EXPLAIN (FORMAT 'tree') SELECT 1; ----- -physical_plan -01)┌───────────────────────────┐ -02)│ ProjectionExec │ -03)│ -------------------- │ -04)│ Int64(1): 1 │ -05)└─────────────┬─────────────┘ -06)┌─────────────┴─────────────┐ -07)│ PlaceholderRowExec │ -08)└───────────────────────────┘ - -# Bool option arguments accept bare/ON|OFF/TRUE|FALSE/1|0/=value forms. -# `ANALYZE OFF` is the same as a plain `EXPLAIN`. -query TT -EXPLAIN (ANALYZE OFF, FORMAT tree) SELECT 1; ----- -physical_plan -01)┌───────────────────────────┐ -02)│ ProjectionExec │ -03)│ -------------------- │ -04)│ Int64(1): 1 │ -05)└─────────────┬─────────────┘ -06)┌─────────────┴─────────────┐ -07)│ PlaceholderRowExec │ -08)└───────────────────────────┘ - -# `COSTS OFF` overrides `datafusion.explain.show_statistics` per-statement -# (ANALYZE+COSTS is rejected above). -query TT -EXPLAIN (COSTS OFF) SELECT 1; ----- -logical_plan -01)Projection: Int64(1) -02)--EmptyRelation: rows=1 -physical_plan -01)ProjectionExec: expr=[1 as Int64(1)] -02)--PlaceholderRowExec - -# Bool argument forms: ON / TRUE / 1 all enable the option. The parenthesized -# form does not support `= value` for booleans (sqlparser's utility option -# grammar). Quoted-string booleans are accepted by the option parser. -statement ok -EXPLAIN (COSTS ON) SELECT 1; - -statement ok -EXPLAIN (COSTS TRUE) SELECT 1; - -statement ok -EXPLAIN (COSTS 1) SELECT 1; - -statement ok -EXPLAIN (COSTS 'true') SELECT 1; - -# Unrecognized argument for a boolean option. -statement error DataFusion error: Error during planning: expected boolean for EXPLAIN option costs, got 'maybe' -EXPLAIN (COSTS maybe) SELECT 1; - -# Unrecognized argument for a string/ident option. -statement error DataFusion error: Invalid or Unsupported Configuration: Invalid explain format\. Expected 'indent', 'tree', 'pgjson' or 'graphviz'\. Got 'bogus' -EXPLAIN (FORMAT bogus) SELECT 1; - -# Legacy keyword form still works on PostgreSQL dialect. 
-query TT -EXPLAIN FORMAT tree SELECT 1; ----- -physical_plan -01)┌───────────────────────────┐ -02)│ ProjectionExec │ -03)│ -------------------- │ -04)│ Int64(1): 1 │ -05)└─────────────┬─────────────┘ -06)┌─────────────┴─────────────┐ -07)│ PlaceholderRowExec │ -08)└───────────────────────────┘ - -statement ok -reset datafusion.sql_parser.dialect; diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 3bf101f203fbd..8895af99d58d8 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -218,6 +218,7 @@ datafusion.execution.batch_size 8192 datafusion.execution.coalesce_batches true datafusion.execution.collect_statistics true datafusion.execution.enable_ansi_mode false +datafusion.execution.enable_materialized_ctes false datafusion.execution.enable_recursive_ctes true datafusion.execution.enforce_batch_size_in_joins false datafusion.execution.hash_join_buffering_capacity 0 @@ -325,7 +326,7 @@ datafusion.optimizer.prefer_existing_union false datafusion.optimizer.prefer_hash_join true datafusion.optimizer.preserve_file_partitions 0 datafusion.optimizer.repartition_aggregations true -datafusion.optimizer.repartition_file_min_size 1048576 +datafusion.optimizer.repartition_file_min_size 10485760 datafusion.optimizer.repartition_file_scans true datafusion.optimizer.repartition_joins true datafusion.optimizer.repartition_sorts true @@ -368,6 +369,7 @@ datafusion.execution.batch_size 8192 Default batch size while creating new batch datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. 
The target batch size is determined by the configuration setting datafusion.execution.collect_statistics true Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. datafusion.execution.enable_ansi_mode false Whether to enable ANSI SQL mode. The flag is experimental and relevant only for DataFusion Spark built-in functions When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL semantics for expressions, casting, and error handling. This means: - **Strict type coercion rules:** implicit casts between incompatible types are disallowed. - **Standard SQL arithmetic behavior:** operations such as division by zero, numeric overflow, or invalid casts raise runtime errors rather than returning `NULL` or adjusted values. - **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling. When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive, non-ANSI mode designed for user convenience and backward compatibility. In this mode: - Implicit casts between types are allowed (e.g., string to integer when possible). - Arithmetic operations are more lenient — for example, `abs()` on the minimum representable integer value returns the input value instead of raising overflow. - Division by zero or invalid casts may return `NULL` instead of failing. # Default `false` — ANSI SQL mode is disabled by default. +datafusion.execution.enable_materialized_ctes false Should DataFusion materialize CTEs that are referenced multiple times. When enabled, CTEs referenced more than once are generally computed once and cached, except for cheap CTEs and CTEs consumed below a top-level limit. datafusion.execution.enable_recursive_ctes true Should DataFusion support recursive CTEs datafusion.execution.enforce_batch_size_in_joins false Should DataFusion enforce batch size in joins or not. 
By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower. datafusion.execution.hash_join_buffering_capacity 0 How many bytes to buffer in the probe side of hash joins while the build side is concurrently being built. Without this, hash joins will wait until the full materialization of the build side before polling the probe side. This is useful in scenarios where the query is not completely CPU bounded, allowing to do some early work concurrently and reducing the latency of the query. Note that when hash join buffering is enabled, the probe side will start eagerly polling data, not giving time for the producer side of dynamic filters to produce any meaningful predicate. Queries with dynamic filters might see performance degradation. Disabled by default, set to a number greater than 0 for enabling it. @@ -475,7 +477,7 @@ datafusion.optimizer.prefer_existing_union false When set to true, the optimizer datafusion.optimizer.prefer_hash_join true When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory datafusion.optimizer.preserve_file_partitions 0 Minimum number of distinct partition values required to group files by their Hive partition column values (enabling Hash partitioning declaration). How the option is used: - preserve_file_partitions=0: Disable it. - preserve_file_partitions=1: Always enable it. - preserve_file_partitions=N, actual file partitions=M: Only enable when M >= N. This threshold preserves I/O parallelism when file partitioning is below it. Note: This may reduce parallelism, rooting from the I/O level, if the number of distinct partitions is less than the target_partitions. 
datafusion.optimizer.repartition_aggregations true Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level -datafusion.optimizer.repartition_file_min_size 1048576 Minimum total file size in bytes for file-group byte-range splitting to fire. Files (or merged file groups) smaller than this stay as one partition. Lower values produce more, smaller partitions — better at filling `target_partitions` worth of cores when files are modestly sized, at the cost of slightly more per-partition open / metadata-load overhead. +datafusion.optimizer.repartition_file_min_size 10485760 Minimum total files size in bytes to perform file scan repartitioning. datafusion.optimizer.repartition_file_scans true When set to `true`, datasource partitions will be repartitioned to achieve maximum parallelism. This applies to both in-memory partitions and FileSource's file groups (1 group is 1 partition). For FileSources, only Parquet and CSV formats are currently supported. If set to `true` for a FileSource, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false` for a FileSource, different files will be read in parallel, but repartitioning won't happen within a single file. If set to `true` for an in-memory source, all memtable's partitions will have their batches repartitioned evenly to the desired number of `target_partitions`. Repartitioning can change the total number of partitions and batches per partition, but does not slice the initial record tables provided to the MemTable on creation. 
datafusion.optimizer.repartition_joins true Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level datafusion.optimizer.repartition_sorts true Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below ```text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ``` would turn into the plan below which performs better in multithreaded environments ```text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ``` @@ -895,7 +897,7 @@ show functions statement ok reset datafusion.catalog.information_schema; -# The SLT runner sets `target_partitions` to 4 instead of using the default, so +# The SLT runner sets `target_partitions` to 4 instead of using the default, so # reset it explicitly. statement ok set datafusion.execution.target_partitions = 4; diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 9856a13f00306..7aeba19a6ab8c 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -30,7 +30,6 @@ DataFusion configurations control various aspects of DataFusion planning and exe ## Setting Configuration Options ### Programmatically - You can set the options programmatically via the [`ConfigOptions`] object. 
For example, to configure the `datafusion.execution.target_partitions` using the API: @@ -58,151 +57,153 @@ example, to configure `datafusion.execution.target_partitions`: SET datafusion.execution.target_partitions = '1'; ``` -[`configoptions`]: https://docs.rs/datafusion/latest/datafusion/common/config/struct.ConfigOptions.html -[`configoptions::from_env`]: https://docs.rs/datafusion/latest/datafusion/common/config/struct.ConfigOptions.html#method.from_env +[`ConfigOptions`]: https://docs.rs/datafusion/latest/datafusion/common/config/struct.ConfigOptions.html +[`ConfigOptions::from_env`]: https://docs.rs/datafusion/latest/datafusion/common/config/struct.ConfigOptions.html#method.from_env The following configuration settings are available: -| key | default | description | -| ----------------------------------------------------------------------- | ------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------ | -| datafusion.catalog.create_default_catalog_and_schema | true | Whether the default catalog and schema should be created automatically. | -| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | -| datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | -| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | -| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | -| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | -| datafusion.catalog.has_header | true | Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. | -| datafusion.catalog.newlines_in_values | false | Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. | -| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption | -| datafusion.execution.perfect_hash_join_small_build_threshold | 1024 | A perfect hash join (see `HashJoinExec` for more details) will be considered if the range of keys (max - min) on the build side is < this threshold. 
This provides a fast path for joins with very small key ranges, bypassing the density check. Currently only supports cases where build_side.num_rows() < u32::MAX. Support for build_side.num_rows() >= u32::MAX will be added in the future. | -| datafusion.execution.perfect_hash_join_min_key_density | 0.15 | The minimum required density of join keys on the build side to consider a perfect hash join (see `HashJoinExec` for more details). Density is calculated as: `(number of rows) / (max_key - min_key + 1)`. A perfect hash join may be used if the actual key density > this value. Currently only supports cases where build_side.num_rows() < u32::MAX. Support for build_side.num_rows() >= u32::MAX will be added in the future. | -| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | -| datafusion.execution.collect_statistics | true | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. | -| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system | -| datafusion.execution.time_zone | NULL | The default time zone Some functions, e.g. `now` return timestamps in this time zone | -| datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. 
| -| datafusion.execution.parquet.pruning | true | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | -| datafusion.execution.parquet.skip_metadata | true | (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | -| datafusion.execution.parquet.metadata_size_hint | 524288 | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed. | -| datafusion.execution.parquet.pushdown_filters | false | (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". | -| datafusion.execution.parquet.reorder_filters | false | (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | -| datafusion.execution.parquet.force_filter_selections | false | (reading) Force the use of RowSelections for filter results, when pushdown_filters is enabled. If false, the reader will automatically choose between a RowSelection and a Bitmap based on the number and pattern of selected rows. 
| -| datafusion.execution.parquet.schema_force_view_types | true | (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. | -| datafusion.execution.parquet.binary_as_string | false | (reading) If true, parquet reader will read columns of `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`. Parquet files generated by some legacy writers do not correctly set the UTF8 flag for strings, causing string columns to be loaded as BLOB instead. | -| datafusion.execution.parquet.coerce_int96 | NULL | (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution. | -| datafusion.execution.parquet.coerce_int96_tz | NULL | (reading) Optional timezone applied to INT96 columns when `coerce_int96` is set. When `Some`, INT96 columns coerce to `Timestamp(, Some())` instead of the default `Timestamp(, None)`. Spark and other systems write INT96 values as UTC-adjusted instants, so callers that need the resulting Arrow type to be timezone-aware (e.g. for Spark `TimestampType` semantics) should set this to `"UTC"`. No effect when `coerce_int96` is `None`. | -| datafusion.execution.parquet.bloom_filter_on_read | true | (reading) Use any available bloom filters when reading parquet files | -| datafusion.execution.parquet.max_predicate_cache_size | NULL | (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 
0 means no caching. | -| datafusion.execution.parquet.data_pagesize_limit | 1048576 | (writing) Sets best effort maximum size of data page in bytes | -| datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in rows | -| datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" | -| datafusion.execution.parquet.skip_arrow_metadata | false | (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to | -| datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting. | -| datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | -| datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. 
| -| datafusion.execution.parquet.created_by | datafusion version 53.1.0 | (writing) Sets "created by" property | -| datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | -| datafusion.execution.parquet.statistics_truncate_length | 64 | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | -| datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.bloom_filter_on_write | false | (writing) Write bloom filters for all columns when creating parquet files | -| datafusion.execution.parquet.bloom_filter_fpp | NULL | (writing) Sets bloom filter false positive probability. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.bloom_filter_ndv | NULL | (writing) Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.allow_single_file_parallelism | true | (writing) Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns. | -| datafusion.execution.parquet.maximum_parallel_row_group_writers | 1 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. 
You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | -| datafusion.execution.parquet.maximum_buffered_record_batches_per_stream | 2 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | -| datafusion.execution.parquet.use_content_defined_chunking | NULL | (writing) EXPERIMENTAL: Enable content-defined chunking (CDC) when writing parquet files. When `Some`, CDC is enabled with the given options; when `None` (the default), CDC is disabled. When CDC is enabled, parallel writing is automatically disabled since the chunker state must persist across row groups. | -| datafusion.execution.planning_concurrency | 0 | Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system | -| datafusion.execution.skip_physical_aggregate_schema_check | false | When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step. 
| -| datafusion.execution.spill_compression | uncompressed | Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. | -| datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | -| datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. | -| datafusion.execution.sort_pushdown_buffer_capacity | 1073741824 | Maximum buffer capacity (in bytes) per partition for BufferExec inserted during sort pushdown optimization. When PushdownSort eliminates a SortExec under SortPreservingMergeExec, a BufferExec is inserted to replace SortExec's buffering role. This prevents I/O stalls by allowing the scan to run ahead of the merge. This uses strictly less memory than the SortExec it replaces (which buffers the entire partition). The buffer respects the global memory pool limit. Setting this to a large value is safe — actual memory usage is bounded by partition size and global memory limits. | -| datafusion.execution.max_spill_file_size_bytes | 134217728 | Maximum size in bytes for individual spill files before rotating to a new file. 
When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB | -| datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | -| datafusion.execution.minimum_parallel_output_files | 4 | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. | -| datafusion.execution.soft_max_rows_per_output_file | 50000000 | Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max | -| datafusion.execution.max_buffered_batches_per_output_file | 2 | This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption | -| datafusion.execution.listing_table_ignore_subdirectory | true | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). 
| -| datafusion.execution.listing_table_factory_infer_partitions | true | Should a `ListingTable` created through the `ListingTableFactory` infer table partitions from Hive compliant directories. Defaults to true (partition columns are inferred and will be represented in the table schema). | -| datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs | -| datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental | -| datafusion.execution.keep_partition_by_columns | false | Should DataFusion keep the columns used for partition_by in the output RecordBatches | -| datafusion.execution.skip_partial_aggregation_probe_ratio_threshold | 0.8 | Aggregation ratio (number of distinct groups / number of input rows) threshold for skipping partial aggregation. If the value is greater then partial aggregation will skip aggregation for further input | -| datafusion.execution.skip_partial_aggregation_probe_rows_threshold | 100000 | Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode | -| datafusion.execution.use_row_number_estimates_to_optimize_partitioning | false | Should DataFusion use row number estimates at the input to decide whether increasing parallelism is beneficial or not. By default, only exact row numbers (not estimates) are used for this decision. Setting this flag to `true` will likely produce better plans. if the source of statistics is accurate. We plan to make this the default in the future. | -| datafusion.execution.enforce_batch_size_in_joins | false | Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. 
Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower. | -| datafusion.execution.objectstore_writer_buffer_size | 10485760 | Size (bytes) of data buffer DataFusion uses when writing output files. This affects the size of the data chunks that are uploaded to remote object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being written, it may be necessary to increase this size to avoid errors from the remote end point. | -| datafusion.execution.enable_ansi_mode | false | Whether to enable ANSI SQL mode. The flag is experimental and relevant only for DataFusion Spark built-in functions When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL semantics for expressions, casting, and error handling. This means: - **Strict type coercion rules:** implicit casts between incompatible types are disallowed. - **Standard SQL arithmetic behavior:** operations such as division by zero, numeric overflow, or invalid casts raise runtime errors rather than returning `NULL` or adjusted values. - **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling. When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive, non-ANSI mode designed for user convenience and backward compatibility. In this mode: - Implicit casts between types are allowed (e.g., string to integer when possible). - Arithmetic operations are more lenient — for example, `abs()` on the minimum representable integer value returns the input value instead of raising overflow. - Division by zero or invalid casts may return `NULL` instead of failing. # Default `false` — ANSI SQL mode is disabled by default. | -| datafusion.execution.hash_join_buffering_capacity | 0 | How many bytes to buffer in the probe side of hash joins while the build side is concurrently being built. 
Without this, hash joins will wait until the full materialization of the build side before polling the probe side. This is useful in scenarios where the query is not completely CPU bounded, allowing to do some early work concurrently and reducing the latency of the query. Note that when hash join buffering is enabled, the probe side will start eagerly polling data, not giving time for the producer side of dynamic filters to produce any meaningful predicate. Queries with dynamic filters might see performance degradation. Disabled by default, set to a number greater than 0 for enabling it. | -| datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. | -| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | -| datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | -| datafusion.optimizer.enable_window_limits | true | When set to true, the optimizer will attempt to push limit operations past window functions, if possible | -| datafusion.optimizer.enable_window_topn | false | When set to true, the optimizer will replace Filter(rn<=K) → Window(ROW_NUMBER) → Sort patterns with a PartitionedTopKExec that maintains per-partition heaps, avoiding a full sort of the input. When the window partition key has low cardinality, enabling this optimization can improve performance. However, for high cardinality keys, it may cause regressions in both memory usage and runtime. 
| -| datafusion.optimizer.enable_topk_repartition | true | When set to true, the optimizer will push TopK (Sort with fetch) below hash repartition when the partition key is a prefix of the sort key, reducing data volume before the shuffle. | -| datafusion.optimizer.enable_topk_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down TopK dynamic filters into the file scan phase. | -| datafusion.optimizer.enable_join_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down Join dynamic filters into the file scan phase. | -| datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down Aggregate dynamic filters into the file scan phase. | -| datafusion.optimizer.enable_dynamic_filter_pushdown | true | When set to true attempts to push down dynamic filters generated by operators (TopK, Join & Aggregate) into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. The config will suppress `enable_join_dynamic_filter_pushdown`, `enable_topk_dynamic_filter_pushdown` & `enable_aggregate_dynamic_filter_pushdown` So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden. | -| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. 
| -| datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | -| datafusion.optimizer.repartition_file_min_size | 1048576 | Minimum total file size in bytes for file-group byte-range splitting to fire. Files (or merged file groups) smaller than this stay as one partition. Lower values produce more, smaller partitions — better at filling `target_partitions` worth of cores when files are modestly sized, at the cost of slightly more per-partition open / metadata-load overhead. | -| datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level | -| datafusion.optimizer.allow_symmetric_joins_without_pruning | true | Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. | -| datafusion.optimizer.repartition_file_scans | true | When set to `true`, datasource partitions will be repartitioned to achieve maximum parallelism. This applies to both in-memory partitions and FileSource's file groups (1 group is 1 partition). For FileSources, only Parquet and CSV formats are currently supported. If set to `true` for a FileSource, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. 
If set to `false` for a FileSource, different files will be read in parallel, but repartitioning won't happen within a single file. If set to `true` for an in-memory source, all memtable's partitions will have their batches repartitioned evenly to the desired number of `target_partitions`. Repartitioning can change the total number of partitions and batches per partition, but does not slice the initial record tables provided to the MemTable on creation. | -| datafusion.optimizer.preserve_file_partitions | 0 | Minimum number of distinct partition values required to group files by their Hive partition column values (enabling Hash partitioning declaration). How the option is used: - preserve_file_partitions=0: Disable it. - preserve_file_partitions=1: Always enable it. - preserve_file_partitions=N, actual file partitions=M: Only enable when M >= N. This threshold preserves I/O parallelism when file partitioning is below it. Note: This may reduce parallelism, rooting from the I/O level, if the number of distinct partitions is less than the target_partitions. | -| datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | -| datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` | -| datafusion.optimizer.subset_repartition_threshold | 4 | Partition count threshold for subset satisfaction optimization. 
When the current partition count is >= this threshold, DataFusion will skip repartitioning if the required partitioning expression is a subset of the current partition expression such as Hash(a) satisfies Hash(a, b). When the current partition count is < this threshold, DataFusion will repartition to increase parallelism even when subset satisfaction applies. Set to 0 to always repartition (disable subset satisfaction optimization). Set to a high value to always use subset satisfaction. Example (subset_repartition_threshold = 4): `text Hash([a]) satisfies Hash([a, b]) because (Hash([a, b]) is subset of Hash([a]) If current partitions (3) < threshold (4), repartition: AggregateExec: mode=FinalPartitioned, gby=[a, b], aggr=[SUM(x)] RepartitionExec: partitioning=Hash([a, b], 8), input_partitions=3 AggregateExec: mode=Partial, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 3) If current partitions (8) >= threshold (4), use subset satisfaction: AggregateExec: mode=SinglePartitioned, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 8) ` | -| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e. setting `preserve_order` to true on `RepartitionExec` and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`. | -| datafusion.optimizer.skip_failed_rules | false | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. 
When set to false, any rules that produce errors will cause the query to fail | -| datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | -| datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | -| datafusion.optimizer.join_reordering | true | When set to true, the physical plan optimizer may swap join inputs based on statistics. When set to false, statistics-driven join input reordering is disabled and the original join order in the query is used. | -| datafusion.optimizer.use_statistics_registry | false | When set to true, the physical plan optimizer uses the pluggable `StatisticsRegistry` for statistics propagation across operators. This enables more accurate cardinality estimates compared to each operator's built-in `partition_statistics`. | -| datafusion.optimizer.prefer_hash_join | true | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory | -| datafusion.optimizer.enable_piecewise_merge_join | false | When set to true, piecewise merge join is enabled. PiecewiseMergeJoin is currently experimental. Physical planner will opt for PiecewiseMergeJoin when there is only one range filter. | -| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | -| datafusion.optimizer.hash_join_single_partition_threshold_rows | 131072 | The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition | -| datafusion.optimizer.hash_join_inlist_pushdown_max_size | 131072 | Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering. 
Build sides larger than this will use hash table lookups instead. Set to 0 to always use hash table lookups. InList pushdown can be more efficient for small build sides because it can result in better statistics pruning as well as use any bloom filters present on the scan side. InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion. On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory. This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` \* `target_partitions` memory. The default is 128kB per partition. This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases but avoids excessive memory usage or overhead for larger joins. | -| datafusion.optimizer.hash_join_inlist_pushdown_max_distinct_values | 150 | Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides with more rows than this will use hash table lookups instead. Set to 0 to always use hash table lookups. This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent very large IN lists that might not provide much benefit over hash table lookups. This uses the deduplicated row count once the build side has been evaluated. The default is 150 values per partition. This is inspired by Trino's `max-filter-keys-per-column` setting. See: | -| datafusion.optimizer.default_filter_selectivity | 20 | The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected). 
| -| datafusion.optimizer.prefer_existing_union | false | When set to true, the optimizer will not attempt to convert Union to Interleave | -| datafusion.optimizer.expand_views_at_output | false | When set to true, if the returned type is a view type then the output will be coerced to a non-view. Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`. | -| datafusion.optimizer.enable_sort_pushdown | true | Enable sort pushdown optimization. When enabled, attempts to push sort requirements down to data sources that can natively handle them (e.g., by reversing file/row group read order). Returns **inexact ordering**: Sort operator is kept for correctness, but optimized input enables early termination for TopK queries (ORDER BY ... LIMIT N), providing significant speedup. Memory: No additional overhead (only changes read order). Future: Will add option to detect perfectly sorted data and eliminate Sort completely. Default: true | -| datafusion.optimizer.enable_leaf_expression_pushdown | true | When set to true, the optimizer will extract leaf expressions (such as `get_field`) from filter/sort/join nodes into projections closer to the leaf table scans, and push those projections down towards the leaf nodes. | -| datafusion.optimizer.enable_unions_to_filter | false | When set to true, the logical optimizer will rewrite `UNION DISTINCT` branches that read from the same source and differ only by filter predicates into a single branch with a combined filter. This optimization is conservative and only applies when the branches share the same source and compatible wrapper nodes such as identical projections or aliases. 
| -| datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans | -| datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans | -| datafusion.explain.show_statistics | false | When set to true, the explain statement will print operator statistics for physical plans | -| datafusion.explain.show_sizes | true | When set to true, the explain statement will print the partition sizes | -| datafusion.explain.show_schema | false | When set to true, the explain statement will print schema information | -| datafusion.explain.format | indent | Display format of explain. Default is "indent". When set to "tree", it will print the plan in a tree-rendered format. | -| datafusion.explain.tree_maximum_render_width | 240 | (format=tree only) Maximum total width of the rendered tree. When set to 0, the tree will have no width limit. | -| datafusion.explain.analyze_level | dev | Verbosity level for "EXPLAIN ANALYZE". Default is "dev" "summary" shows common metrics for high-level insights. "dev" provides deep operator-level introspection for developers. | -| datafusion.explain.analyze_categories | all | Which metric categories to include in "EXPLAIN ANALYZE" output. Comma-separated list of: "rows", "bytes", "timing", "uncategorized". Use "none" to show plan structure only, or "all" (default) to show everything. Metrics without a declared category are treated as "uncategorized". | -| datafusion.sql_parser.parse_float_as_decimal | false | When set to true, SQL parser will parse float as decimal type | -| datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) | -| datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). 
Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. | -| datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. | -| datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | -| datafusion.sql_parser.map_string_types_to_utf8view | true | If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. | -| datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. | -| datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries | -| datafusion.sql_parser.default_null_ordering | nulls_max | Specifies the default null ordering for query results. There are 4 options: - `nulls_max`: Nulls appear last in ascending order. - `nulls_min`: Nulls appear first in ascending order. - `nulls_first`: Nulls always be first in any order. - `nulls_last`: Nulls always be last in any order. By default, `nulls_max` is used to follow Postgres's behavior. 
postgres rule: | -| datafusion.sql_parser.enable_subquery_sort_elimination | true | When set to true, DataFusion may remove `ORDER BY` clauses from subqueries or CTEs during SQL planning when their ordering cannot affect the result, such as when no `LIMIT` or other order-sensitive operator depends on them. Disable this option to preserve explicit subquery ordering in the planned query. | -| datafusion.format.safe | true | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`] | -| datafusion.format.null | | Format string for nulls | -| datafusion.format.date_format | %Y-%m-%d | Date format for date arrays | -| datafusion.format.datetime_format | %Y-%m-%dT%H:%M:%S%.f | Format for DateTime arrays | -| datafusion.format.timestamp_format | %Y-%m-%dT%H:%M:%S%.f | Timestamp format for timestamp arrays | -| datafusion.format.timestamp_tz_format | NULL | Timestamp format for timestamp with timezone arrays. When `None`, ISO 8601 format is used. | -| datafusion.format.time_format | %H:%M:%S%.f | Time format for time arrays | -| datafusion.format.duration_format | pretty | Duration format. Can be either `"pretty"` or `"ISO8601"` | -| datafusion.format.types_info | false | Show types in visual representation batches | +| key | default | description | +|-----|---------|-------------| +| datafusion.catalog.create_default_catalog_and_schema | true | Whether the default catalog and schema should be created automatically. 
| +| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | +| datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | +| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | +| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | +| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | +| datafusion.catalog.has_header | true | Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. | +| datafusion.catalog.newlines_in_values | false | Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. | +| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption | +| datafusion.execution.perfect_hash_join_small_build_threshold | 1024 | A perfect hash join (see `HashJoinExec` for more details) will be considered if the range of keys (max - min) on the build side is < this threshold. This provides a fast path for joins with very small key ranges, bypassing the density check. Currently only supports cases where build_side.num_rows() < u32::MAX. Support for build_side.num_rows() >= u32::MAX will be added in the future. 
| +| datafusion.execution.perfect_hash_join_min_key_density | 0.15 | The minimum required density of join keys on the build side to consider a perfect hash join (see `HashJoinExec` for more details). Density is calculated as: `(number of rows) / (max_key - min_key + 1)`. A perfect hash join may be used if the actual key density > this value. Currently only supports cases where build_side.num_rows() < u32::MAX. Support for build_side.num_rows() >= u32::MAX will be added in the future. | +| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | +| datafusion.execution.collect_statistics | true | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. | +| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system | +| datafusion.execution.time_zone | NULL | The default time zone Some functions, e.g. `now` return timestamps in this time zone | +| datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. | +| datafusion.execution.parquet.pruning | true | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | +| datafusion.execution.parquet.skip_metadata | true | (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. 
This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | +| datafusion.execution.parquet.metadata_size_hint | 524288 | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer. Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed. | +| datafusion.execution.parquet.pushdown_filters | false | (reading) If true, filter expressions are applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". | +| datafusion.execution.parquet.reorder_filters | false | (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | +| datafusion.execution.parquet.force_filter_selections | false | (reading) Force the use of RowSelections for filter results, when pushdown_filters is enabled. If false, the reader will automatically choose between a RowSelection and a Bitmap based on the number and pattern of selected rows. | +| datafusion.execution.parquet.schema_force_view_types | true | (reading) If true, parquet reader will read columns of `Utf8/LargeUtf8` with `Utf8View`, and `Binary/LargeBinary` with `BinaryView`. | +| datafusion.execution.parquet.binary_as_string | false | (reading) If true, parquet reader will read columns of `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`.
Parquet files generated by some legacy writers do not correctly set the UTF8 flag for strings, causing string columns to be loaded as BLOB instead. | +| datafusion.execution.parquet.coerce_int96 | NULL | (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution. | +| datafusion.execution.parquet.coerce_int96_tz | NULL | (reading) Optional timezone applied to INT96 columns when `coerce_int96` is set. When `Some`, INT96 columns coerce to `Timestamp(, Some())` instead of the default `Timestamp(, None)`. Spark and other systems write INT96 values as UTC-adjusted instants, so callers that need the resulting Arrow type to be timezone-aware (e.g. for Spark `TimestampType` semantics) should set this to `"UTC"`. No effect when `coerce_int96` is `None`. | +| datafusion.execution.parquet.bloom_filter_on_read | true | (reading) Use any available bloom filters when reading parquet files | +| datafusion.execution.parquet.max_predicate_cache_size | NULL | (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching. 
| +| datafusion.execution.parquet.data_pagesize_limit | 1048576 | (writing) Sets best effort maximum size of data page in bytes | +| datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in rows | +| datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" | +| datafusion.execution.parquet.skip_arrow_metadata | false | (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to | +| datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting. | +| datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | +| datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. 
| +| datafusion.execution.parquet.created_by | datafusion version 53.1.0 | (writing) Sets "created by" property | +| datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | +| datafusion.execution.parquet.statistics_truncate_length | 64 | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | +| datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.bloom_filter_on_write | false | (writing) Write bloom filters for all columns when creating parquet files | +| datafusion.execution.parquet.bloom_filter_fpp | NULL | (writing) Sets bloom filter false positive probability. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.bloom_filter_ndv | NULL | (writing) Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.allow_single_file_parallelism | true | (writing) Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns. | +| datafusion.execution.parquet.maximum_parallel_row_group_writers | 1 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. 
You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | +| datafusion.execution.parquet.maximum_buffered_record_batches_per_stream | 2 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | +| datafusion.execution.parquet.use_content_defined_chunking | NULL | (writing) EXPERIMENTAL: Enable content-defined chunking (CDC) when writing parquet files. When `Some`, CDC is enabled with the given options; when `None` (the default), CDC is disabled. When CDC is enabled, parallel writing is automatically disabled since the chunker state must persist across row groups. | +| datafusion.execution.planning_concurrency | 0 | Fan-out during initial physical planning. This is mostly used to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system | +| datafusion.execution.skip_physical_aggregate_schema_check | false | When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to work around bugs in the planner that are now caught by the new schema verification step.
| +| datafusion.execution.spill_compression | uncompressed | Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. | +| datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | +| datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. | +| datafusion.execution.sort_pushdown_buffer_capacity | 1073741824 | Maximum buffer capacity (in bytes) per partition for BufferExec inserted during sort pushdown optimization. When PushdownSort eliminates a SortExec under SortPreservingMergeExec, a BufferExec is inserted to replace SortExec's buffering role. This prevents I/O stalls by allowing the scan to run ahead of the merge. This uses strictly less memory than the SortExec it replaces (which buffers the entire partition). The buffer respects the global memory pool limit. Setting this to a large value is safe — actual memory usage is bounded by partition size and global memory limits. | +| datafusion.execution.max_spill_file_size_bytes | 134217728 | Maximum size in bytes for individual spill files before rotating to a new file. 
When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB | +| datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | +| datafusion.execution.minimum_parallel_output_files | 4 | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. | +| datafusion.execution.soft_max_rows_per_output_file | 50000000 | Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max | +| datafusion.execution.max_buffered_batches_per_output_file | 2 | This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption | +| datafusion.execution.listing_table_ignore_subdirectory | true | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). 
| +| datafusion.execution.listing_table_factory_infer_partitions | true | Should a `ListingTable` created through the `ListingTableFactory` infer table partitions from Hive compliant directories. Defaults to true (partition columns are inferred and will be represented in the table schema). | +| datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs | +| datafusion.execution.enable_materialized_ctes | false | Should DataFusion materialize CTEs that are referenced multiple times. When enabled, CTEs referenced more than once are generally computed once and cached, except for cheap CTEs and CTEs consumed below a top-level limit. | +| datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental | +| datafusion.execution.keep_partition_by_columns | false | Should DataFusion keep the columns used for partition_by in the output RecordBatches | +| datafusion.execution.skip_partial_aggregation_probe_ratio_threshold | 0.8 | Aggregation ratio (number of distinct groups / number of input rows) threshold for skipping partial aggregation. If the observed ratio is greater than this threshold, partial aggregation will skip aggregation for further input | +| datafusion.execution.skip_partial_aggregation_probe_rows_threshold | 100000 | Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode | +| datafusion.execution.use_row_number_estimates_to_optimize_partitioning | false | Should DataFusion use row number estimates at the input to decide whether increasing parallelism is beneficial or not. By default, only exact row numbers (not estimates) are used for this decision. Setting this flag to `true` will likely produce better plans if the source of statistics is accurate. We plan to make this the default in the future.
| +| datafusion.execution.enforce_batch_size_in_joins | false | Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower. | +| datafusion.execution.objectstore_writer_buffer_size | 10485760 | Size (bytes) of data buffer DataFusion uses when writing output files. This affects the size of the data chunks that are uploaded to remote object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being written, it may be necessary to increase this size to avoid errors from the remote end point. | +| datafusion.execution.enable_ansi_mode | false | Whether to enable ANSI SQL mode. The flag is experimental and relevant only for DataFusion Spark built-in functions. When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL semantics for expressions, casting, and error handling. This means: - **Strict type coercion rules:** implicit casts between incompatible types are disallowed. - **Standard SQL arithmetic behavior:** operations such as division by zero, numeric overflow, or invalid casts raise runtime errors rather than returning `NULL` or adjusted values. - **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling. When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive, non-ANSI mode designed for user convenience and backward compatibility. In this mode: - Implicit casts between types are allowed (e.g., string to integer when possible). - Arithmetic operations are more lenient — for example, `abs()` on the minimum representable integer value returns the input value instead of raising overflow. - Division by zero or invalid casts may return `NULL` instead of failing. # Default `false` — ANSI SQL mode is disabled by default.
| +| datafusion.execution.hash_join_buffering_capacity | 0 | How many bytes to buffer in the probe side of hash joins while the build side is concurrently being built. Without this, hash joins will wait until the full materialization of the build side before polling the probe side. This is useful in scenarios where the query is not completely CPU bounded, allowing to do some early work concurrently and reducing the latency of the query. Note that when hash join buffering is enabled, the probe side will start eagerly polling data, not giving time for the producer side of dynamic filters to produce any meaningful predicate. Queries with dynamic filters might see performance degradation. Disabled by default, set to a number greater than 0 for enabling it. | +| datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. | +| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | +| datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | +| datafusion.optimizer.enable_window_limits | true | When set to true, the optimizer will attempt to push limit operations past window functions, if possible | +| datafusion.optimizer.enable_window_topn | false | When set to true, the optimizer will replace Filter(rn<=K) → Window(ROW_NUMBER) → Sort patterns with a PartitionedTopKExec that maintains per-partition heaps, avoiding a full sort of the input. When the window partition key has low cardinality, enabling this optimization can improve performance. 
However, for high cardinality keys, it may cause regressions in both memory usage and runtime. | +| datafusion.optimizer.enable_topk_repartition | true | When set to true, the optimizer will push TopK (Sort with fetch) below hash repartition when the partition key is a prefix of the sort key, reducing data volume before the shuffle. | +| datafusion.optimizer.enable_topk_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down TopK dynamic filters into the file scan phase. | +| datafusion.optimizer.enable_join_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down Join dynamic filters into the file scan phase. | +| datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down Aggregate dynamic filters into the file scan phase. | +| datafusion.optimizer.enable_dynamic_filter_pushdown | true | When set to true attempts to push down dynamic filters generated by operators (TopK, Join & Aggregate) into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. The config will suppress `enable_join_dynamic_filter_pushdown`, `enable_topk_dynamic_filter_pushdown` & `enable_aggregate_dynamic_filter_pushdown` So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden. | +| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. 
This filter can add additional overhead when the file format does not fully support predicate push down. | +| datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | +| datafusion.optimizer.repartition_file_min_size | 10485760 | Minimum total files size in bytes to perform file scan repartitioning. | +| datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level | +| datafusion.optimizer.allow_symmetric_joins_without_pruning | true | Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. | +| datafusion.optimizer.repartition_file_scans | true | When set to `true`, datasource partitions will be repartitioned to achieve maximum parallelism. This applies to both in-memory partitions and FileSource's file groups (1 group is 1 partition). For FileSources, only Parquet and CSV formats are currently supported. If set to `true` for a FileSource, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false` for a FileSource, different files will be read in parallel, but repartitioning won't happen within a single file. 
If set to `true` for an in-memory source, all memtable's partitions will have their batches repartitioned evenly to the desired number of `target_partitions`. Repartitioning can change the total number of partitions and batches per partition, but does not slice the initial record tables provided to the MemTable on creation. | +| datafusion.optimizer.preserve_file_partitions | 0 | Minimum number of distinct partition values required to group files by their Hive partition column values (enabling Hash partitioning declaration). How the option is used: - preserve_file_partitions=0: Disable it. - preserve_file_partitions=1: Always enable it. - preserve_file_partitions=N, actual file partitions=M: Only enable when M >= N. This threshold preserves I/O parallelism when file partitioning is below it. Note: This may reduce parallelism, rooting from the I/O level, if the number of distinct partitions is less than the target_partitions. | +| datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | +| datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below ```text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ``` would turn into the plan below which performs better in multithreaded environments ```text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ``` | +| datafusion.optimizer.subset_repartition_threshold | 4 | Partition count threshold for subset satisfaction optimization. 
When the current partition count is >= this threshold, DataFusion will skip repartitioning if the required partitioning expression is a subset of the current partition expression such as Hash(a) satisfies Hash(a, b). When the current partition count is < this threshold, DataFusion will repartition to increase parallelism even when subset satisfaction applies. Set to 0 to always repartition (disable subset satisfaction optimization). Set to a high value to always use subset satisfaction. Example (subset_repartition_threshold = 4): ```text Hash([a]) satisfies Hash([a, b]) because (Hash([a, b]) is subset of Hash([a]) If current partitions (3) < threshold (4), repartition: AggregateExec: mode=FinalPartitioned, gby=[a, b], aggr=[SUM(x)] RepartitionExec: partitioning=Hash([a, b], 8), input_partitions=3 AggregateExec: mode=Partial, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 3) If current partitions (8) >= threshold (4), use subset satisfaction: AggregateExec: mode=SinglePartitioned, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 8) ``` | +| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e. setting `preserve_order` to true on `RepartitionExec` and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`. | +| datafusion.optimizer.skip_failed_rules | false | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. 
When set to false, any rules that produce errors will cause the query to fail | +| datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | +| datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | +| datafusion.optimizer.join_reordering | true | When set to true, the physical plan optimizer may swap join inputs based on statistics. When set to false, statistics-driven join input reordering is disabled and the original join order in the query is used. | +| datafusion.optimizer.use_statistics_registry | false | When set to true, the physical plan optimizer uses the pluggable `StatisticsRegistry` for statistics propagation across operators. This enables more accurate cardinality estimates compared to each operator's built-in `partition_statistics`. | +| datafusion.optimizer.prefer_hash_join | true | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory | +| datafusion.optimizer.enable_piecewise_merge_join | false | When set to true, piecewise merge join is enabled. PiecewiseMergeJoin is currently experimental. Physical planner will opt for PiecewiseMergeJoin when there is only one range filter. | +| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | +| datafusion.optimizer.hash_join_single_partition_threshold_rows | 131072 | The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition | +| datafusion.optimizer.hash_join_inlist_pushdown_max_size | 131072 | Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering. 
Build sides larger than this will use hash table lookups instead. Set to 0 to always use hash table lookups. InList pushdown can be more efficient for small build sides because it can result in better statistics pruning as well as use any bloom filters present on the scan side. InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion. On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory. This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` * `target_partitions` memory. The default is 128kB per partition. This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases but avoids excessive memory usage or overhead for larger joins. | +| datafusion.optimizer.hash_join_inlist_pushdown_max_distinct_values | 150 | Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides with more rows than this will use hash table lookups instead. Set to 0 to always use hash table lookups. This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent very large IN lists that might not provide much benefit over hash table lookups. This uses the deduplicated row count once the build side has been evaluated. The default is 150 values per partition. This is inspired by Trino's `max-filter-keys-per-column` setting. See: <https://trino.io/docs/current/admin/dynamic-filtering.html#dynamic-filter-collection-thresholds> | +| datafusion.optimizer.default_filter_selectivity | 20 | The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected).
| +| datafusion.optimizer.prefer_existing_union | false | When set to true, the optimizer will not attempt to convert Union to Interleave | +| datafusion.optimizer.expand_views_at_output | false | When set to true, if the returned type is a view type then the output will be coerced to a non-view. Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`. | +| datafusion.optimizer.enable_sort_pushdown | true | Enable sort pushdown optimization. When enabled, attempts to push sort requirements down to data sources that can natively handle them (e.g., by reversing file/row group read order). Returns **inexact ordering**: Sort operator is kept for correctness, but optimized input enables early termination for TopK queries (ORDER BY ... LIMIT N), providing significant speedup. Memory: No additional overhead (only changes read order). Future: Will add option to detect perfectly sorted data and eliminate Sort completely. Default: true | +| datafusion.optimizer.enable_leaf_expression_pushdown | true | When set to true, the optimizer will extract leaf expressions (such as `get_field`) from filter/sort/join nodes into projections closer to the leaf table scans, and push those projections down towards the leaf nodes. | +| datafusion.optimizer.enable_unions_to_filter | false | When set to true, the logical optimizer will rewrite `UNION DISTINCT` branches that read from the same source and differ only by filter predicates into a single branch with a combined filter. This optimization is conservative and only applies when the branches share the same source and compatible wrapper nodes such as identical projections or aliases. 
| +| datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans | +| datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans | +| datafusion.explain.show_statistics | false | When set to true, the explain statement will print operator statistics for physical plans | +| datafusion.explain.show_sizes | true | When set to true, the explain statement will print the partition sizes | +| datafusion.explain.show_schema | false | When set to true, the explain statement will print schema information | +| datafusion.explain.format | indent | Display format of explain. Default is "indent". When set to "tree", it will print the plan in a tree-rendered format. | +| datafusion.explain.tree_maximum_render_width | 240 | (format=tree only) Maximum total width of the rendered tree. When set to 0, the tree will have no width limit. | +| datafusion.explain.analyze_level | dev | Verbosity level for "EXPLAIN ANALYZE". Default is "dev" "summary" shows common metrics for high-level insights. "dev" provides deep operator-level introspection for developers. | +| datafusion.explain.analyze_categories | all | Which metric categories to include in "EXPLAIN ANALYZE" output. Comma-separated list of: "rows", "bytes", "timing", "uncategorized". Use "none" to show plan structure only, or "all" (default) to show everything. Metrics without a declared category are treated as "uncategorized". | +| datafusion.sql_parser.parse_float_as_decimal | false | When set to true, SQL parser will parse float as decimal type | +| datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) | +| datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). 
Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. | +| datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. | +| datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | +| datafusion.sql_parser.map_string_types_to_utf8view | true | If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. | +| datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. | +| datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries | +| datafusion.sql_parser.default_null_ordering | nulls_max | Specifies the default null ordering for query results. There are 4 options: - `nulls_max`: Nulls appear last in ascending order. - `nulls_min`: Nulls appear first in ascending order. - `nulls_first`: Nulls always be first in any order. - `nulls_last`: Nulls always be last in any order. By default, `nulls_max` is used to follow Postgres's behavior. 
postgres rule: | +| datafusion.sql_parser.enable_subquery_sort_elimination | true | When set to true, DataFusion may remove `ORDER BY` clauses from subqueries or CTEs during SQL planning when their ordering cannot affect the result, such as when no `LIMIT` or other order-sensitive operator depends on them. Disable this option to preserve explicit subquery ordering in the planned query. | +| datafusion.format.safe | true | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`] | +| datafusion.format.null | | Format string for nulls | +| datafusion.format.date_format | %Y-%m-%d | Date format for date arrays | +| datafusion.format.datetime_format | %Y-%m-%dT%H:%M:%S%.f | Format for DateTime arrays | +| datafusion.format.timestamp_format | %Y-%m-%dT%H:%M:%S%.f | Timestamp format for timestamp arrays | +| datafusion.format.timestamp_tz_format | NULL | Timestamp format for timestamp with timezone arrays. When `None`, ISO 8601 format is used. | +| datafusion.format.time_format | %H:%M:%S%.f | Time format for time arrays | +| datafusion.format.duration_format | pretty | Duration format. Can be either `"pretty"` or `"ISO8601"` | +| datafusion.format.types_info | false | Show types in visual representation batches | + You can also reset configuration options to default settings via SQL using the `RESET` command. For example, to set and reset `datafusion.execution.batch_size`: @@ -231,15 +232,16 @@ SET datafusion.runtime.memory_limit = '2G'; The following runtime configuration settings are available: -| key | default | description | -| ---------------------------------------------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| datafusion.runtime.file_statistics_cache_limit | 20M | Maximum memory to use for file statistics cache. 
Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | -| datafusion.runtime.list_files_cache_limit | 1M | Maximum memory to use for list files cache. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | -| datafusion.runtime.list_files_cache_ttl | NULL | TTL (time-to-live) of the entries in the list file cache. Supports units m (minutes), and s (seconds). Example: '2m' for 2 minutes. | -| datafusion.runtime.max_temp_directory_size | 100G | Maximum temporary file directory size. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | -| datafusion.runtime.memory_limit | NULL | Maximum memory limit for query execution. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | -| datafusion.runtime.metadata_cache_limit | 50M | Maximum memory to use for file metadata cache such as Parquet metadata. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | -| datafusion.runtime.temp_directory | NULL | The path to the temporary file directory. | +| key | default | description | +|-----|---------|-------------| +| datafusion.runtime.file_statistics_cache_limit | 20M | Maximum memory to use for file statistics cache. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | +| datafusion.runtime.list_files_cache_limit | 1M | Maximum memory to use for list files cache. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | +| datafusion.runtime.list_files_cache_ttl | NULL | TTL (time-to-live) of the entries in the list file cache. Supports units m (minutes), and s (seconds). Example: '2m' for 2 minutes. 
| +| datafusion.runtime.max_temp_directory_size | 100G | Maximum temporary file directory size. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | +| datafusion.runtime.memory_limit | NULL | Maximum memory limit for query execution. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | +| datafusion.runtime.metadata_cache_limit | 50M | Maximum memory to use for file metadata cache such as Parquet metadata. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | +| datafusion.runtime.temp_directory | NULL | The path to the temporary file directory. | + # Tuning Guide @@ -253,7 +255,7 @@ to enable parallelization can dominate the actual computation. You can find out how many cores are being used via the [`EXPLAIN`] command and look at the number of partitions in the plan. -[`explain`]: sql/explain.md +[`EXPLAIN`]: sql/explain.md The `datafusion.optimizer.repartition_file_min_size` option controls the minimum file size the [`ListingTable`] provider will attempt to repartition. However, this @@ -267,21 +269,21 @@ than 1MB), we recommend setting `target_partitions` to 1 to avoid repartitioning SET datafusion.execution.target_partitions = '1'; ``` -[`listingtable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html +[`ListingTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html ## Memory-limited Queries -When executing a memory-consuming query under a tight memory limit, DataFusion +When executing a memory-consuming query under a tight memory limit, DataFusion will spill intermediate results to disk. -When the [`FairSpillPool`] is used, memory is divided evenly among partitions. 
-The higher the value of `datafusion.execution.target_partitions`, the less memory -is allocated to each partition, and the out-of-core execution path may trigger +When the [`FairSpillPool`] is used, memory is divided evenly among partitions. +The higher the value of `datafusion.execution.target_partitions`, the less memory +is allocated to each partition, and the out-of-core execution path may trigger more frequently, possibly slowing down execution. Additionally, while spilling, data is read back in `datafusion.execution.batch_size` size batches. The larger this value, the fewer spilled sorted runs can be merged. Decreasing this setting -can help reduce the number of subsequent spills required. +can help reduce the number of subsequent spills required. In conclusion, for queries under a very tight memory limit, it's recommended to set `target_partitions` and `batch_size` to smaller values. @@ -293,7 +295,7 @@ SET datafusion.execution.target_partitions = 4; SET datafusion.execution.batch_size = 1024; ``` -[`fairspillpool`]: https://docs.rs/datafusion/latest/datafusion/execution/memory_pool/struct.FairSpillPool.html +[`FairSpillPool`]: https://docs.rs/datafusion/latest/datafusion/execution/memory_pool/struct.FairSpillPool.html ## Join Queries @@ -313,13 +315,13 @@ condition of the two tables. You can modify join optimization behavior in your queries by setting specific configuration values. Use the following command to update a configuration: -```sql +``` sql SET datafusion.optimizer.<configuration_name>; ``` Example -```sql +``` sql SET datafusion.optimizer.prefer_hash_join = false; ``` @@ -354,3 +356,4 @@ Enables the experimental Piecewise Merge Join algorithm. - Piecewise Merge Join is faster than Nested Loop Join performance wise for single range filter except for cases where it is joining two large tables (num_rows > 100,000) that are approximately equal in size.
+ From 98af4da548d8a226588cf121e5ad481728eaca4c Mon Sep 17 00:00:00 2001 From: Nathan Bezualem Date: Sun, 31 May 2026 13:04:37 -0400 Subject: [PATCH 2/6] revert and clean up + CI fix --- .../memory_pool_tracking.rs | 3 +- .../sqllogictest/test_files/explain.slt | 147 ++++++++ .../test_files/information_schema.slt | 6 +- docs/source/user-guide/configs.md | 328 +++++++++--------- 4 files changed, 314 insertions(+), 170 deletions(-) diff --git a/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs b/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs index b723f05bad8b6..d849a033bc66b 100644 --- a/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs +++ b/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs @@ -54,8 +54,7 @@ async fn automatic_usage_example() -> Result<()> { .with_memory_limit(5_000_000, 1.0) // 5MB, 100% utilization .build_arc()?; - let mut config = SessionConfig::new(); - config.options_mut().execution.enable_materialized_ctes = false; + let config = SessionConfig::new(); let ctx = SessionContext::new_with_config_rt(config, runtime); // Create a simple table for demonstration diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index 0df26c4274e1c..24b1262e026f4 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -687,3 +687,150 @@ logical_plan statement ok drop table foo; + +# ------------------------------------------------------------------ +# Postgres-style `EXPLAIN (option, ...)` tests (dialect-gated). +# +# These require a dialect whose `supports_explain_with_utility_options()` +# returns true. DataFusion's default Generic dialect also declares this +# (mirroring sqlparser-rs 0.61.0), so the parenthesized form works there +# too. We set PostgreSQL explicitly for clarity. 
+# ------------------------------------------------------------------ + +statement ok +set datafusion.sql_parser.dialect = 'PostgreSQL'; + +# `EXPLAIN (FORMAT tree)` matches the legacy `EXPLAIN FORMAT tree` form. +query TT +EXPLAIN (FORMAT tree) SELECT 1; +---- +physical_plan +01)┌───────────────────────────┐ +02)│ ProjectionExec │ +03)│ -------------------- │ +04)│ Int64(1): 1 │ +05)└─────────────┬─────────────┘ +06)┌─────────────┴─────────────┐ +07)│ PlaceholderRowExec │ +08)└───────────────────────────┘ + +# Unknown options are rejected with a clear error. +statement error DataFusion error: Error during planning: unknown EXPLAIN option: FOO +EXPLAIN (FOO) SELECT 1; + +# Postgres-only options return a "not supported" message pointing at METRICS. +statement error DataFusion error: This feature is not implemented: EXPLAIN option BUFFERS is not supported by DataFusion +EXPLAIN (BUFFERS) SELECT 1; + +statement error DataFusion error: This feature is not implemented: EXPLAIN option WAL is not supported by DataFusion +EXPLAIN (WAL) SELECT 1; + +# LEVEL / METRICS / TIMING / SUMMARY all require ANALYZE. +statement error DataFusion error: Error during planning: EXPLAIN option LEVEL requires ANALYZE +EXPLAIN (LEVEL dev) SELECT 1; + +statement error DataFusion error: Error during planning: EXPLAIN option METRICS requires ANALYZE +EXPLAIN (METRICS 'rows') SELECT 1; + +# COSTS and ANALYZE are mutually exclusive (COSTS only applies to plan-only +# EXPLAIN). +statement error DataFusion error: Error during planning: EXPLAIN option COSTS cannot be combined with ANALYZE +EXPLAIN (ANALYZE, COSTS ON) SELECT 1; + +# TIMING and SUMMARY are sugar for METRICS/LEVEL and likewise need ANALYZE. 
+statement error DataFusion error: Error during planning: EXPLAIN option METRICS requires ANALYZE +EXPLAIN (TIMING ON) SELECT 1; + +statement error DataFusion error: Error during planning: EXPLAIN option LEVEL requires ANALYZE +EXPLAIN (SUMMARY ON) SELECT 1; + +# VERBOSE is incompatible with any FORMAT, and ANALYZE only supports the +# `indent` and `pgjson` formats — `tree` and `graphviz` are rejected (these +# mappings come from the planner, not the parser). +statement error DataFusion error: Error during planning: EXPLAIN ANALYZE with FORMAT tree is not supported +EXPLAIN (ANALYZE, FORMAT tree) SELECT 1; + +statement error DataFusion error: Error during planning: EXPLAIN VERBOSE with FORMAT is not supported +EXPLAIN (VERBOSE, FORMAT tree) SELECT 1; + +# FORMAT argument can be a bare identifier (already tested) or a quoted +# string and produces the same plan either way. +query TT +EXPLAIN (FORMAT 'tree') SELECT 1; +---- +physical_plan +01)┌───────────────────────────┐ +02)│ ProjectionExec │ +03)│ -------------------- │ +04)│ Int64(1): 1 │ +05)└─────────────┬─────────────┘ +06)┌─────────────┴─────────────┐ +07)│ PlaceholderRowExec │ +08)└───────────────────────────┘ + +# Bool option arguments accept bare/ON|OFF/TRUE|FALSE/1|0/=value forms. +# `ANALYZE OFF` is the same as a plain `EXPLAIN`. +query TT +EXPLAIN (ANALYZE OFF, FORMAT tree) SELECT 1; +---- +physical_plan +01)┌───────────────────────────┐ +02)│ ProjectionExec │ +03)│ -------------------- │ +04)│ Int64(1): 1 │ +05)└─────────────┬─────────────┘ +06)┌─────────────┴─────────────┐ +07)│ PlaceholderRowExec │ +08)└───────────────────────────┘ + +# `COSTS OFF` overrides `datafusion.explain.show_statistics` per-statement +# (ANALYZE+COSTS is rejected above). +query TT +EXPLAIN (COSTS OFF) SELECT 1; +---- +logical_plan +01)Projection: Int64(1) +02)--EmptyRelation: rows=1 +physical_plan +01)ProjectionExec: expr=[1 as Int64(1)] +02)--PlaceholderRowExec + +# Bool argument forms: ON / TRUE / 1 all enable the option. 
The parenthesized +# form does not support `= value` for booleans (sqlparser's utility option +# grammar). Quoted-string booleans are accepted by the option parser. +statement ok +EXPLAIN (COSTS ON) SELECT 1; + +statement ok +EXPLAIN (COSTS TRUE) SELECT 1; + +statement ok +EXPLAIN (COSTS 1) SELECT 1; + +statement ok +EXPLAIN (COSTS 'true') SELECT 1; + +# Unrecognized argument for a boolean option. +statement error DataFusion error: Error during planning: expected boolean for EXPLAIN option costs, got 'maybe' +EXPLAIN (COSTS maybe) SELECT 1; + +# Unrecognized argument for a string/ident option. +statement error DataFusion error: Invalid or Unsupported Configuration: Invalid explain format\. Expected 'indent', 'tree', 'pgjson' or 'graphviz'\. Got 'bogus' +EXPLAIN (FORMAT bogus) SELECT 1; + +# Legacy keyword form still works on PostgreSQL dialect. +query TT +EXPLAIN FORMAT tree SELECT 1; +---- +physical_plan +01)┌───────────────────────────┐ +02)│ ProjectionExec │ +03)│ -------------------- │ +04)│ Int64(1): 1 │ +05)└─────────────┬─────────────┘ +06)┌─────────────┴─────────────┐ +07)│ PlaceholderRowExec │ +08)└───────────────────────────┘ + +statement ok +reset datafusion.sql_parser.dialect; diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 8895af99d58d8..7ccb7601584b1 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -326,7 +326,7 @@ datafusion.optimizer.prefer_existing_union false datafusion.optimizer.prefer_hash_join true datafusion.optimizer.preserve_file_partitions 0 datafusion.optimizer.repartition_aggregations true -datafusion.optimizer.repartition_file_min_size 10485760 +datafusion.optimizer.repartition_file_min_size 1048576 datafusion.optimizer.repartition_file_scans true datafusion.optimizer.repartition_joins true datafusion.optimizer.repartition_sorts true @@ -477,7 +477,7 @@ 
datafusion.optimizer.prefer_existing_union false When set to true, the optimizer datafusion.optimizer.prefer_hash_join true When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory datafusion.optimizer.preserve_file_partitions 0 Minimum number of distinct partition values required to group files by their Hive partition column values (enabling Hash partitioning declaration). How the option is used: - preserve_file_partitions=0: Disable it. - preserve_file_partitions=1: Always enable it. - preserve_file_partitions=N, actual file partitions=M: Only enable when M >= N. This threshold preserves I/O parallelism when file partitioning is below it. Note: This may reduce parallelism, rooting from the I/O level, if the number of distinct partitions is less than the target_partitions. datafusion.optimizer.repartition_aggregations true Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level -datafusion.optimizer.repartition_file_min_size 10485760 Minimum total files size in bytes to perform file scan repartitioning. +datafusion.optimizer.repartition_file_min_size 1048576 Minimum total file size in bytes for file-group byte-range splitting to fire. Files (or merged file groups) smaller than this stay as one partition. Lower values produce more, smaller partitions — better at filling `target_partitions` worth of cores when files are modestly sized, at the cost of slightly more per-partition open / metadata-load overhead. datafusion.optimizer.repartition_file_scans true When set to `true`, datasource partitions will be repartitioned to achieve maximum parallelism. This applies to both in-memory partitions and FileSource's file groups (1 group is 1 partition). For FileSources, only Parquet and CSV formats are currently supported. 
If set to `true` for a FileSource, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false` for a FileSource, different files will be read in parallel, but repartitioning won't happen within a single file. If set to `true` for an in-memory source, all memtable's partitions will have their batches repartitioned evenly to the desired number of `target_partitions`. Repartitioning can change the total number of partitions and batches per partition, but does not slice the initial record tables provided to the MemTable on creation. datafusion.optimizer.repartition_joins true Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level datafusion.optimizer.repartition_sorts true Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below ```text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ``` would turn into the plan below which performs better in multithreaded environments ```text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ``` @@ -897,7 +897,7 @@ show functions statement ok reset datafusion.catalog.information_schema; -# The SLT runner sets `target_partitions` to 4 instead of using the default, so +# The SLT runner sets `target_partitions` to 4 instead of using the default, so # reset it explicitly. 
statement ok set datafusion.execution.target_partitions = 4; diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 7aeba19a6ab8c..099674c5a0b58 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -30,6 +30,7 @@ DataFusion configurations control various aspects of DataFusion planning and exe ## Setting Configuration Options ### Programmatically + You can set the options programmatically via the [`ConfigOptions`] object. For example, to configure the `datafusion.execution.target_partitions` using the API: @@ -57,153 +58,152 @@ example, to configure `datafusion.execution.target_partitions`: SET datafusion.execution.target_partitions = '1'; ``` -[`ConfigOptions`]: https://docs.rs/datafusion/latest/datafusion/common/config/struct.ConfigOptions.html -[`ConfigOptions::from_env`]: https://docs.rs/datafusion/latest/datafusion/common/config/struct.ConfigOptions.html#method.from_env +[`configoptions`]: https://docs.rs/datafusion/latest/datafusion/common/config/struct.ConfigOptions.html +[`configoptions::from_env`]: https://docs.rs/datafusion/latest/datafusion/common/config/struct.ConfigOptions.html#method.from_env The following configuration settings are available: -| key | default | description | -|-----|---------|-------------| -| datafusion.catalog.create_default_catalog_and_schema | true | Whether the default catalog and schema should be created automatically. 
| -| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | -| datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | -| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | -| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | -| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | -| datafusion.catalog.has_header | true | Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. | -| datafusion.catalog.newlines_in_values | false | Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. | -| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption | -| datafusion.execution.perfect_hash_join_small_build_threshold | 1024 | A perfect hash join (see `HashJoinExec` for more details) will be considered if the range of keys (max - min) on the build side is < this threshold. This provides a fast path for joins with very small key ranges, bypassing the density check. Currently only supports cases where build_side.num_rows() < u32::MAX. Support for build_side.num_rows() >= u32::MAX will be added in the future. 
| -| datafusion.execution.perfect_hash_join_min_key_density | 0.15 | The minimum required density of join keys on the build side to consider a perfect hash join (see `HashJoinExec` for more details). Density is calculated as: `(number of rows) / (max_key - min_key + 1)`. A perfect hash join may be used if the actual key density > this value. Currently only supports cases where build_side.num_rows() < u32::MAX. Support for build_side.num_rows() >= u32::MAX will be added in the future. | -| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | -| datafusion.execution.collect_statistics | true | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. | -| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system | -| datafusion.execution.time_zone | NULL | The default time zone Some functions, e.g. `now` return timestamps in this time zone | -| datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. | -| datafusion.execution.parquet.pruning | true | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | -| datafusion.execution.parquet.skip_metadata | true | (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. 
This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | -| datafusion.execution.parquet.metadata_size_hint | 524288 | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed. | -| datafusion.execution.parquet.pushdown_filters | false | (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". | -| datafusion.execution.parquet.reorder_filters | false | (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | -| datafusion.execution.parquet.force_filter_selections | false | (reading) Force the use of RowSelections for filter results, when pushdown_filters is enabled. If false, the reader will automatically choose between a RowSelection and a Bitmap based on the number and pattern of selected rows. | -| datafusion.execution.parquet.schema_force_view_types | true | (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. | -| datafusion.execution.parquet.binary_as_string | false | (reading) If true, parquet reader will read columns of `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`. 
Parquet files generated by some legacy writers do not correctly set the UTF8 flag for strings, causing string columns to be loaded as BLOB instead. | -| datafusion.execution.parquet.coerce_int96 | NULL | (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution. | -| datafusion.execution.parquet.coerce_int96_tz | NULL | (reading) Optional timezone applied to INT96 columns when `coerce_int96` is set. When `Some`, INT96 columns coerce to `Timestamp(, Some())` instead of the default `Timestamp(, None)`. Spark and other systems write INT96 values as UTC-adjusted instants, so callers that need the resulting Arrow type to be timezone-aware (e.g. for Spark `TimestampType` semantics) should set this to `"UTC"`. No effect when `coerce_int96` is `None`. | -| datafusion.execution.parquet.bloom_filter_on_read | true | (reading) Use any available bloom filters when reading parquet files | -| datafusion.execution.parquet.max_predicate_cache_size | NULL | (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching. 
| -| datafusion.execution.parquet.data_pagesize_limit | 1048576 | (writing) Sets best effort maximum size of data page in bytes | -| datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in rows | -| datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" | -| datafusion.execution.parquet.skip_arrow_metadata | false | (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to | -| datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting. | -| datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | -| datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. 
| -| datafusion.execution.parquet.created_by | datafusion version 53.1.0 | (writing) Sets "created by" property | -| datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | -| datafusion.execution.parquet.statistics_truncate_length | 64 | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | -| datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.bloom_filter_on_write | false | (writing) Write bloom filters for all columns when creating parquet files | -| datafusion.execution.parquet.bloom_filter_fpp | NULL | (writing) Sets bloom filter false positive probability. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.bloom_filter_ndv | NULL | (writing) Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.allow_single_file_parallelism | true | (writing) Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns. | -| datafusion.execution.parquet.maximum_parallel_row_group_writers | 1 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. 
You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | -| datafusion.execution.parquet.maximum_buffered_record_batches_per_stream | 2 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | -| datafusion.execution.parquet.use_content_defined_chunking | NULL | (writing) EXPERIMENTAL: Enable content-defined chunking (CDC) when writing parquet files. When `Some`, CDC is enabled with the given options; when `None` (the default), CDC is disabled. When CDC is enabled, parallel writing is automatically disabled since the chunker state must persist across row groups. | -| datafusion.execution.planning_concurrency | 0 | Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system | -| datafusion.execution.skip_physical_aggregate_schema_check | false | When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step. 
| -| datafusion.execution.spill_compression | uncompressed | Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. | -| datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | -| datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. | -| datafusion.execution.sort_pushdown_buffer_capacity | 1073741824 | Maximum buffer capacity (in bytes) per partition for BufferExec inserted during sort pushdown optimization. When PushdownSort eliminates a SortExec under SortPreservingMergeExec, a BufferExec is inserted to replace SortExec's buffering role. This prevents I/O stalls by allowing the scan to run ahead of the merge. This uses strictly less memory than the SortExec it replaces (which buffers the entire partition). The buffer respects the global memory pool limit. Setting this to a large value is safe — actual memory usage is bounded by partition size and global memory limits. | -| datafusion.execution.max_spill_file_size_bytes | 134217728 | Maximum size in bytes for individual spill files before rotating to a new file. 
When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB | -| datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | -| datafusion.execution.minimum_parallel_output_files | 4 | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. | -| datafusion.execution.soft_max_rows_per_output_file | 50000000 | Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max | -| datafusion.execution.max_buffered_batches_per_output_file | 2 | This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption | -| datafusion.execution.listing_table_ignore_subdirectory | true | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). 
| -| datafusion.execution.listing_table_factory_infer_partitions | true | Should a `ListingTable` created through the `ListingTableFactory` infer table partitions from Hive compliant directories. Defaults to true (partition columns are inferred and will be represented in the table schema). | -| datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs | -| datafusion.execution.enable_materialized_ctes | false | Should DataFusion materialize CTEs that are referenced multiple times. When enabled, CTEs referenced more than once are generally computed once and cached, except for cheap CTEs and CTEs consumed below a top-level limit. | -| datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental | -| datafusion.execution.keep_partition_by_columns | false | Should DataFusion keep the columns used for partition_by in the output RecordBatches | -| datafusion.execution.skip_partial_aggregation_probe_ratio_threshold | 0.8 | Aggregation ratio (number of distinct groups / number of input rows) threshold for skipping partial aggregation. If the value is greater then partial aggregation will skip aggregation for further input | -| datafusion.execution.skip_partial_aggregation_probe_rows_threshold | 100000 | Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode | -| datafusion.execution.use_row_number_estimates_to_optimize_partitioning | false | Should DataFusion use row number estimates at the input to decide whether increasing parallelism is beneficial or not. By default, only exact row numbers (not estimates) are used for this decision. Setting this flag to `true` will likely produce better plans. if the source of statistics is accurate. We plan to make this the default in the future. 
| -| datafusion.execution.enforce_batch_size_in_joins | false | Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower. | -| datafusion.execution.objectstore_writer_buffer_size | 10485760 | Size (bytes) of data buffer DataFusion uses when writing output files. This affects the size of the data chunks that are uploaded to remote object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being written, it may be necessary to increase this size to avoid errors from the remote end point. | -| datafusion.execution.enable_ansi_mode | false | Whether to enable ANSI SQL mode. The flag is experimental and relevant only for DataFusion Spark built-in functions When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL semantics for expressions, casting, and error handling. This means: - **Strict type coercion rules:** implicit casts between incompatible types are disallowed. - **Standard SQL arithmetic behavior:** operations such as division by zero, numeric overflow, or invalid casts raise runtime errors rather than returning `NULL` or adjusted values. - **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling. When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive, non-ANSI mode designed for user convenience and backward compatibility. In this mode: - Implicit casts between types are allowed (e.g., string to integer when possible). - Arithmetic operations are more lenient — for example, `abs()` on the minimum representable integer value returns the input value instead of raising overflow. - Division by zero or invalid casts may return `NULL` instead of failing. # Default `false` — ANSI SQL mode is disabled by default. 
| -| datafusion.execution.hash_join_buffering_capacity | 0 | How many bytes to buffer in the probe side of hash joins while the build side is concurrently being built. Without this, hash joins will wait until the full materialization of the build side before polling the probe side. This is useful in scenarios where the query is not completely CPU bounded, allowing to do some early work concurrently and reducing the latency of the query. Note that when hash join buffering is enabled, the probe side will start eagerly polling data, not giving time for the producer side of dynamic filters to produce any meaningful predicate. Queries with dynamic filters might see performance degradation. Disabled by default, set to a number greater than 0 for enabling it. | -| datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. | -| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | -| datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | -| datafusion.optimizer.enable_window_limits | true | When set to true, the optimizer will attempt to push limit operations past window functions, if possible | -| datafusion.optimizer.enable_window_topn | false | When set to true, the optimizer will replace Filter(rn<=K) → Window(ROW_NUMBER) → Sort patterns with a PartitionedTopKExec that maintains per-partition heaps, avoiding a full sort of the input. When the window partition key has low cardinality, enabling this optimization can improve performance. 
However, for high cardinality keys, it may cause regressions in both memory usage and runtime. | -| datafusion.optimizer.enable_topk_repartition | true | When set to true, the optimizer will push TopK (Sort with fetch) below hash repartition when the partition key is a prefix of the sort key, reducing data volume before the shuffle. | -| datafusion.optimizer.enable_topk_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down TopK dynamic filters into the file scan phase. | -| datafusion.optimizer.enable_join_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down Join dynamic filters into the file scan phase. | -| datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down Aggregate dynamic filters into the file scan phase. | -| datafusion.optimizer.enable_dynamic_filter_pushdown | true | When set to true attempts to push down dynamic filters generated by operators (TopK, Join & Aggregate) into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. The config will suppress `enable_join_dynamic_filter_pushdown`, `enable_topk_dynamic_filter_pushdown` & `enable_aggregate_dynamic_filter_pushdown` So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden. | -| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. 
This filter can add additional overhead when the file format does not fully support predicate push down. | -| datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | -| datafusion.optimizer.repartition_file_min_size | 10485760 | Minimum total files size in bytes to perform file scan repartitioning. | -| datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level | -| datafusion.optimizer.allow_symmetric_joins_without_pruning | true | Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. | -| datafusion.optimizer.repartition_file_scans | true | When set to `true`, datasource partitions will be repartitioned to achieve maximum parallelism. This applies to both in-memory partitions and FileSource's file groups (1 group is 1 partition). For FileSources, only Parquet and CSV formats are currently supported. If set to `true` for a FileSource, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false` for a FileSource, different files will be read in parallel, but repartitioning won't happen within a single file. 
If set to `true` for an in-memory source, all memtable's partitions will have their batches repartitioned evenly to the desired number of `target_partitions`. Repartitioning can change the total number of partitions and batches per partition, but does not slice the initial record tables provided to the MemTable on creation. | -| datafusion.optimizer.preserve_file_partitions | 0 | Minimum number of distinct partition values required to group files by their Hive partition column values (enabling Hash partitioning declaration). How the option is used: - preserve_file_partitions=0: Disable it. - preserve_file_partitions=1: Always enable it. - preserve_file_partitions=N, actual file partitions=M: Only enable when M >= N. This threshold preserves I/O parallelism when file partitioning is below it. Note: This may reduce parallelism, rooting from the I/O level, if the number of distinct partitions is less than the target_partitions. | -| datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | -| datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below ```text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ``` would turn into the plan below which performs better in multithreaded environments ```text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ``` | -| datafusion.optimizer.subset_repartition_threshold | 4 | Partition count threshold for subset satisfaction optimization. 
When the current partition count is >= this threshold, DataFusion will skip repartitioning if the required partitioning expression is a subset of the current partition expression such as Hash(a) satisfies Hash(a, b). When the current partition count is < this threshold, DataFusion will repartition to increase parallelism even when subset satisfaction applies. Set to 0 to always repartition (disable subset satisfaction optimization). Set to a high value to always use subset satisfaction. Example (subset_repartition_threshold = 4): ```text Hash([a]) satisfies Hash([a, b]) because (Hash([a, b]) is subset of Hash([a]) If current partitions (3) < threshold (4), repartition: AggregateExec: mode=FinalPartitioned, gby=[a, b], aggr=[SUM(x)] RepartitionExec: partitioning=Hash([a, b], 8), input_partitions=3 AggregateExec: mode=Partial, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 3) If current partitions (8) >= threshold (4), use subset satisfaction: AggregateExec: mode=SinglePartitioned, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 8) ``` | -| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e. setting `preserve_order` to true on `RepartitionExec` and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`. | -| datafusion.optimizer.skip_failed_rules | false | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. 
When set to false, any rules that produce errors will cause the query to fail | -| datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | -| datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | -| datafusion.optimizer.join_reordering | true | When set to true, the physical plan optimizer may swap join inputs based on statistics. When set to false, statistics-driven join input reordering is disabled and the original join order in the query is used. | -| datafusion.optimizer.use_statistics_registry | false | When set to true, the physical plan optimizer uses the pluggable `StatisticsRegistry` for statistics propagation across operators. This enables more accurate cardinality estimates compared to each operator's built-in `partition_statistics`. | -| datafusion.optimizer.prefer_hash_join | true | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory | -| datafusion.optimizer.enable_piecewise_merge_join | false | When set to true, piecewise merge join is enabled. PiecewiseMergeJoin is currently experimental. Physical planner will opt for PiecewiseMergeJoin when there is only one range filter. | -| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | -| datafusion.optimizer.hash_join_single_partition_threshold_rows | 131072 | The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition | -| datafusion.optimizer.hash_join_inlist_pushdown_max_size | 131072 | Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering. 
Build sides larger than this will use hash table lookups instead. Set to 0 to always use hash table lookups. InList pushdown can be more efficient for small build sides because it can result in better statistics pruning as well as use any bloom filters present on the scan side. InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion. On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory. This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` * `target_partitions` memory. The default is 128kB per partition. This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases but avoids excessive memory usage or overhead for larger joins. | -| datafusion.optimizer.hash_join_inlist_pushdown_max_distinct_values | 150 | Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides with more rows than this will use hash table lookups instead. Set to 0 to always use hash table lookups. This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent very large IN lists that might not provide much benefit over hash table lookups. This uses the deduplicated row count once the build side has been evaluated. The default is 150 values per partition. This is inspired by Trino's `max-filter-keys-per-column` setting. See: | -| datafusion.optimizer.default_filter_selectivity | 20 | The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected). 
| -| datafusion.optimizer.prefer_existing_union | false | When set to true, the optimizer will not attempt to convert Union to Interleave | -| datafusion.optimizer.expand_views_at_output | false | When set to true, if the returned type is a view type then the output will be coerced to a non-view. Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`. | -| datafusion.optimizer.enable_sort_pushdown | true | Enable sort pushdown optimization. When enabled, attempts to push sort requirements down to data sources that can natively handle them (e.g., by reversing file/row group read order). Returns **inexact ordering**: Sort operator is kept for correctness, but optimized input enables early termination for TopK queries (ORDER BY ... LIMIT N), providing significant speedup. Memory: No additional overhead (only changes read order). Future: Will add option to detect perfectly sorted data and eliminate Sort completely. Default: true | -| datafusion.optimizer.enable_leaf_expression_pushdown | true | When set to true, the optimizer will extract leaf expressions (such as `get_field`) from filter/sort/join nodes into projections closer to the leaf table scans, and push those projections down towards the leaf nodes. | -| datafusion.optimizer.enable_unions_to_filter | false | When set to true, the logical optimizer will rewrite `UNION DISTINCT` branches that read from the same source and differ only by filter predicates into a single branch with a combined filter. This optimization is conservative and only applies when the branches share the same source and compatible wrapper nodes such as identical projections or aliases. 
| -| datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans | -| datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans | -| datafusion.explain.show_statistics | false | When set to true, the explain statement will print operator statistics for physical plans | -| datafusion.explain.show_sizes | true | When set to true, the explain statement will print the partition sizes | -| datafusion.explain.show_schema | false | When set to true, the explain statement will print schema information | -| datafusion.explain.format | indent | Display format of explain. Default is "indent". When set to "tree", it will print the plan in a tree-rendered format. | -| datafusion.explain.tree_maximum_render_width | 240 | (format=tree only) Maximum total width of the rendered tree. When set to 0, the tree will have no width limit. | -| datafusion.explain.analyze_level | dev | Verbosity level for "EXPLAIN ANALYZE". Default is "dev" "summary" shows common metrics for high-level insights. "dev" provides deep operator-level introspection for developers. | -| datafusion.explain.analyze_categories | all | Which metric categories to include in "EXPLAIN ANALYZE" output. Comma-separated list of: "rows", "bytes", "timing", "uncategorized". Use "none" to show plan structure only, or "all" (default) to show everything. Metrics without a declared category are treated as "uncategorized". | -| datafusion.sql_parser.parse_float_as_decimal | false | When set to true, SQL parser will parse float as decimal type | -| datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) | -| datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). 
Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. | -| datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. | -| datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | -| datafusion.sql_parser.map_string_types_to_utf8view | true | If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. | -| datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. | -| datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries | -| datafusion.sql_parser.default_null_ordering | nulls_max | Specifies the default null ordering for query results. There are 4 options: - `nulls_max`: Nulls appear last in ascending order. - `nulls_min`: Nulls appear first in ascending order. - `nulls_first`: Nulls always be first in any order. - `nulls_last`: Nulls always be last in any order. By default, `nulls_max` is used to follow Postgres's behavior. 
postgres rule: | -| datafusion.sql_parser.enable_subquery_sort_elimination | true | When set to true, DataFusion may remove `ORDER BY` clauses from subqueries or CTEs during SQL planning when their ordering cannot affect the result, such as when no `LIMIT` or other order-sensitive operator depends on them. Disable this option to preserve explicit subquery ordering in the planned query. | -| datafusion.format.safe | true | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`] | -| datafusion.format.null | | Format string for nulls | -| datafusion.format.date_format | %Y-%m-%d | Date format for date arrays | -| datafusion.format.datetime_format | %Y-%m-%dT%H:%M:%S%.f | Format for DateTime arrays | -| datafusion.format.timestamp_format | %Y-%m-%dT%H:%M:%S%.f | Timestamp format for timestamp arrays | -| datafusion.format.timestamp_tz_format | NULL | Timestamp format for timestamp with timezone arrays. When `None`, ISO 8601 format is used. | -| datafusion.format.time_format | %H:%M:%S%.f | Time format for time arrays | -| datafusion.format.duration_format | pretty | Duration format. 
Can be either `"pretty"` or `"ISO8601"` | -| datafusion.format.types_info | false | Show types in visual representation batches | - +| key | default | description | +| ----------------------------------------------------------------------- | ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| datafusion.catalog.create_default_catalog_and_schema | true | Whether the default catalog and schema should be created automatically. 
| +| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | +| datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | +| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | +| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | +| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | +| datafusion.catalog.has_header | true | Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. | +| datafusion.catalog.newlines_in_values | false | Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. | +| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption | +| datafusion.execution.perfect_hash_join_small_build_threshold | 1024 | A perfect hash join (see `HashJoinExec` for more details) will be considered if the range of keys (max - min) on the build side is < this threshold. This provides a fast path for joins with very small key ranges, bypassing the density check. Currently only supports cases where build_side.num_rows() < u32::MAX. Support for build_side.num_rows() >= u32::MAX will be added in the future. 
| +| datafusion.execution.perfect_hash_join_min_key_density | 0.15 | The minimum required density of join keys on the build side to consider a perfect hash join (see `HashJoinExec` for more details). Density is calculated as: `(number of rows) / (max_key - min_key + 1)`. A perfect hash join may be used if the actual key density > this value. Currently only supports cases where build_side.num_rows() < u32::MAX. Support for build_side.num_rows() >= u32::MAX will be added in the future. | +| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting `datafusion.execution.batch_size`. | +| datafusion.execution.collect_statistics | true | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. | +| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system | +| datafusion.execution.time_zone | NULL | The default time zone. Some functions, e.g. `now`, return timestamps in this time zone. | +| datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. | +| datafusion.execution.parquet.pruning | true | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | +| datafusion.execution.parquet.skip_metadata | true | (reading) If true, the parquet reader skips the optional embedded metadata that may be in the file Schema.
This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | +| datafusion.execution.parquet.metadata_size_hint | 524288 | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer. The default is 512 KiB, which should be sufficient for most parquet files and can save one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed. | +| datafusion.execution.parquet.pushdown_filters | false | (reading) If true, filter expressions are applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". | +| datafusion.execution.parquet.reorder_filters | false | (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | +| datafusion.execution.parquet.force_filter_selections | false | (reading) Force the use of RowSelections for filter results, when pushdown_filters is enabled. If false, the reader will automatically choose between a RowSelection and a Bitmap based on the number and pattern of selected rows. | +| datafusion.execution.parquet.schema_force_view_types | true | (reading) If true, parquet reader will read columns of `Utf8/LargeUtf8` with `Utf8View`, and `Binary/LargeBinary` with `BinaryView`. | +| datafusion.execution.parquet.binary_as_string | false | (reading) If true, parquet reader will read columns of `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`.
Parquet files generated by some legacy writers do not correctly set the UTF8 flag for strings, causing string columns to be loaded as BLOB instead. | +| datafusion.execution.parquet.coerce_int96 | NULL | (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution. | +| datafusion.execution.parquet.coerce_int96_tz | NULL | (reading) Optional timezone applied to INT96 columns when `coerce_int96` is set. When `Some`, INT96 columns coerce to `Timestamp(<unit>, Some(<tz>))` instead of the default `Timestamp(<unit>, None)`. Spark and other systems write INT96 values as UTC-adjusted instants, so callers that need the resulting Arrow type to be timezone-aware (e.g. for Spark `TimestampType` semantics) should set this to `"UTC"`. No effect when `coerce_int96` is `None`. | +| datafusion.execution.parquet.bloom_filter_on_read | true | (reading) Use any available bloom filters when reading parquet files | +| datafusion.execution.parquet.max_predicate_cache_size | NULL | (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching.
| +| datafusion.execution.parquet.data_pagesize_limit | 1048576 | (writing) Sets best effort maximum size of data page in bytes | +| datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in rows | +| datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" | +| datafusion.execution.parquet.skip_arrow_metadata | false | (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to | +| datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting. | +| datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | +| datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. 
| +| datafusion.execution.parquet.created_by | datafusion version 53.1.0 | (writing) Sets "created by" property | +| datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | +| datafusion.execution.parquet.statistics_truncate_length | 64 | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | +| datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.bloom_filter_on_write | false | (writing) Write bloom filters for all columns when creating parquet files | +| datafusion.execution.parquet.bloom_filter_fpp | NULL | (writing) Sets bloom filter false positive probability. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.bloom_filter_ndv | NULL | (writing) Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.allow_single_file_parallelism | true | (writing) Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns. | +| datafusion.execution.parquet.maximum_parallel_row_group_writers | 1 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. 
You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | +| datafusion.execution.parquet.maximum_buffered_record_batches_per_stream | 2 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | +| datafusion.execution.parquet.use_content_defined_chunking | NULL | (writing) EXPERIMENTAL: Enable content-defined chunking (CDC) when writing parquet files. When `Some`, CDC is enabled with the given options; when `None` (the default), CDC is disabled. When CDC is enabled, parallel writing is automatically disabled since the chunker state must persist across row groups. | +| datafusion.execution.planning_concurrency | 0 | Fan-out during initial physical planning. This is mostly used to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system | +| datafusion.execution.skip_physical_aggregate_schema_check | false | When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to work around bugs in the planner that are now caught by the new schema verification step.
| +| datafusion.execution.spill_compression | uncompressed | Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. | +| datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | +| datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. | +| datafusion.execution.sort_pushdown_buffer_capacity | 1073741824 | Maximum buffer capacity (in bytes) per partition for BufferExec inserted during sort pushdown optimization. When PushdownSort eliminates a SortExec under SortPreservingMergeExec, a BufferExec is inserted to replace SortExec's buffering role. This prevents I/O stalls by allowing the scan to run ahead of the merge. This uses strictly less memory than the SortExec it replaces (which buffers the entire partition). The buffer respects the global memory pool limit. Setting this to a large value is safe — actual memory usage is bounded by partition size and global memory limits. | +| datafusion.execution.max_spill_file_size_bytes | 134217728 | Maximum size in bytes for individual spill files before rotating to a new file. 
When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB | +| datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | +| datafusion.execution.minimum_parallel_output_files | 4 | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. | +| datafusion.execution.soft_max_rows_per_output_file | 50000000 | Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max | +| datafusion.execution.max_buffered_batches_per_output_file | 2 | This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption | +| datafusion.execution.listing_table_ignore_subdirectory | true | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). 
| +| datafusion.execution.listing_table_factory_infer_partitions | true | Should a `ListingTable` created through the `ListingTableFactory` infer table partitions from Hive compliant directories. Defaults to true (partition columns are inferred and will be represented in the table schema). | +| datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs | +| datafusion.execution.enable_materialized_ctes | false | Should DataFusion materialize CTEs that are referenced multiple times. When enabled, CTEs referenced more than once are generally computed once and cached, except for cheap CTEs and CTEs consumed below a top-level limit. | +| datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental | +| datafusion.execution.keep_partition_by_columns | false | Should DataFusion keep the columns used for partition_by in the output RecordBatches | +| datafusion.execution.skip_partial_aggregation_probe_ratio_threshold | 0.8 | Aggregation ratio (number of distinct groups / number of input rows) threshold for skipping partial aggregation. If the observed ratio is greater than this threshold, partial aggregation will skip aggregation for further input | +| datafusion.execution.skip_partial_aggregation_probe_rows_threshold | 100000 | Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode | +| datafusion.execution.use_row_number_estimates_to_optimize_partitioning | false | Should DataFusion use row number estimates at the input to decide whether increasing parallelism is beneficial or not. By default, only exact row numbers (not estimates) are used for this decision. Setting this flag to `true` will likely produce better plans if the source of statistics is accurate. We plan to make this the default in the future.
| +| datafusion.execution.enforce_batch_size_in_joins | false | Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower. | +| datafusion.execution.objectstore_writer_buffer_size | 10485760 | Size (bytes) of data buffer DataFusion uses when writing output files. This affects the size of the data chunks that are uploaded to remote object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being written, it may be necessary to increase this size to avoid errors from the remote end point. | +| datafusion.execution.enable_ansi_mode | false | Whether to enable ANSI SQL mode. The flag is experimental and relevant only for DataFusion Spark built-in functions When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL semantics for expressions, casting, and error handling. This means: - **Strict type coercion rules:** implicit casts between incompatible types are disallowed. - **Standard SQL arithmetic behavior:** operations such as division by zero, numeric overflow, or invalid casts raise runtime errors rather than returning `NULL` or adjusted values. - **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling. When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive, non-ANSI mode designed for user convenience and backward compatibility. In this mode: - Implicit casts between types are allowed (e.g., string to integer when possible). - Arithmetic operations are more lenient — for example, `abs()` on the minimum representable integer value returns the input value instead of raising overflow. - Division by zero or invalid casts may return `NULL` instead of failing. # Default `false` — ANSI SQL mode is disabled by default. 
| +| datafusion.execution.hash_join_buffering_capacity | 0 | How many bytes to buffer in the probe side of hash joins while the build side is concurrently being built. Without this, hash joins will wait until the full materialization of the build side before polling the probe side. This is useful in scenarios where the query is not completely CPU bounded, allowing to do some early work concurrently and reducing the latency of the query. Note that when hash join buffering is enabled, the probe side will start eagerly polling data, not giving time for the producer side of dynamic filters to produce any meaningful predicate. Queries with dynamic filters might see performance degradation. Disabled by default, set to a number greater than 0 for enabling it. | +| datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. | +| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | +| datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | +| datafusion.optimizer.enable_window_limits | true | When set to true, the optimizer will attempt to push limit operations past window functions, if possible | +| datafusion.optimizer.enable_window_topn | false | When set to true, the optimizer will replace Filter(rn<=K) → Window(ROW_NUMBER) → Sort patterns with a PartitionedTopKExec that maintains per-partition heaps, avoiding a full sort of the input. When the window partition key has low cardinality, enabling this optimization can improve performance. 
However, for high cardinality keys, it may cause regressions in both memory usage and runtime. | +| datafusion.optimizer.enable_topk_repartition | true | When set to true, the optimizer will push TopK (Sort with fetch) below hash repartition when the partition key is a prefix of the sort key, reducing data volume before the shuffle. | +| datafusion.optimizer.enable_topk_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down TopK dynamic filters into the file scan phase. | +| datafusion.optimizer.enable_join_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down Join dynamic filters into the file scan phase. | +| datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down Aggregate dynamic filters into the file scan phase. | +| datafusion.optimizer.enable_dynamic_filter_pushdown | true | When set to true attempts to push down dynamic filters generated by operators (TopK, Join & Aggregate) into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. The config will suppress `enable_join_dynamic_filter_pushdown`, `enable_topk_dynamic_filter_pushdown` & `enable_aggregate_dynamic_filter_pushdown` So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden. | +| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. 
This filter can add additional overhead when the file format does not fully support predicate push down. | +| datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | +| datafusion.optimizer.repartition_file_min_size | 1048576 | Minimum total file size in bytes for file-group byte-range splitting to fire. Files (or merged file groups) smaller than this stay as one partition. Lower values produce more, smaller partitions — better at filling `target_partitions` worth of cores when files are modestly sized, at the cost of slightly more per-partition open / metadata-load overhead. | +| datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level | +| datafusion.optimizer.allow_symmetric_joins_without_pruning | true | Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. | +| datafusion.optimizer.repartition_file_scans | true | When set to `true`, datasource partitions will be repartitioned to achieve maximum parallelism. This applies to both in-memory partitions and FileSource's file groups (1 group is 1 partition). For FileSources, only Parquet and CSV formats are currently supported. 
If set to `true` for a FileSource, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false` for a FileSource, different files will be read in parallel, but repartitioning won't happen within a single file. If set to `true` for an in-memory source, all memtable's partitions will have their batches repartitioned evenly to the desired number of `target_partitions`. Repartitioning can change the total number of partitions and batches per partition, but does not slice the initial record tables provided to the MemTable on creation. | +| datafusion.optimizer.preserve_file_partitions | 0 | Minimum number of distinct partition values required to group files by their Hive partition column values (enabling Hash partitioning declaration). How the option is used: - preserve_file_partitions=0: Disable it. - preserve_file_partitions=1: Always enable it. - preserve_file_partitions=N, actual file partitions=M: Only enable when M >= N. This threshold preserves I/O parallelism when file partitioning is below it. Note: This may reduce parallelism, rooting from the I/O level, if the number of distinct partitions is less than the target_partitions. | +| datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | +| datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. 
When this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` | +| datafusion.optimizer.subset_repartition_threshold | 4 | Partition count threshold for subset satisfaction optimization. When the current partition count is >= this threshold, DataFusion will skip repartitioning if the required partitioning expression is a subset of the current partition expression such as Hash(a) satisfies Hash(a, b). When the current partition count is < this threshold, DataFusion will repartition to increase parallelism even when subset satisfaction applies. Set to 0 to always repartition (disable subset satisfaction optimization). Set to a high value to always use subset satisfaction. Example (subset_repartition_threshold = 4): `text Hash([a]) satisfies Hash([a, b]) because (Hash([a, b]) is subset of Hash([a]) If current partitions (3) < threshold (4), repartition: AggregateExec: mode=FinalPartitioned, gby=[a, b], aggr=[SUM(x)] RepartitionExec: partitioning=Hash([a, b], 8), input_partitions=3 AggregateExec: mode=Partial, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 3) If current partitions (8) >= threshold (4), use subset satisfaction: AggregateExec: mode=SinglePartitioned, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 8) ` | +| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e.
setting `preserve_order` to true on `RepartitionExec` and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`. | +| datafusion.optimizer.skip_failed_rules | false | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | +| datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | +| datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | +| datafusion.optimizer.join_reordering | true | When set to true, the physical plan optimizer may swap join inputs based on statistics. When set to false, statistics-driven join input reordering is disabled and the original join order in the query is used. | +| datafusion.optimizer.use_statistics_registry | false | When set to true, the physical plan optimizer uses the pluggable `StatisticsRegistry` for statistics propagation across operators. This enables more accurate cardinality estimates compared to each operator's built-in `partition_statistics`. | +| datafusion.optimizer.prefer_hash_join | true | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory | +| datafusion.optimizer.enable_piecewise_merge_join | false | When set to true, piecewise merge join is enabled. PiecewiseMergeJoin is currently experimental. Physical planner will opt for PiecewiseMergeJoin when there is only one range filter. 
| +| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | +| datafusion.optimizer.hash_join_single_partition_threshold_rows | 131072 | The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition | +| datafusion.optimizer.hash_join_inlist_pushdown_max_size | 131072 | Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides larger than this will use hash table lookups instead. Set to 0 to always use hash table lookups. InList pushdown can be more efficient for small build sides because it can result in better statistics pruning as well as use any bloom filters present on the scan side. InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion. On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory. This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` \* `target_partitions` memory. The default is 128kB per partition. This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases but avoids excessive memory usage or overhead for larger joins. | +| datafusion.optimizer.hash_join_inlist_pushdown_max_distinct_values | 150 | Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides with more rows than this will use hash table lookups instead. Set to 0 to always use hash table lookups. This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent very large IN lists that might not provide much benefit over hash table lookups. 
This uses the deduplicated row count once the build side has been evaluated. The default is 150 values per partition. This is inspired by Trino's `max-filter-keys-per-column` setting. See: | +| datafusion.optimizer.default_filter_selectivity | 20 | The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected). | +| datafusion.optimizer.prefer_existing_union | false | When set to true, the optimizer will not attempt to convert Union to Interleave | +| datafusion.optimizer.expand_views_at_output | false | When set to true, if the returned type is a view type then the output will be coerced to a non-view. Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`. | +| datafusion.optimizer.enable_sort_pushdown | true | Enable sort pushdown optimization. When enabled, attempts to push sort requirements down to data sources that can natively handle them (e.g., by reversing file/row group read order). Returns **inexact ordering**: Sort operator is kept for correctness, but optimized input enables early termination for TopK queries (ORDER BY ... LIMIT N), providing significant speedup. Memory: No additional overhead (only changes read order). Future: Will add option to detect perfectly sorted data and eliminate Sort completely. Default: true | +| datafusion.optimizer.enable_leaf_expression_pushdown | true | When set to true, the optimizer will extract leaf expressions (such as `get_field`) from filter/sort/join nodes into projections closer to the leaf table scans, and push those projections down towards the leaf nodes. | +| datafusion.optimizer.enable_unions_to_filter | false | When set to true, the logical optimizer will rewrite `UNION DISTINCT` branches that read from the same source and differ only by filter predicates into a single branch with a combined filter. 
This optimization is conservative and only applies when the branches share the same source and compatible wrapper nodes such as identical projections or aliases. | +| datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans | +| datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans | +| datafusion.explain.show_statistics | false | When set to true, the explain statement will print operator statistics for physical plans | +| datafusion.explain.show_sizes | true | When set to true, the explain statement will print the partition sizes | +| datafusion.explain.show_schema | false | When set to true, the explain statement will print schema information | +| datafusion.explain.format | indent | Display format of explain. Default is "indent". When set to "tree", it will print the plan in a tree-rendered format. | +| datafusion.explain.tree_maximum_render_width | 240 | (format=tree only) Maximum total width of the rendered tree. When set to 0, the tree will have no width limit. | +| datafusion.explain.analyze_level | dev | Verbosity level for "EXPLAIN ANALYZE". Default is "dev" "summary" shows common metrics for high-level insights. "dev" provides deep operator-level introspection for developers. | +| datafusion.explain.analyze_categories | all | Which metric categories to include in "EXPLAIN ANALYZE" output. Comma-separated list of: "rows", "bytes", "timing", "uncategorized". Use "none" to show plan structure only, or "all" (default) to show everything. Metrics without a declared category are treated as "uncategorized". 
| +| datafusion.sql_parser.parse_float_as_decimal | false | When set to true, SQL parser will parse float as decimal type | +| datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) | +| datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. | +| datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. | +| datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | +| datafusion.sql_parser.map_string_types_to_utf8view | true | If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. | +| datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. | +| datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries | +| datafusion.sql_parser.default_null_ordering | nulls_max | Specifies the default null ordering for query results. There are 4 options: - `nulls_max`: Nulls appear last in ascending order. - `nulls_min`: Nulls appear first in ascending order. 
- `nulls_first`: Nulls are always first in any order. - `nulls_last`: Nulls are always last in any order. By default, `nulls_max` is used to follow Postgres's behavior. postgres rule: | +| datafusion.sql_parser.enable_subquery_sort_elimination | true | When set to true, DataFusion may remove `ORDER BY` clauses from subqueries or CTEs during SQL planning when their ordering cannot affect the result, such as when no `LIMIT` or other order-sensitive operator depends on them. Disable this option to preserve explicit subquery ordering in the planned query. | +| datafusion.format.safe | true | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`] | +| datafusion.format.null | | Format string for nulls | +| datafusion.format.date_format | %Y-%m-%d | Date format for date arrays | +| datafusion.format.datetime_format | %Y-%m-%dT%H:%M:%S%.f | Format for DateTime arrays | +| datafusion.format.timestamp_format | %Y-%m-%dT%H:%M:%S%.f | Timestamp format for timestamp arrays | +| datafusion.format.timestamp_tz_format | NULL | Timestamp format for timestamp with timezone arrays. When `None`, ISO 8601 format is used. | +| datafusion.format.time_format | %H:%M:%S%.f | Time format for time arrays | +| datafusion.format.duration_format | pretty | Duration format. Can be either `"pretty"` or `"ISO8601"` | +| datafusion.format.types_info | false | Show types in visual representation batches | You can also reset configuration options to default settings via SQL using the `RESET` command. For example, to set and reset `datafusion.execution.batch_size`: @@ -232,16 +232,15 @@ SET datafusion.runtime.memory_limit = '2G'; The following runtime configuration settings are available: -| key | default | description | -|-----|---------|-------------| -| datafusion.runtime.file_statistics_cache_limit | 20M | Maximum memory to use for file statistics cache.
Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | -| datafusion.runtime.list_files_cache_limit | 1M | Maximum memory to use for list files cache. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | -| datafusion.runtime.list_files_cache_ttl | NULL | TTL (time-to-live) of the entries in the list file cache. Supports units m (minutes), and s (seconds). Example: '2m' for 2 minutes. | -| datafusion.runtime.max_temp_directory_size | 100G | Maximum temporary file directory size. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | -| datafusion.runtime.memory_limit | NULL | Maximum memory limit for query execution. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | -| datafusion.runtime.metadata_cache_limit | 50M | Maximum memory to use for file metadata cache such as Parquet metadata. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | -| datafusion.runtime.temp_directory | NULL | The path to the temporary file directory. | - +| key | default | description | +| ---------------------------------------------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| datafusion.runtime.file_statistics_cache_limit | 20M | Maximum memory to use for file statistics cache. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | +| datafusion.runtime.list_files_cache_limit | 1M | Maximum memory to use for list files cache. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. 
| +| datafusion.runtime.list_files_cache_ttl | NULL | TTL (time-to-live) of the entries in the list file cache. Supports units m (minutes), and s (seconds). Example: '2m' for 2 minutes. | +| datafusion.runtime.max_temp_directory_size | 100G | Maximum temporary file directory size. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | +| datafusion.runtime.memory_limit | NULL | Maximum memory limit for query execution. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | +| datafusion.runtime.metadata_cache_limit | 50M | Maximum memory to use for file metadata cache such as Parquet metadata. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes) or '0' for 0. Example: '2G' for 2 gigabytes. | +| datafusion.runtime.temp_directory | NULL | The path to the temporary file directory. | # Tuning Guide @@ -255,7 +254,7 @@ to enable parallelization can dominate the actual computation. You can find out how many cores are being used via the [`EXPLAIN`] command and look at the number of partitions in the plan. -[`EXPLAIN`]: sql/explain.md +[`explain`]: sql/explain.md The `datafusion.optimizer.repartition_file_min_size` option controls the minimum file size the [`ListingTable`] provider will attempt to repartition. However, this @@ -269,21 +268,21 @@ than 1MB), we recommend setting `target_partitions` to 1 to avoid repartitioning SET datafusion.execution.target_partitions = '1'; ``` -[`ListingTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html +[`listingtable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html ## Memory-limited Queries -When executing a memory-consuming query under a tight memory limit, DataFusion +When executing a memory-consuming query under a tight memory limit, DataFusion will spill intermediate results to disk. 
-When the [`FairSpillPool`] is used, memory is divided evenly among partitions. -The higher the value of `datafusion.execution.target_partitions`, the less memory -is allocated to each partition, and the out-of-core execution path may trigger +When the [`FairSpillPool`] is used, memory is divided evenly among partitions. +The higher the value of `datafusion.execution.target_partitions`, the less memory +is allocated to each partition, and the out-of-core execution path may trigger more frequently, possibly slowing down execution. Additionally, while spilling, data is read back in `datafusion.execution.batch_size` size batches. The larger this value, the fewer spilled sorted runs can be merged. Decreasing this setting -can help reduce the number of subsequent spills required. +can help reduce the number of subsequent spills required. In conclusion, for queries under a very tight memory limit, it's recommended to set `target_partitions` and `batch_size` to smaller values. @@ -295,7 +294,7 @@ SET datafusion.execution.target_partitions = 4; SET datafusion.execution.batch_size = 1024; ``` -[`FairSpillPool`]: https://docs.rs/datafusion/latest/datafusion/execution/memory_pool/struct.FairSpillPool.html +[`fairspillpool`]: https://docs.rs/datafusion/latest/datafusion/execution/memory_pool/struct.FairSpillPool.html ## Join Queries @@ -315,13 +314,13 @@ condition of the two tables. You can modify join optimization behavior in your queries by setting specific configuration values. Use the following command to update a configuration: -``` sql +```sql SET datafusion.optimizer.<configuration_name>; ``` Example -``` sql +```sql SET datafusion.optimizer.prefer_hash_join = false; ``` @@ -356,4 +355,3 @@ Enables the experimental Piecewise Merge Join algorithm. - Piecewise Merge Join is faster than Nested Loop Join performance wise for single range filter except for cases where it is joining two large tables (num_rows > 100,000) that are approximately equal in size.
- From 1e067d39df551a585c8f074c87c706228053d505 Mon Sep 17 00:00:00 2001 From: Nathan Bezualem Date: Sun, 31 May 2026 13:06:11 -0400 Subject: [PATCH 3/6] undo artifact --- datafusion/common/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 401e956a89e64..1ba0f10b8e12e 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1158,7 +1158,7 @@ config_namespace! { pub repartition_aggregations: bool, default = true /// Minimum total files size in bytes to perform file scan repartitioning. - pub repartition_file_min_size: usize, default = 10 * 1024 * 1024 + pub repartition_file_min_size: usize, default = 1024 * 1024 /// Should DataFusion repartition data using the join keys to execute joins in parallel /// using the provided `target_partitions` level From 7ed9a9ad7dc188db3f36ffb37830bd9b4d1c3e75 Mon Sep 17 00:00:00 2001 From: Nathan Bezualem Date: Sun, 31 May 2026 13:28:41 -0400 Subject: [PATCH 4/6] clean up unrelated config changes from POC branch --- datafusion/common/src/config.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 1ba0f10b8e12e..8b196770ddf69 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1157,7 +1157,12 @@ config_namespace! { /// in parallel using the provided `target_partitions` level pub repartition_aggregations: bool, default = true - /// Minimum total files size in bytes to perform file scan repartitioning. + /// Minimum total file size in bytes for file-group byte-range + /// splitting to fire. Files (or merged file groups) smaller than this + /// stay as one partition. 
Lower values produce more, smaller + /// partitions — better at filling `target_partitions` worth of cores + /// when files are modestly sized, at the cost of slightly more + /// per-partition open / metadata-load overhead. pub repartition_file_min_size: usize, default = 1024 * 1024 /// Should DataFusion repartition data using the join keys to execute joins in parallel From 3dc4ab7cd37cea44bf3bc078ba6022194f414342 Mon Sep 17 00:00:00 2001 From: Nathan Bezualem Date: Sun, 31 May 2026 17:50:01 -0400 Subject: [PATCH 5/6] reset state for cache and doc corrections from POC --- datafusion/common/src/config.rs | 8 +++++--- .../physical-plan/src/materialized_cte.rs | 19 +++++++++++++++++++ datafusion/sql/src/query.rs | 15 ++++++--------- .../test_files/information_schema.slt | 2 +- docs/source/user-guide/configs.md | 2 +- 5 files changed, 32 insertions(+), 14 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 8b196770ddf69..6c0aeec6bb578 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -632,9 +632,11 @@ config_namespace! { pub enable_recursive_ctes: bool, default = true /// Should DataFusion materialize CTEs that are referenced multiple times. - /// When enabled, CTEs referenced more than once are generally computed - /// once and cached, except for cheap CTEs and CTEs consumed below a top-level - /// limit. + /// When enabled, CTEs referenced more than once are computed once and + /// cached, except for cheap CTEs (e.g. literal projections) which remain + /// inlined. Volatile CTEs are always materialized to preserve + /// single-evaluation semantics. Supports explicit MATERIALIZED / NOT + /// MATERIALIZED SQL hints. 
pub enable_materialized_ctes: bool, default = false /// Attempt to eliminate sorts by packing & sorting files with non-overlapping diff --git a/datafusion/physical-plan/src/materialized_cte.rs b/datafusion/physical-plan/src/materialized_cte.rs index 8cad77aa36993..2ac4881f39522 100644 --- a/datafusion/physical-plan/src/materialized_cte.rs +++ b/datafusion/physical-plan/src/materialized_cte.rs @@ -223,6 +223,25 @@ impl ExecutionPlan for MaterializedCteExec { &self.continuation.schema(), ))) } + + fn reset_state(self: Arc<Self>) -> Result<Arc<dyn ExecutionPlan>> { + let cache = Arc::new(MaterializedCteCache::new(self.name.clone())); + let partition_count = self.cte_plan.output_partitioning().partition_count(); + let statistics = materialized_cte_statistics(self.cte_plan.as_ref())?; + let continuation = replace_materialized_cte_readers( + Arc::clone(&self.continuation), + &self.name, + &cache, + partition_count, + &statistics, + )?; + Ok(Arc::new(Self::new( + self.name.clone(), + Arc::clone(&self.cte_plan), + continuation, + cache, + ))) + } } /// Physical execution plan that reads from a previously materialized CTE cache. diff --git a/datafusion/sql/src/query.rs b/datafusion/sql/src/query.rs index f4a7669d258f6..c77330cc35f3d 100644 --- a/datafusion/sql/src/query.rs +++ b/datafusion/sql/src/query.rs @@ -116,16 +116,13 @@ impl<S: ContextProvider> SqlToRel<'_, S> { /// Apply CTE materialization to the plan. /// - /// Materialize ALL multi-referenced CTEs upfront (DuckDB-style). + /// Wraps multi-referenced CTEs in MaterializedCteProducer/Reader nodes so + /// they are computed once and shared across all references. Cheap CTEs + /// (literal projections, empty relations) are left inlined unless they + /// contain volatile functions (which require single-evaluation semantics). /// - /// The SQL planner wraps every multi-ref CTE in MaterializedCteProducer/Reader - /// nodes. The `InlineCte` optimizer rule then selectively inlines ones where - /// materialization is not beneficial (cheap CTEs, CTEs under LIMIT, etc.).
- /// - /// This approach ensures: - /// 1. The optimizer has full context (explicit CTE nodes in the plan) - /// 2. The inlining decision can be revisited after other optimizer passes - /// 3. DataFrame API users benefit via the optimizer rule + /// Respects explicit SQL hints: `AS MATERIALIZED` forces materialization, + /// `AS NOT MATERIALIZED` prevents it. fn apply_cte_materialization( &self, plan: LogicalPlan, diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 7ccb7601584b1..5955e83e0541c 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -369,7 +369,7 @@ datafusion.execution.batch_size 8192 Default batch size while creating new batch datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting datafusion.execution.collect_statistics true Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. datafusion.execution.enable_ansi_mode false Whether to enable ANSI SQL mode. The flag is experimental and relevant only for DataFusion Spark built-in functions When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL semantics for expressions, casting, and error handling. This means: - **Strict type coercion rules:** implicit casts between incompatible types are disallowed. - **Standard SQL arithmetic behavior:** operations such as division by zero, numeric overflow, or invalid casts raise runtime errors rather than returning `NULL` or adjusted values. 
- **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling. When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive, non-ANSI mode designed for user convenience and backward compatibility. In this mode: - Implicit casts between types are allowed (e.g., string to integer when possible). - Arithmetic operations are more lenient — for example, `abs()` on the minimum representable integer value returns the input value instead of raising overflow. - Division by zero or invalid casts may return `NULL` instead of failing. # Default `false` — ANSI SQL mode is disabled by default. -datafusion.execution.enable_materialized_ctes false Should DataFusion materialize CTEs that are referenced multiple times. When enabled, CTEs referenced more than once are generally computed once and cached, except for cheap CTEs and CTEs consumed below a top-level limit. +datafusion.execution.enable_materialized_ctes false Should DataFusion materialize CTEs that are referenced multiple times. When enabled, CTEs referenced more than once are computed once and cached, except for cheap CTEs (e.g. literal projections) which remain inlined. Volatile CTEs are always materialized to preserve single-evaluation semantics. Supports explicit MATERIALIZED / NOT MATERIALIZED SQL hints. datafusion.execution.enable_recursive_ctes true Should DataFusion support recursive CTEs datafusion.execution.enforce_batch_size_in_joins false Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower. datafusion.execution.hash_join_buffering_capacity 0 How many bytes to buffer in the probe side of hash joins while the build side is concurrently being built. 
Without this, hash joins will wait until the full materialization of the build side before polling the probe side. This is useful in scenarios where the query is not completely CPU bounded, allowing to do some early work concurrently and reducing the latency of the query. Note that when hash join buffering is enabled, the probe side will start eagerly polling data, not giving time for the producer side of dynamic filters to produce any meaningful predicate. Queries with dynamic filters might see performance degradation. Disabled by default, set to a number greater than 0 for enabling it. diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 099674c5a0b58..b15d6d9e237c3 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -128,7 +128,7 @@ The following configuration settings are available: | datafusion.execution.listing_table_ignore_subdirectory | true | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). | | datafusion.execution.listing_table_factory_infer_partitions | true | Should a `ListingTable` created through the `ListingTableFactory` infer table partitions from Hive compliant directories. Defaults to true (partition columns are inferred and will be represented in the table schema). | | datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs | -| datafusion.execution.enable_materialized_ctes | false | Should DataFusion materialize CTEs that are referenced multiple times. When enabled, CTEs referenced more than once are generally computed once and cached, except for cheap CTEs and CTEs consumed below a top-level limit. | +| datafusion.execution.enable_materialized_ctes | false | Should DataFusion materialize CTEs that are referenced multiple times. 
When enabled, CTEs referenced more than once are computed once and cached, except for cheap CTEs (e.g. literal projections) which remain inlined. Volatile CTEs are always materialized to preserve single-evaluation semantics. Supports explicit MATERIALIZED / NOT MATERIALIZED SQL hints. | | datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental | | datafusion.execution.keep_partition_by_columns | false | Should DataFusion keep the columns used for partition_by in the output RecordBatches | | datafusion.execution.skip_partial_aggregation_probe_ratio_threshold | 0.8 | Aggregation ratio (number of distinct groups / number of input rows) threshold for skipping partial aggregation. If the value is greater then partial aggregation will skip aggregation for further input | From 56cd454e1d16155de5c47298b37bbe104ffebca6 Mon Sep 17 00:00:00 2001 From: Nathan Bezualem Date: Sun, 31 May 2026 19:09:08 -0400 Subject: [PATCH 6/6] slt tests enable the feature --- datafusion/sqllogictest/test_files/cte.slt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/datafusion/sqllogictest/test_files/cte.slt b/datafusion/sqllogictest/test_files/cte.slt index 1dc0aa57e2dd8..6e7f09b231a0e 100644 --- a/datafusion/sqllogictest/test_files/cte.slt +++ b/datafusion/sqllogictest/test_files/cte.slt @@ -1320,6 +1320,10 @@ RESET datafusion.execution.enable_recursive_ctes; statement ok RESET datafusion.sql_parser.enable_ident_normalization; +# Enable materialized CTEs for the following tests +statement ok +set datafusion.execution.enable_materialized_ctes = true; + # Materialized CTEs collect all input partitions before readers consume them. query I WITH t AS ( @@ -1355,3 +1359,6 @@ ON t1.a = t2.a AND t1.b = t2.b; statement ok RESET datafusion.optimizer.prefer_hash_join; + +statement ok +RESET datafusion.execution.enable_materialized_ctes;