Fix usize underflow in max_distinct_count when null_count exceeds num_rows

In the `Precision::Exact(num_rows)` branch the non-null row count was computed
with a plain `count - null_count`, which underflows when the reported null
count is larger than the row count. Use a saturating subtraction so the
estimate clamps to zero, and add a unit test plus a sqllogictest regression
case.

NOTE(review): the leading whitespace of the hunk lines was reconstructed from
standard rustfmt layout; confirm it against the target tree (or apply with
`git apply --ignore-whitespace`). The `index` post-image hashes are carried
over from the original patch and were not recomputed.

diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs
index cf4bf2cd163fd..d9098cfd9673a 100644
--- a/datafusion/physical-plan/src/joins/utils.rs
+++ b/datafusion/physical-plan/src/joins/utils.rs
@@ -721,12 +721,15 @@ fn max_distinct_count(
                         Some(non_null_count) => Precision::Inexact(non_null_count),
                     }
                 }
                 Precision::Exact(count) => {
-                    let count = count - stats.null_count.get_value().unwrap_or(&0);
+                    // Statistics may report more nulls than rows; saturate at zero
+                    // instead of underflowing `usize`.
+                    let null_count = *stats.null_count.get_value().unwrap_or(&0);
+                    let non_null_count = count.saturating_sub(null_count);
                     if stats.null_count.is_exact().unwrap_or(false) {
-                        Precision::Exact(count)
+                        Precision::Exact(non_null_count)
                     } else {
-                        Precision::Inexact(count)
+                        Precision::Inexact(non_null_count)
                     }
                 }
             };
@@ -2940,3 +2943,19 @@ mod tests {
         Ok(())
     }
+
+    #[test]
+    fn test_max_distinct_count_no_overflow_when_null_count_exceeds_num_rows() {
+        // More nulls than rows: the non-null count must clamp to zero.
+        let num_rows = Precision::Exact(2);
+        let stats = ColumnStatistics {
+            distinct_count: Precision::Absent,
+            null_count: Precision::Exact(5),
+            min_value: Precision::Absent,
+            max_value: Precision::Absent,
+            sum_value: Precision::Absent,
+            byte_size: Precision::Absent,
+        };
+        let result = max_distinct_count(&num_rows, &stats);
+        assert_eq!(result, Precision::Exact(0));
+    }
 }
diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt
index 228918c3855f2..3e81527738091 100644
--- a/datafusion/sqllogictest/test_files/joins.slt
+++ b/datafusion/sqllogictest/test_files/joins.slt
@@ -5362,2 +5362,25 @@ DROP TABLE t1;
 statement count 0
 DROP TABLE t2;
+
+statement ok
+CREATE TABLE t1(a INT, b INT) AS VALUES
+  (NULL, 1), (NULL, 2), (NULL, 3), (NULL, 4), (NULL, 5);
+
+statement ok
+CREATE TABLE t2(c INT) AS VALUES (1), (2);
+
+# This query panicked before the fix: the ORDER BY forces a SortExec,
+# the LIMIT gets pushed into SortExec.fetch, and the HashJoinExec
+# calls partition_statistics() on the SortExec child during execution.
+query II
+SELECT sub.a, sub.b FROM (
+  SELECT * FROM t1 ORDER BY b LIMIT 1
+) sub
+JOIN t2 ON sub.a = t2.c;
+----
+
+statement ok
+DROP TABLE t1;
+
+statement ok
+DROP TABLE t2;