Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions datafusion/physical-plan/src/joins/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -721,12 +721,13 @@ fn max_distinct_count(
Some(non_null_count) => Precision::Inexact(non_null_count),
}
}
Precision::Exact(count) => {
let count = count - stats.null_count.get_value().unwrap_or(&0);
Precision::Exact(count) => {
let null_count = *stats.null_count.get_value().unwrap_or(&0);
let non_null_count = count.checked_sub(null_count).unwrap_or(0);
Comment on lines +725 to +726
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 Nice. Although this is a good safeguard, the fact that this can happen at all makes me think there is further work to be done in the stats propagation mechanism.

Ideally, this would not even be possible by construction, but that's a topic for another PR.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if stats.null_count.is_exact().unwrap_or(false) {
Precision::Exact(count)
Precision::Exact(non_null_count)
} else {
Precision::Inexact(count)
Precision::Inexact(non_null_count)
}
}
};
Expand Down Expand Up @@ -2939,4 +2940,19 @@ mod tests {

Ok(())
}
}
#[test]
Copy link
Contributor

@jonathanc-n jonathanc-n Mar 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I created a sqllogictest for this, @KARTIK64-rgb; you can add:

statement ok
CREATE TABLE t1(a INT, b INT) AS VALUES 
  (NULL, 1), (NULL, 2), (NULL, 3), (NULL, 4), (NULL, 5);

statement ok
CREATE TABLE t2(c INT) AS VALUES (1), (2);

# This query panicked before the fix: the ORDER BY forces a SortExec,
# the LIMIT gets pushed into SortExec.fetch, and the HashJoinExec
# calls partition_statistics() on the SortExec child during execution.
query II
SELECT sub.a, sub.b FROM (
  SELECT * FROM t1 ORDER BY b LIMIT 1
) sub 
JOIN t2 ON sub.a = t2.c;
----

statement ok
DROP TABLE t1;

statement ok
DROP TABLE t2;

I verified that it reproduces the bug.

// Regression test: column statistics may report more nulls than the row
// count they are paired with. The non-null count must then come out as
// zero rather than overflowing the subtraction.
fn test_max_distinct_count_no_overflow_when_null_count_exceeds_num_rows() {
    // Null count (5) deliberately exceeds the row count (2) used below.
    let column_stats = ColumnStatistics {
        null_count: Precision::Exact(5),
        distinct_count: Precision::Absent,
        min_value: Precision::Absent,
        max_value: Precision::Absent,
        sum_value: Precision::Absent,
        byte_size: Precision::Absent,
    };
    let row_count = Precision::Exact(2);

    // Both inputs are exact, so the expected result is exact as well.
    assert_eq!(
        max_distinct_count(&row_count, &column_stats),
        Precision::Exact(0)
    );
}

}
23 changes: 23 additions & 0 deletions datafusion/sqllogictest/test_files/joins.slt
Original file line number Diff line number Diff line change
Expand Up @@ -5361,3 +5361,26 @@ DROP TABLE t1;

statement count 0
DROP TABLE t2;

# Regression test: joining against a LIMITed, sorted subquery whose join
# column contains only NULLs must not panic.
# t1.a is NULL in all five rows.
statement ok
CREATE TABLE t1(a INT, b INT) AS VALUES
(NULL, 1), (NULL, 2), (NULL, 3), (NULL, 4), (NULL, 5);

statement ok
CREATE TABLE t2(c INT) AS VALUES (1), (2);

# This query panicked before the fix: the ORDER BY forces a SortExec,
# the LIMIT gets pushed into SortExec.fetch, and the HashJoinExec
# calls partition_statistics() on the SortExec child during execution.
# The expected result is empty: the single row kept by the subquery has
# a = NULL, which never equals any t2.c value.
query II
SELECT sub.a, sub.b FROM (
SELECT * FROM t1 ORDER BY b LIMIT 1
) sub
JOIN t2 ON sub.a = t2.c;
----

# Clean up the tables created for this test.
statement ok
DROP TABLE t1;

statement ok
DROP TABLE t2;
Loading