From b74f5ce02d4d06f0d9633b44fdc6308ea9b2e355 Mon Sep 17 00:00:00 2001 From: Xiaoxuan Li Date: Sat, 14 Mar 2026 21:01:43 -0700 Subject: [PATCH] feat: implement IsNotNull expression in vortex expression library Add a first-class IsNotNull scalar function instead of composing Not(IsNull(...)). This simplifies the expression tree, enables direct stat_falsification for zone map pruning, and updates all integration points (DataFusion, DuckDB, Python/Substrait). The stat_falsification uses is_constant && null_count > 0 as an approximation since there is no RowCount stat yet. Closes: #6040 --- vortex-array/src/builtins.rs | 17 + vortex-array/src/expr/exprs.rs | 15 + vortex-array/src/scalar_fn/erased.rs | 9 +- vortex-array/src/scalar_fn/fns/is_not_null.rs | 304 ++++++++++++++++++ vortex-array/src/scalar_fn/fns/mod.rs | 1 + vortex-array/src/scalar_fn/session.rs | 2 + vortex-datafusion/src/convert/exprs.rs | 3 +- vortex-duckdb/src/convert/expr.rs | 3 +- vortex-duckdb/src/convert/table_filter.rs | 4 +- vortex-layout/src/layouts/dict/reader.rs | 5 +- vortex-python/python/vortex/_lib/expr.pyi | 1 + vortex-python/python/vortex/substrait.py | 9 +- vortex-python/src/expr/mod.rs | 17 + 13 files changed, 368 insertions(+), 22 deletions(-) create mode 100644 vortex-array/src/scalar_fn/fns/is_not_null.rs diff --git a/vortex-array/src/builtins.rs b/vortex-array/src/builtins.rs index ba3b8542eb7..143637999ea 100644 --- a/vortex-array/src/builtins.rs +++ b/vortex-array/src/builtins.rs @@ -28,6 +28,7 @@ use crate::scalar_fn::fns::binary::Binary; use crate::scalar_fn::fns::cast::Cast; use crate::scalar_fn::fns::fill_null::FillNull; use crate::scalar_fn::fns::get_item::GetItem; +use crate::scalar_fn::fns::is_not_null::IsNotNull; use crate::scalar_fn::fns::is_null::IsNull; use crate::scalar_fn::fns::list_contains::ListContains; use crate::scalar_fn::fns::mask::Mask; @@ -49,6 +50,9 @@ pub trait ExprBuiltins: Sized { /// Is null check. fn is_null(&self) -> VortexResult; + /// Is not null check. + fn is_not_null(&self) -> VortexResult; + /// Mask the expression using the given boolean mask. /// The resulting expression's validity is the intersection of the original expression's /// validity. @@ -84,6 +88,10 @@ impl ExprBuiltins for Expression { IsNull.try_new_expr(EmptyOptions, [self.clone()]) } + fn is_not_null(&self) -> VortexResult { + IsNotNull.try_new_expr(EmptyOptions, [self.clone()]) + } + fn mask(&self, mask: Expression) -> VortexResult { Mask.try_new_expr(EmptyOptions, [self.clone(), mask]) } @@ -118,6 +126,9 @@ pub trait ArrayBuiltins: Sized { /// Is null check. fn is_null(&self) -> VortexResult; + /// Is not null check. + fn is_not_null(&self) -> VortexResult; + /// Mask the array using the given boolean mask. /// The resulting array's validity is the intersection of the original array's validity /// and the mask's validity. @@ -182,6 +193,12 @@ impl ArrayBuiltins for ArrayRef { .optimize() } + fn is_not_null(&self) -> VortexResult { + IsNotNull + .try_new_array(self.len(), EmptyOptions, [self.clone()])? + .optimize() + } + fn mask(self, mask: ArrayRef) -> VortexResult { Mask.try_new_array(self.len(), EmptyOptions, [self, mask])? .optimize() diff --git a/vortex-array/src/expr/exprs.rs b/vortex-array/src/expr/exprs.rs index bc30ba86ec4..24e8d4a51fd 100644 --- a/vortex-array/src/expr/exprs.rs +++ b/vortex-array/src/expr/exprs.rs @@ -29,6 +29,7 @@ use crate::scalar_fn::fns::dynamic::DynamicComparisonExpr; use crate::scalar_fn::fns::dynamic::Rhs; use crate::scalar_fn::fns::fill_null::FillNull; use crate::scalar_fn::fns::get_item::GetItem; +use crate::scalar_fn::fns::is_not_null::IsNotNull; use crate::scalar_fn::fns::is_null::IsNull; use crate::scalar_fn::fns::like::Like; use crate::scalar_fn::fns::like::LikeOptions; @@ -547,6 +548,20 @@ pub fn is_null(child: Expression) -> Expression { IsNull.new_expr(EmptyOptions, vec![child]) } +// ---- IsNotNull ---- + +/// Creates an expression that checks for non-null values. +/// +/// Returns a boolean array indicating which positions contain non-null values. +/// +/// ```rust +/// # use vortex_array::expr::{is_not_null, root}; +/// let expr = is_not_null(root()); +/// ``` +pub fn is_not_null(child: Expression) -> Expression { + IsNotNull.new_expr(EmptyOptions, vec![child]) +} + // ---- Like ---- /// Creates a SQL LIKE expression. diff --git a/vortex-array/src/scalar_fn/erased.rs b/vortex-array/src/scalar_fn/erased.rs index 154ad13bace..369627964e2 100644 --- a/vortex-array/src/scalar_fn/erased.rs +++ b/vortex-array/src/scalar_fn/erased.rs @@ -31,8 +31,7 @@ use crate::scalar_fn::ScalarFnId; use crate::scalar_fn::ScalarFnVTable; use crate::scalar_fn::ScalarFnVTableExt; use crate::scalar_fn::SimplifyCtx; -use crate::scalar_fn::fns::is_null::IsNull; -use crate::scalar_fn::fns::not::Not; +use crate::scalar_fn::fns::is_not_null::IsNotNull; use crate::scalar_fn::options::ScalarFnOptions; use crate::scalar_fn::signature::ScalarFnSignature; use crate::scalar_fn::typed::DynScalarFn; @@ -135,11 +134,7 @@ impl ScalarFnRef { pub fn validity(&self, expr: &Expression) -> VortexResult { Ok(self.0.validity(expr)?.unwrap_or_else(|| { // TODO(ngates): make validity a mandatory method on VTable to avoid this fallback. - // TODO(ngates): add an IsNotNull expression. - Not.new_expr( - EmptyOptions, - [IsNull.new_expr(EmptyOptions, [expr.clone()])], - ) + IsNotNull.new_expr(EmptyOptions, [expr.clone()]) })) } diff --git a/vortex-array/src/scalar_fn/fns/is_not_null.rs b/vortex-array/src/scalar_fn/fns/is_not_null.rs new file mode 100644 index 00000000000..96ce592cce0 --- /dev/null +++ b/vortex-array/src/scalar_fn/fns/is_not_null.rs @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt::Formatter; + +use vortex_error::VortexResult; +use vortex_session::VortexSession; + +use crate::ArrayRef; +use crate::ExecutionCtx; +use crate::IntoArray; +use crate::arrays::ConstantArray; +use crate::dtype::DType; +use crate::dtype::Nullability; +use crate::expr::Expression; +use crate::expr::StatsCatalog; +use crate::expr::and; +use crate::expr::eq; +use crate::expr::gt; +use crate::expr::lit; +use crate::expr::stats::Stat; +use crate::scalar_fn::Arity; +use crate::scalar_fn::ChildName; +use crate::scalar_fn::EmptyOptions; +use crate::scalar_fn::ExecutionArgs; +use crate::scalar_fn::ScalarFnId; +use crate::scalar_fn::ScalarFnVTable; +use crate::validity::Validity; + +/// Expression that checks for non-null values. +#[derive(Clone)] +pub struct IsNotNull; + +impl ScalarFnVTable for IsNotNull { + type Options = EmptyOptions; + + fn id(&self) -> ScalarFnId { + ScalarFnId::new_ref("is_not_null") + } + + fn serialize(&self, _instance: &Self::Options) -> VortexResult>> { + Ok(Some(vec![])) + } + + fn deserialize( + &self, + _metadata: &[u8], + _session: &VortexSession, + ) -> VortexResult { + Ok(EmptyOptions) + } + + fn arity(&self, _options: &Self::Options) -> Arity { + Arity::Exact(1) + } + + fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName { + match child_idx { + 0 => ChildName::from("input"), + _ => unreachable!("Invalid child index {} for IsNotNull expression", child_idx), + } + } + + fn fmt_sql( + &self, + _options: &Self::Options, + expr: &Expression, + f: &mut Formatter<'_>, + ) -> std::fmt::Result { + write!(f, "is_not_null(")?; + expr.child(0).fmt_sql(f)?; + write!(f, ")") + } + + fn return_dtype(&self, _options: &Self::Options, _arg_dtypes: &[DType]) -> VortexResult { + Ok(DType::Bool(Nullability::NonNullable)) + } + + fn execute( + &self, + _data: &Self::Options, + args: &dyn ExecutionArgs, + _ctx: &mut ExecutionCtx, + ) -> VortexResult { + let child = args.get(0)?; + if let Some(scalar) = child.as_constant() { + return Ok(ConstantArray::new(!scalar.is_null(), args.row_count()).into_array()); + } + + match child.validity()? { + Validity::NonNullable | Validity::AllValid => { + Ok(ConstantArray::new(true, args.row_count()).into_array()) + } + Validity::AllInvalid => Ok(ConstantArray::new(false, args.row_count()).into_array()), + Validity::Array(a) => Ok(a), + } + } + + fn is_null_sensitive(&self, _instance: &Self::Options) -> bool { + true + } + + fn is_fallible(&self, _instance: &Self::Options) -> bool { + false + } + + fn stat_falsification( + &self, + _options: &Self::Options, + expr: &Expression, + catalog: &dyn StatsCatalog, + ) -> Option { + // is_not_null is falsified when ALL values are null, i.e. null_count == row_count. + // Since there is no RowCount stat in the zone map, we approximate using IsConstant: + // if the zone is constant and has any nulls, then all values must be null. + // + // TODO(vortex-6040): Add a RowCount stat to enable the more general falsification: + // null_count == row_count => is_not_null is all false. + let null_count_expr = expr.child(0).stat_expression(Stat::NullCount, catalog)?; + let is_constant_expr = expr.child(0).stat_expression(Stat::IsConstant, catalog)?; + // If the zone is constant (is_constant == true) and has nulls (null_count > 0), + // then all values must be null, so is_not_null is all false. + Some(and( + eq(is_constant_expr, lit(true)), + gt(null_count_expr, lit(0u64)), + )) + } +} + +#[cfg(test)] +mod tests { + use vortex_buffer::buffer; + use vortex_error::VortexExpect as _; + + use crate::IntoArray; + use crate::arrays::PrimitiveArray; + use crate::arrays::StructArray; + use crate::dtype::DType; + use crate::dtype::Nullability; + use crate::expr::get_item; + use crate::expr::is_not_null; + use crate::expr::root; + use crate::expr::test_harness; + use crate::scalar::Scalar; + + #[test] + fn dtype() { + let dtype = test_harness::struct_dtype(); + assert_eq!( + is_not_null(root()).return_dtype(&dtype).unwrap(), + DType::Bool(Nullability::NonNullable) + ); + } + + #[test] + fn replace_children() { + let expr = is_not_null(root()); + expr.with_children([root()]) + .vortex_expect("operation should succeed in test"); + } + + #[test] + fn evaluate_mask() { + let test_array = + PrimitiveArray::from_option_iter(vec![Some(1), None, Some(2), None, Some(3)]) + .into_array(); + let expected = [true, false, true, false, true]; + + let result = test_array.clone().apply(&is_not_null(root())).unwrap(); + + assert_eq!(result.len(), test_array.len()); + assert_eq!(result.dtype(), &DType::Bool(Nullability::NonNullable)); + + for (i, expected_value) in expected.iter().enumerate() { + assert_eq!( + result.scalar_at(i).unwrap(), + Scalar::bool(*expected_value, Nullability::NonNullable) + ); + } + } + + #[test] + fn evaluate_all_true() { + let test_array = buffer![1, 2, 3, 4, 5].into_array(); + + let result = test_array.clone().apply(&is_not_null(root())).unwrap(); + + assert_eq!(result.len(), test_array.len()); + for i in 0..result.len() { + assert_eq!( + result.scalar_at(i).unwrap(), + Scalar::bool(true, Nullability::NonNullable) + ); + } + } + + #[test] + fn evaluate_all_false() { + let test_array = + PrimitiveArray::from_option_iter(vec![None::, None, None, None, None]) + .into_array(); + + let result = test_array.clone().apply(&is_not_null(root())).unwrap(); + + assert_eq!(result.len(), test_array.len()); + for i in 0..result.len() { + assert_eq!( + result.scalar_at(i).unwrap(), + Scalar::bool(false, Nullability::NonNullable) + ); + } + } + + #[test] + fn evaluate_struct() { + let test_array = StructArray::from_fields(&[( + "a", + PrimitiveArray::from_option_iter(vec![Some(1), None, Some(2), None, Some(3)]) + .into_array(), + )]) + .unwrap() + .into_array(); + let expected = [true, false, true, false, true]; + + let result = test_array + .clone() + .apply(&is_not_null(get_item("a", root()))) + .unwrap(); + + assert_eq!(result.len(), test_array.len()); + assert_eq!(result.dtype(), &DType::Bool(Nullability::NonNullable)); + + for (i, expected_value) in expected.iter().enumerate() { + assert_eq!( + result.scalar_at(i).unwrap(), + Scalar::bool(*expected_value, Nullability::NonNullable) + ); + } + } + + #[test] + fn test_display() { + let expr = is_not_null(get_item("name", root())); + assert_eq!(expr.to_string(), "is_not_null($.name)"); + + let expr2 = is_not_null(root()); + assert_eq!(expr2.to_string(), "is_not_null($)"); + } + + #[test] + fn test_is_not_null_sensitive() { + use crate::expr::col; + assert!(is_not_null(col("a")).signature().is_null_sensitive()); + } + + #[test] + fn test_is_not_null_falsification() { + use vortex_utils::aliases::hash_map::HashMap; + use vortex_utils::aliases::hash_set::HashSet; + + use crate::dtype::Field; + use crate::dtype::FieldPath; + use crate::dtype::FieldPathSet; + use crate::expr::and; + use crate::expr::col; + use crate::expr::eq; + use crate::expr::gt; + use crate::expr::lit; + use crate::expr::pruning::checked_pruning_expr; + use crate::expr::stats::Stat; + + let expr = is_not_null(col("a")); + + let (pruning_expr, st) = checked_pruning_expr( + &expr, + &FieldPathSet::from_iter([ + FieldPath::from_iter([ + Field::Name("a".into()), + Field::Name("null_count".into()), + ]), + FieldPath::from_iter([ + Field::Name("a".into()), + Field::Name("is_constant".into()), + ]), + ]), + ) + .unwrap(); + + assert_eq!( + &pruning_expr, + &and( + eq(col("a_is_constant"), lit(true)), + gt(col("a_null_count"), lit(0u64)), + ) + ); + assert_eq!( + st.map(), + &HashMap::from_iter([( + FieldPath::from_name("a"), + HashSet::from([Stat::NullCount, Stat::IsConstant]) + )]) + ); + } +} diff --git a/vortex-array/src/scalar_fn/fns/mod.rs b/vortex-array/src/scalar_fn/fns/mod.rs index 94fc8fb0384..8fa1b66532d 100644 --- a/vortex-array/src/scalar_fn/fns/mod.rs +++ b/vortex-array/src/scalar_fn/fns/mod.rs @@ -8,6 +8,7 @@ pub mod cast; pub mod dynamic; pub mod fill_null; pub mod get_item; +pub mod is_not_null; pub mod is_null; pub mod like; pub mod list_contains; diff --git a/vortex-array/src/scalar_fn/session.rs b/vortex-array/src/scalar_fn/session.rs index eef759bf8e3..3c78f56928d 100644 --- a/vortex-array/src/scalar_fn/session.rs +++ b/vortex-array/src/scalar_fn/session.rs @@ -14,6 +14,7 @@ use crate::scalar_fn::fns::binary::Binary; use crate::scalar_fn::fns::cast::Cast; use crate::scalar_fn::fns::fill_null::FillNull; use crate::scalar_fn::fns::get_item::GetItem; +use crate::scalar_fn::fns::is_not_null::IsNotNull; use crate::scalar_fn::fns::is_null::IsNull; use crate::scalar_fn::fns::like::Like; use crate::scalar_fn::fns::list_contains::ListContains; @@ -58,6 +59,7 @@ impl Default for ScalarFnSession { this.register(Cast); this.register(FillNull); this.register(GetItem); + this.register(IsNotNull); this.register(IsNull); this.register(Like); this.register(ListContains); diff --git a/vortex-datafusion/src/convert/exprs.rs b/vortex-datafusion/src/convert/exprs.rs index 455a5c65a71..55b64dabf10 100644 --- a/vortex-datafusion/src/convert/exprs.rs +++ b/vortex-datafusion/src/convert/exprs.rs @@ -26,6 +26,7 @@ use vortex::expr::Expression; use vortex::expr::and_collect; use vortex::expr::cast; use vortex::expr::get_item; +use vortex::expr::is_not_null; use vortex::expr::is_null; use vortex::expr::list_contains; use vortex::expr::lit; @@ -244,7 +245,7 @@ impl ExpressionConvertor for DefaultExpressionConvertor { if let Some(is_not_null_expr) = df.as_any().downcast_ref::() { let arg = self.convert(is_not_null_expr.arg().as_ref())?; - return Ok(not(is_null(arg))); + return Ok(is_not_null(arg)); } if let Some(in_list) = df.as_any().downcast_ref::() { diff --git a/vortex-duckdb/src/convert/expr.rs b/vortex-duckdb/src/convert/expr.rs index 80d92ede449..cf1e6640eba 100644 --- a/vortex-duckdb/src/convert/expr.rs +++ b/vortex-duckdb/src/convert/expr.rs @@ -13,6 +13,7 @@ use vortex::error::vortex_err; use vortex::expr::Expression; use vortex::expr::and_collect; use vortex::expr::col; +use vortex::expr::is_not_null; use vortex::expr::is_null; use vortex::expr::list_contains; use vortex::expr::lit; @@ -105,7 +106,7 @@ pub fn try_from_bound_expression( DUCKDB_VX_EXPR_TYPE::DUCKDB_VX_EXPR_TYPE_OPERATOR_NOT => not(child), DUCKDB_VX_EXPR_TYPE::DUCKDB_VX_EXPR_TYPE_OPERATOR_IS_NULL => is_null(child), DUCKDB_VX_EXPR_TYPE::DUCKDB_VX_EXPR_TYPE_OPERATOR_IS_NOT_NULL => { - not(is_null(child)) + is_not_null(child) } _ => unreachable!(), } diff --git a/vortex-duckdb/src/convert/table_filter.rs b/vortex-duckdb/src/convert/table_filter.rs index 7741fd64d0a..fa9003bc09c 100644 --- a/vortex-duckdb/src/convert/table_filter.rs +++ b/vortex-duckdb/src/convert/table_filter.rs @@ -12,10 +12,10 @@ use vortex::error::vortex_bail; use vortex::expr::Expression; use vortex::expr::and_collect; use vortex::expr::get_item; +use vortex::expr::is_not_null; use vortex::expr::is_null; use vortex::expr::list_contains; use vortex::expr::lit; -use vortex::expr::not; use vortex::expr::or_collect; use vortex::scalar::Scalar; use vortex::scalar_fn::ScalarFnVTableExt; @@ -61,7 +61,7 @@ pub fn try_from_table_filter( or_collect(children).unwrap_or_else(|| lit(false)) } TableFilterClass::IsNull => is_null(col.clone()), - TableFilterClass::IsNotNull => not(is_null(col.clone())), + TableFilterClass::IsNotNull => is_not_null(col.clone()), TableFilterClass::StructExtract(name, child_filter) => { return try_from_table_filter(child_filter, &get_item(name, col.clone()), scope_dtype); } diff --git a/vortex-layout/src/layouts/dict/reader.rs b/vortex-layout/src/layouts/dict/reader.rs index 5054fcd27f3..80ecceda198 100644 --- a/vortex-layout/src/layouts/dict/reader.rs +++ b/vortex-layout/src/layouts/dict/reader.rs @@ -252,9 +252,8 @@ mod tests { use vortex_array::dtype::FieldNames; use vortex_array::dtype::Nullability; use vortex_array::expr::eq; - use vortex_array::expr::is_null; + use vortex_array::expr::is_not_null; use vortex_array::expr::lit; - use vortex_array::expr::not; use vortex_array::expr::pack; use vortex_array::expr::root; use vortex_array::validity::Validity; @@ -466,7 +465,7 @@ mod tests { .await .unwrap(); - let expression = not(is_null(root())); // easier to test not_is_null b/c that's the validity array + let expression = is_not_null(root()); assert_eq!(layout.encoding_id(), LayoutId::new_ref("vortex.dict")); let actual = layout .new_reader("".into(), segments, &SESSION) diff --git a/vortex-python/python/vortex/_lib/expr.pyi b/vortex-python/python/vortex/_lib/expr.pyi index c1777095743..c69307266de 100644 --- a/vortex-python/python/vortex/_lib/expr.pyi +++ b/vortex-python/python/vortex/_lib/expr.pyi @@ -35,3 +35,4 @@ def not_(child: Expr) -> Expr: ... def and_(left: Expr, right: Expr) -> Expr: ... def cast(child: Expr, dtype: DType) -> Expr: ... def is_null(child: Expr) -> Expr: ... +def is_not_null(child: Expr) -> Expr: ... diff --git a/vortex-python/python/vortex/substrait.py b/vortex-python/python/vortex/substrait.py index 28a144f62ef..1bab4b87d2b 100644 --- a/vortex-python/python/vortex/substrait.py +++ b/vortex-python/python/vortex/substrait.py @@ -498,7 +498,7 @@ def extension_function( case "is_null": return _expr.is_null case "is_not_null": - return _is_not_null + return _expr.is_not_null case name: raise NotImplementedError(f"Function name {name} not supported") case "https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml": @@ -517,13 +517,6 @@ def extension_function( raise NotImplementedError(f"Extension URI {uri} not supported") -def _is_not_null(e: _expr.Expr) -> _expr.Expr: - """ - Helper function to have a well-typed callable to return - """ - return _expr.not_(_expr.is_null(e)) - - def expression( substrait_object: Expression, functions: list[Callable[..., _expr.Expr]], diff --git a/vortex-python/src/expr/mod.rs b/vortex-python/src/expr/mod.rs index dfd306c6820..4aa01decaa6 100644 --- a/vortex-python/src/expr/mod.rs +++ b/vortex-python/src/expr/mod.rs @@ -35,6 +35,7 @@ pub(crate) fn init(py: Python, parent: &Bound) -> PyResult<()> { m.add_function(wrap_pyfunction!(and_, &m)?)?; m.add_function(wrap_pyfunction!(cast, &m)?)?; m.add_function(wrap_pyfunction!(is_null, &m)?)?; + m.add_function(wrap_pyfunction!(is_not_null, &m)?)?; m.add_class::()?; Ok(()) @@ -422,3 +423,19 @@ pub fn is_null(child: PyExpr) -> PyResult { inner: expr::is_null(child.into_inner()), }) } + +/// Creates an expression that checks for non-null values. +/// +/// Parameters +/// ---------- +/// child : :class:`vortex.Expr` +/// +/// Returns +/// ------- +/// :class:`vortex.Expr` +#[pyfunction] +pub fn is_not_null(child: PyExpr) -> PyResult { + Ok(PyExpr { + inner: expr::is_not_null(child.into_inner()), + }) +}