diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs b/datafusion/functions-aggregate/src/approx_distinct.rs index cf8217fe981de..d6cc711147b5d 100644 --- a/datafusion/functions-aggregate/src/approx_distinct.rs +++ b/datafusion/functions-aggregate/src/approx_distinct.rs @@ -31,13 +31,17 @@ use datafusion_common::ScalarValue; use datafusion_common::{ downcast_value, internal_err, not_impl_err, DataFusionError, Result, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; -use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, +}; use std::any::Any; use std::fmt::{Debug, Formatter}; use std::hash::Hash; use std::marker::PhantomData; +use std::sync::OnceLock; make_udaf_expr_and_func!( ApproxDistinct, approx_distinct, @@ -303,4 +307,33 @@ impl AggregateUDFImpl for ApproxDistinct { }; Ok(accumulator) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_approx_distinct_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_approx_distinct_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_APPROXIMATE) + .with_description( + "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.", + ) + .with_syntax_example("approx_distinct(expression)") + .with_sql_example(r#"```sql +> SELECT approx_distinct(column_name) FROM table_name; ++-----------------------------------+ +| approx_distinct(column_name) | ++-----------------------------------+ +| 42 | ++-----------------------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } diff --git a/datafusion/functions-aggregate/src/approx_median.rs b/datafusion/functions-aggregate/src/approx_median.rs index 7a7b12432544a..84442fa5a2e65 100644 --- a/datafusion/functions-aggregate/src/approx_median.rs +++ b/datafusion/functions-aggregate/src/approx_median.rs @@ -19,15 +19,19 @@ use std::any::Any; use std::fmt::Debug; +use std::sync::OnceLock; use arrow::{datatypes::DataType, datatypes::Field}; use arrow_schema::DataType::{Float64, UInt64}; use datafusion_common::{not_impl_err, plan_err, Result}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::NUMERICS; use datafusion_expr::utils::format_state_name; -use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, +}; use crate::approx_percentile_cont::ApproxPercentileAccumulator; @@ -116,4 +120,33 @@ impl AggregateUDFImpl for ApproxMedian { acc_args.exprs[0].data_type(acc_args.schema)?, ))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_approx_median_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_approx_median_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_APPROXIMATE) + .with_description( + "Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`.", + ) + .with_syntax_example("approx_median(expression)") + .with_sql_example(r#"```sql +> SELECT approx_median(column_name) FROM table_name; ++-----------------------------------+ +| approx_median(column_name) | ++-----------------------------------+ +| 23.5 | ++-----------------------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index 5578aebbf4031..9b8a99e977d2d 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::fmt::{Debug, Formatter}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{Array, RecordBatch}; use arrow::compute::{filter, is_not_null}; @@ -34,12 +34,13 @@ use datafusion_common::{ downcast_value, internal_err, not_impl_datafusion_err, not_impl_err, plan_err, DataFusionError, Result, ScalarValue, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS}; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, ColumnarValue, Expr, Signature, TypeSignature, - Volatility, + Accumulator, AggregateUDFImpl, ColumnarValue, Documentation, Expr, Signature, + TypeSignature, Volatility, }; use datafusion_functions_aggregate_common::tdigest::{ TDigest, TryIntoF64, DEFAULT_MAX_SIZE, @@ -268,6 +269,36 @@ impl AggregateUDFImpl for ApproxPercentileCont { } Ok(arg_types[0].clone()) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_approx_percentile_cont_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_approx_percentile_cont_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_APPROXIMATE) + .with_description( + "Returns the approximate percentile of input values using the t-digest algorithm.", + ) + .with_syntax_example("approx_percentile_cont(expression, percentile, centroids)") + .with_sql_example(r#"```sql +> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name; ++-------------------------------------------------+ +| approx_percentile_cont(column_name, 0.75, 100) | ++-------------------------------------------------+ +| 65.0 | ++-------------------------------------------------+ +```"#) + .with_standard_argument("expression", "The") + .with_argument("percentile", "Percentile to compute. Must be a float value between 0 and 1 (inclusive).") + .with_argument("centroids", "Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory.") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs index fee67ba1623db..a5362713a6fb7 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::fmt::{Debug, Formatter}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::{ array::ArrayRef, @@ -26,10 +26,13 @@ use arrow::{ use datafusion_common::ScalarValue; use datafusion_common::{not_impl_err, plan_err, Result}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::NUMERICS; use datafusion_expr::Volatility::Immutable; -use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, TypeSignature}; +use datafusion_expr::{ + Accumulator, AggregateUDFImpl, Documentation, Signature, TypeSignature, +}; use datafusion_functions_aggregate_common::tdigest::{ Centroid, TDigest, DEFAULT_MAX_SIZE, }; @@ -151,6 +154,37 @@ impl AggregateUDFImpl for ApproxPercentileContWithWeight { fn state_fields(&self, args: StateFieldsArgs) -> Result> { self.approx_percentile_cont.state_fields(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_approx_percentile_cont_with_weight_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_approx_percentile_cont_with_weight_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_APPROXIMATE) + .with_description( + "Returns the weighted approximate percentile of input values using the t-digest algorithm.", + ) + .with_syntax_example("approx_percentile_cont_with_weight(expression, weight, percentile)") + .with_sql_example(r#"```sql +> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90) FROM table_name; ++----------------------------------------------------------------------+ +| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) | ++----------------------------------------------------------------------+ +| 78.5 | ++----------------------------------------------------------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .with_argument("weight", "Expression to use as weight. Can be a constant, column, or function, and any combination of arithmetic operators.") + .with_argument("percentile", "Percentile to compute. Must be a float value between 0 and 1 (inclusive).") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index 15146fc4a2d89..28ff6fb346e50 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -25,15 +25,16 @@ use datafusion_common::cast::as_list_array; use datafusion_common::utils::{array_into_list_array_nullable, get_row_at_idx}; use datafusion_common::{exec_err, ScalarValue}; use datafusion_common::{internal_err, Result}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; -use datafusion_expr::AggregateUDFImpl; use datafusion_expr::{Accumulator, Signature, Volatility}; +use datafusion_expr::{AggregateUDFImpl, Documentation}; use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays; use datafusion_functions_aggregate_common::utils::ordering_fields; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use std::collections::{HashSet, VecDeque}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; make_udaf_expr_and_func!( ArrayAgg, @@ -142,6 +143,35 @@ impl AggregateUDFImpl for ArrayAgg { fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF { datafusion_expr::ReversedUDAF::Reversed(array_agg_udaf()) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_array_agg_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_array_agg_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns an array created from the expression elements. If ordering is required, elements are inserted in the specified order.", + ) + .with_syntax_example("array_agg(expression [ORDER BY expression])") + .with_sql_example(r#"```sql +> SELECT array_agg(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| array_agg(column_name ORDER BY other_column) | ++-----------------------------------------------+ +| [element1, element2, element3] | ++-----------------------------------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/average.rs b/datafusion/functions-aggregate/src/average.rs index ddad76a8734b0..8782f8cfcc7c6 100644 --- a/datafusion/functions-aggregate/src/average.rs +++ b/datafusion/functions-aggregate/src/average.rs @@ -28,12 +28,14 @@ use arrow::datatypes::{ Float64Type, UInt64Type, }; use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::{avg_return_type, coerce_avg_type}; use datafusion_expr::utils::format_state_name; use datafusion_expr::Volatility::Immutable; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, EmitTo, GroupsAccumulator, ReversedUDAF, Signature, + Accumulator, AggregateUDFImpl, Documentation, EmitTo, GroupsAccumulator, + ReversedUDAF, Signature, }; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::NullState; @@ -45,7 +47,7 @@ use datafusion_functions_aggregate_common::utils::DecimalAverager; use log::debug; use std::any::Any; use std::fmt::Debug; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; make_udaf_expr_and_func!( Avg, @@ -235,6 +237,36 @@ impl AggregateUDFImpl for Avg { } coerce_avg_type(self.name(), arg_types) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_avg_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_avg_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the average of numeric values in the specified column.", + ) + .with_syntax_example("avg(expression)") + .with_sql_example( + r#"```sql +> SELECT avg(column_name) FROM table_name; ++---------------------------+ +| avg(column_name) | ++---------------------------+ +| 42.75 | ++---------------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } /// An accumulator to compute the average diff --git a/datafusion/functions-aggregate/src/bool_and_or.rs b/datafusion/functions-aggregate/src/bool_and_or.rs index 7cc7d9ff7fec3..63ad1ea573d56 100644 --- a/datafusion/functions-aggregate/src/bool_and_or.rs +++ b/datafusion/functions-aggregate/src/bool_and_or.rs @@ -18,6 +18,7 @@ //! Defines physical expressions that can evaluated at runtime during query execution use std::any::Any; +use std::sync::OnceLock; use arrow::array::ArrayRef; use arrow::array::BooleanArray; @@ -29,10 +30,12 @@ use arrow::datatypes::Field; use datafusion_common::internal_err; use datafusion_common::{downcast_value, not_impl_err}; use datafusion_common::{DataFusionError, Result, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity}; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, GroupsAccumulator, ReversedUDAF, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, ReversedUDAF, + Signature, Volatility, }; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::bool_op::BooleanGroupsAccumulator; @@ -172,6 +175,36 @@ impl AggregateUDFImpl for BoolAnd { fn reverse_expr(&self) -> ReversedUDAF { ReversedUDAF::Identical } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_bool_and_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_bool_and_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns true if all non-null input values are true, otherwise false.", + ) + .with_syntax_example("bool_and(expression)") + .with_sql_example( + r#"```sql +> SELECT bool_and(column_name) FROM table_name; ++----------------------------+ +| bool_and(column_name) | ++----------------------------+ +| true | ++----------------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } #[derive(Debug, Default)] @@ -293,6 +326,34 @@ impl AggregateUDFImpl for BoolOr { fn reverse_expr(&self) -> ReversedUDAF { ReversedUDAF::Identical } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_bool_or_doc()) + } +} + +fn get_bool_or_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns true if any non-null input value is true, otherwise false.", + ) + .with_syntax_example("bool_or(expression)") + .with_sql_example( + r#"```sql +> SELECT bool_or(column_name) FROM table_name; ++----------------------------+ +| bool_or(column_name) | ++----------------------------+ +| true | ++----------------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } #[derive(Debug, Default)] diff --git a/datafusion/functions-aggregate/src/correlation.rs b/datafusion/functions-aggregate/src/correlation.rs index 88f01b06d2d9b..d5dc482d68d2c 100644 --- a/datafusion/functions-aggregate/src/correlation.rs +++ b/datafusion/functions-aggregate/src/correlation.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::fmt::Debug; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::compute::{and, filter, is_not_null}; use arrow::{ @@ -30,11 +30,12 @@ use arrow::{ use crate::covariance::CovarianceAccumulator; use crate::stddev::StddevAccumulator; use datafusion_common::{plan_err, Result, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; use datafusion_expr::{ function::{AccumulatorArgs, StateFieldsArgs}, type_coercion::aggregates::NUMERICS, utils::format_state_name, - Accumulator, AggregateUDFImpl, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, }; use datafusion_functions_aggregate_common::stats::StatsType; @@ -107,6 +108,37 @@ impl AggregateUDFImpl for Correlation { ), ]) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_corr_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_corr_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STATISTICAL) + .with_description( + "Returns the coefficient of correlation between two numeric values.", + ) + .with_syntax_example("corr(expression1, expression2)") + .with_sql_example( + r#"```sql +> SELECT corr(column1, column2) FROM table_name; ++--------------------------------+ +| corr(column1, column2) | ++--------------------------------+ +| 0.85 | ++--------------------------------+ +```"#, + ) + .with_standard_argument("expression1", "First") + .with_standard_argument("expression2", "Second") + .build() + .unwrap() + }) } /// An accumulator to compute correlation diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs index cc245b3572ec6..2511c70c4608a 100644 --- a/datafusion/functions-aggregate/src/count.rs +++ b/datafusion/functions-aggregate/src/count.rs @@ -20,8 +20,9 @@ use datafusion_common::stats::Precision; use datafusion_functions_aggregate_common::aggregate::count_distinct::BytesViewDistinctCountAccumulator; use datafusion_physical_expr::expressions; use std::collections::HashSet; +use std::fmt::Debug; use std::ops::BitAnd; -use std::{fmt::Debug, sync::Arc}; +use std::sync::{Arc, OnceLock}; use arrow::{ array::{ArrayRef, AsArray}, @@ -43,10 +44,11 @@ use arrow::{ use datafusion_common::{ downcast_value, internal_err, not_impl_err, DataFusionError, Result, ScalarValue, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::{ function::AccumulatorArgs, utils::format_state_name, Accumulator, AggregateUDFImpl, - EmitTo, GroupsAccumulator, Signature, Volatility, + Documentation, EmitTo, GroupsAccumulator, Signature, Volatility, }; use datafusion_expr::{Expr, ReversedUDAF, StatisticsArgs, TypeSignature}; use datafusion_functions_aggregate_common::aggregate::count_distinct::{ @@ -324,6 +326,41 @@ impl AggregateUDFImpl for Count { } None } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_count_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_count_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the number of non-null values in the specified column. To include null values in the total count, use `count(*)`.", + ) + .with_syntax_example("count(expression)") + .with_sql_example(r#"```sql +> SELECT count(column_name) FROM table_name; ++-----------------------+ +| count(column_name) | ++-----------------------+ +| 100 | ++-----------------------+ + +> SELECT count(*) FROM table_name; ++------------------+ +| count(*) | ++------------------+ +| 120 | ++------------------+ +```"#) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/covariance.rs b/datafusion/functions-aggregate/src/covariance.rs index d0abb079ef155..f3b323d74d30a 100644 --- a/datafusion/functions-aggregate/src/covariance.rs +++ b/datafusion/functions-aggregate/src/covariance.rs @@ -18,6 +18,7 @@ //! [`CovarianceSample`]: covariance sample aggregations. use std::fmt::Debug; +use std::sync::OnceLock; use arrow::{ array::{ArrayRef, Float64Array, UInt64Array}, @@ -29,11 +30,12 @@ use datafusion_common::{ downcast_value, plan_err, unwrap_or_internal_err, DataFusionError, Result, ScalarValue, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; use datafusion_expr::{ function::{AccumulatorArgs, StateFieldsArgs}, type_coercion::aggregates::NUMERICS, utils::format_state_name, - Accumulator, AggregateUDFImpl, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, }; use datafusion_functions_aggregate_common::stats::StatsType; @@ -124,6 +126,35 @@ impl AggregateUDFImpl for CovarianceSample { fn aliases(&self) -> &[String] { &self.aliases } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_covar_samp_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_covar_samp_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STATISTICAL) + .with_description("Returns the sample covariance of a set of number pairs.") + .with_syntax_example("covar_samp(expression1, expression2)") + .with_sql_example( + r#"```sql +> SELECT covar_samp(column1, column2) FROM table_name; ++-----------------------------------+ +| covar_samp(column1, column2) | ++-----------------------------------+ +| 8.25 | ++-----------------------------------+ +```"#, + ) + .with_standard_argument("expression1", "First") + .with_standard_argument("expression2", "Second") + .build() + .unwrap() + }) } pub struct CovariancePopulation { @@ -193,6 +224,35 @@ impl AggregateUDFImpl for CovariancePopulation { StatsType::Population, )?)) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_covar_pop_doc()) + } +} + +fn get_covar_pop_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STATISTICAL) + .with_description( + "Returns the population covariance of a set of number pairs.", + ) + .with_syntax_example("covar_pop(expression1, expression2)") + .with_sql_example( + r#"```sql +> SELECT covar_pop(column1, column2) FROM table_name; ++-----------------------------------+ +| covar_pop(column1, column2) | ++-----------------------------------+ +| 7.63 | ++-----------------------------------+ +```"#, + ) + .with_standard_argument("expression1", "First") + .with_standard_argument("expression2", "Second") + .build() + .unwrap() + }) } /// An accumulator to compute covariance diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs index 41ac7875795d1..f6a84c84dcb0b 100644 --- a/datafusion/functions-aggregate/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::fmt::Debug; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ArrayRef, AsArray, BooleanArray}; use arrow::compute::{self, lexsort_to_indices, SortColumn}; @@ -28,11 +28,12 @@ use datafusion_common::utils::{compare_rows, get_row_at_idx, take_arrays}; use datafusion_common::{ arrow_datafusion_err, internal_err, DataFusionError, Result, ScalarValue, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity}; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, ArrayFunctionSignature, Expr, ExprFunctionExt, - Signature, SortExpr, TypeSignature, Volatility, + Accumulator, AggregateUDFImpl, ArrayFunctionSignature, Documentation, Expr, + ExprFunctionExt, Signature, SortExpr, TypeSignature, Volatility, }; use datafusion_functions_aggregate_common::utils::get_sort_options; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; @@ -165,6 +166,35 @@ impl AggregateUDFImpl for FirstValue { fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF { datafusion_expr::ReversedUDAF::Reversed(last_value_udaf()) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_first_value_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_first_value_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the first element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group.", + ) + .with_syntax_example("first_value(expression [ORDER BY expression])") + .with_sql_example(r#"```sql +> SELECT first_value(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| first_value(column_name ORDER BY other_column)| ++-----------------------------------------------+ +| first_element | ++-----------------------------------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } #[derive(Debug)] @@ -466,6 +496,33 @@ impl AggregateUDFImpl for LastValue { fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF { datafusion_expr::ReversedUDAF::Reversed(first_value_udaf()) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_last_value_doc()) + } +} + +fn get_last_value_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the last element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group.", + ) + .with_syntax_example("last_value(expression [ORDER BY expression])") + .with_sql_example(r#"```sql +> SELECT last_value(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| last_value(column_name ORDER BY other_column) | ++-----------------------------------------------+ +| last_element | ++-----------------------------------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/grouping.rs b/datafusion/functions-aggregate/src/grouping.rs index 6fb7c3800f4ed..09e9b90b2e6d1 100644 --- a/datafusion/functions-aggregate/src/grouping.rs +++ b/datafusion/functions-aggregate/src/grouping.rs @@ -19,14 +19,18 @@ use std::any::Any; use std::fmt; +use std::sync::OnceLock; use arrow::datatypes::DataType; use arrow::datatypes::Field; use datafusion_common::{not_impl_err, Result}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::utils::format_state_name; -use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, +}; make_udaf_expr_and_func!( Grouping, @@ -94,4 +98,37 @@ impl AggregateUDFImpl for Grouping { "physical plan is not yet implemented for GROUPING aggregate function" ) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_grouping_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_grouping_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns 1 if the data is aggregated across the specified column, or 0 if it is not aggregated in the result set.", + ) + .with_syntax_example("grouping(expression)") + .with_sql_example(r#"```sql +> SELECT column_name, GROUPING(column_name) AS group_column + FROM table_name + GROUP BY GROUPING SETS ((column_name), ()); ++-------------+-------------+ +| column_name | group_column | ++-------------+-------------+ +| value1 | 0 | +| value2 | 0 | +| NULL | 1 | ++-------------+-------------+ +```"#, + ) + .with_argument("expression", "Expression to evaluate whether data is aggregated across the specified column. Can be a constant, column, or function.") + .build() + .unwrap() + }) } diff --git a/datafusion/functions-aggregate/src/median.rs b/datafusion/functions-aggregate/src/median.rs index 7dd0de14c3c0c..5a0cac2c829ed 100644 --- a/datafusion/functions-aggregate/src/median.rs +++ b/datafusion/functions-aggregate/src/median.rs @@ -16,8 +16,8 @@ // under the License. use std::collections::HashSet; -use std::fmt::Formatter; -use std::{fmt::Debug, sync::Arc}; +use std::fmt::{Debug, Formatter}; +use std::sync::{Arc, OnceLock}; use arrow::array::{downcast_integer, ArrowNumericType}; use arrow::{ @@ -33,10 +33,11 @@ use arrow::array::ArrowNativeTypeOp; use arrow::datatypes::ArrowNativeType; use datafusion_common::{DataFusionError, Result, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::{ function::AccumulatorArgs, utils::format_state_name, Accumulator, AggregateUDFImpl, - Signature, Volatility, + Documentation, Signature, Volatility, }; use datafusion_functions_aggregate_common::utils::Hashable; @@ -152,6 +153,34 @@ impl AggregateUDFImpl for Median { fn aliases(&self) -> &[String] { &[] } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_median_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_median_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description("Returns the median value in the specified column.") + .with_syntax_example("median(expression)") + .with_sql_example( + r#"```sql +> SELECT median(column_name) FROM table_name; ++----------------------+ +| median(column_name) | ++----------------------+ +| 45.5 | ++----------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } /// The median accumulator accumulates the raw input values diff --git a/datafusion/functions-aggregate/src/min_max.rs b/datafusion/functions-aggregate/src/min_max.rs index 7d2e8e66e2cd4..5586675209cb2 100644 --- a/datafusion/functions-aggregate/src/min_max.rs +++ b/datafusion/functions-aggregate/src/min_max.rs @@ -38,6 +38,7 @@ use datafusion_common::stats::Precision; use datafusion_common::{ downcast_value, exec_err, internal_err, ColumnStatistics, DataFusionError, Result, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator; use datafusion_physical_expr::expressions; use std::fmt::Debug; @@ -51,11 +52,13 @@ use arrow::datatypes::{ use datafusion_common::ScalarValue; use datafusion_expr::{ - function::AccumulatorArgs, Accumulator, AggregateUDFImpl, Signature, Volatility, + function::AccumulatorArgs, Accumulator, AggregateUDFImpl, Documentation, Signature, + Volatility, }; use datafusion_expr::{GroupsAccumulator, StatisticsArgs}; use half::f16; use std::ops::Deref; +use std::sync::OnceLock; fn get_min_max_result_type(input_types: &[DataType]) -> Result> { // make sure that the input types only has one element. @@ -321,6 +324,34 @@ impl AggregateUDFImpl for Max { fn value_from_stats(&self, statistics_args: &StatisticsArgs) -> Option { self.value_from_statistics(statistics_args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_max_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_max_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description("Returns the maximum value in the specified column.") + .with_syntax_example("max(expression)") + .with_sql_example( + r#"```sql +> SELECT max(column_name) FROM table_name; ++----------------------+ +| max(column_name) | ++----------------------+ +| 150 | ++----------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } // Statically-typed version of min/max(array) -> ScalarValue for string types @@ -1119,7 +1150,34 @@ impl AggregateUDFImpl for Min { fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF { datafusion_expr::ReversedUDAF::Identical } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_min_doc()) + } +} + +fn get_min_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description("Returns the minimum value in the specified column.") + .with_syntax_example("min(expression)") + .with_sql_example( + r#"```sql +> SELECT min(column_name) FROM table_name; ++----------------------+ +| min(column_name) | ++----------------------+ +| 12 | ++----------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } + /// An accumulator to compute the minimum value #[derive(Debug)] pub struct MinAccumulator { diff --git a/datafusion/functions-aggregate/src/nth_value.rs b/datafusion/functions-aggregate/src/nth_value.rs index bbfe56914c910..6d8cea8f0531f 100644 --- a/datafusion/functions-aggregate/src/nth_value.rs +++ b/datafusion/functions-aggregate/src/nth_value.rs @@ -20,18 +20,19 @@ use std::any::Any; use std::collections::VecDeque; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{new_empty_array, ArrayRef, AsArray, StructArray}; use arrow_schema::{DataType, Field, Fields}; use datafusion_common::utils::{array_into_list_array_nullable, get_row_at_idx}; use datafusion_common::{exec_err, internal_err, not_impl_err, Result, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - lit, Accumulator, AggregateUDFImpl, ExprFunctionExt, ReversedUDAF, Signature, - SortExpr, Volatility, + lit, Accumulator, AggregateUDFImpl, Documentation, ExprFunctionExt, ReversedUDAF, + Signature, SortExpr, Volatility, }; use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays; use datafusion_functions_aggregate_common::utils::ordering_fields; @@ -161,6 +162,40 @@ impl AggregateUDFImpl for NthValueAgg { fn reverse_expr(&self) -> ReversedUDAF { ReversedUDAF::Reversed(nth_value_udaf()) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_nth_value_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_nth_value_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STATISTICAL) + .with_description( + "Returns the nth value in a group of values.", + ) + .with_syntax_example("nth_value(expression, n ORDER BY expression)") + .with_sql_example(r#"```sql +> SELECT dept_id, salary, NTH_VALUE(salary, 2) OVER (PARTITION BY dept_id ORDER BY salary ASC) AS second_salary_by_dept + FROM employee; ++---------+--------+-------------------------+ +| dept_id | salary | second_salary_by_dept | ++---------+--------+-------------------------+ +| 1 | 30000 | NULL | +| 1 | 40000 | 40000 | +| 1 | 50000 | 40000 | +| 2 | 35000 | NULL | +| 2 | 45000 | 45000 | ++---------+--------+-------------------------+ +```"#) + .with_standard_argument("expression", "The column or expression to retrieve the nth value from.") + .with_argument("n", "The position (nth) of the value to retrieve, based on the ordering.") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/stddev.rs b/datafusion/functions-aggregate/src/stddev.rs index a25ab5e319915..332a8efcc0f98 100644 --- a/datafusion/functions-aggregate/src/stddev.rs +++ b/datafusion/functions-aggregate/src/stddev.rs @@ -19,17 +19,19 @@ use std::any::Any; use std::fmt::{Debug, Formatter}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::Float64Array; use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field}; use datafusion_common::{internal_err, not_impl_err, Result}; use datafusion_common::{plan_err, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, GroupsAccumulator, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, Signature, + Volatility, }; use datafusion_functions_aggregate_common::stats::StatsType; @@ -132,6 +134,34 @@ impl AggregateUDFImpl for Stddev { ) -> Result> { Ok(Box::new(StddevGroupsAccumulator::new(StatsType::Sample))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_stddev_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_stddev_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STATISTICAL) + .with_description("Returns the standard deviation of a set of numbers.") + .with_syntax_example("stddev(expression)") + .with_sql_example( + r#"```sql +> SELECT stddev(column_name) FROM table_name; ++----------------------+ +| stddev(column_name) | ++----------------------+ +| 12.34 | ++----------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } make_udaf_expr_and_func!( @@ -228,6 +258,34 @@ impl AggregateUDFImpl for StddevPop { StatsType::Population, ))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_stddev_pop_doc()) + } +} + +fn get_stddev_pop_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STATISTICAL) + .with_description( + "Returns the population standard deviation of a set of numbers.", + ) + .with_syntax_example("stddev_pop(expression)") + .with_sql_example( + r#"```sql +> SELECT stddev_pop(column_name) FROM table_name; ++--------------------------+ +| stddev_pop(column_name) | ++--------------------------+ +| 10.56 | ++--------------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } /// An accumulator to compute the average diff --git a/datafusion/functions-aggregate/src/string_agg.rs b/datafusion/functions-aggregate/src/string_agg.rs index a7e9a37e23ad6..66fc19910696c 100644 --- a/datafusion/functions-aggregate/src/string_agg.rs +++ b/datafusion/functions-aggregate/src/string_agg.rs @@ -22,12 +22,14 @@ use arrow_schema::DataType; use datafusion_common::cast::as_generic_string_array; use datafusion_common::Result; use datafusion_common::{not_impl_err, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, Signature, TypeSignature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, Signature, TypeSignature, Volatility, }; use datafusion_physical_expr::expressions::Literal; use std::any::Any; +use std::sync::OnceLock; make_udaf_expr_and_func!( StringAgg, @@ -98,6 +100,37 @@ impl AggregateUDFImpl for StringAgg { not_impl_err!("expect literal") } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_string_agg_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_string_agg_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Concatenates the values of string expressions and places separator values between them." + ) + .with_syntax_example("string_agg(expression, delimiter)") + .with_sql_example(r#"```sql +> SELECT string_agg(name, ', ') AS names_list + FROM employee; ++--------------------------+ +| names_list | ++--------------------------+ +| Alice, Bob, Charlie | ++--------------------------+ +```"#, + ) + .with_argument("expression", "The string expression to concatenate. Can be a column or any valid string expression.") + .with_argument("delimiter", "A literal string used as a separator between the concatenated values.") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/sum.rs b/datafusion/functions-aggregate/src/sum.rs index 7e40c1bd17a8d..3b561f3028dea 100644 --- a/datafusion/functions-aggregate/src/sum.rs +++ b/datafusion/functions-aggregate/src/sum.rs @@ -21,6 +21,7 @@ use ahash::RandomState; use datafusion_expr::utils::AggregateOrderSensitivity; use std::any::Any; use std::collections::HashSet; +use std::sync::OnceLock; use arrow::array::Array; use arrow::array::ArrowNativeTypeOp; @@ -33,11 +34,13 @@ use arrow::datatypes::{ }; use arrow::{array::ArrayRef, datatypes::Field}; use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, GroupsAccumulator, ReversedUDAF, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, ReversedUDAF, + Signature, Volatility, }; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator; use datafusion_functions_aggregate_common::utils::Hashable; @@ -233,6 +236,34 @@ impl AggregateUDFImpl for Sum { fn order_sensitivity(&self) -> AggregateOrderSensitivity { AggregateOrderSensitivity::Insensitive } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_sum_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_sum_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description("Returns the sum of all values in the specified column.") + .with_syntax_example("sum(expression)") + .with_sql_example( + r#"```sql +> SELECT sum(column_name) FROM table_name; ++-----------------------+ +| sum(column_name) | ++-----------------------+ +| 12345 | ++-----------------------+ +```"#, + ) + .with_standard_argument("expression", "The") + .build() + .unwrap() + }) } /// This accumulator computes SUM incrementally diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md index 1c25874c08064..4f774fe6d0f0c 100644 --- a/docs/source/user-guide/sql/aggregate_functions.md +++ b/docs/source/user-guide/sql/aggregate_functions.md @@ -27,183 +27,9 @@ the rest of the documentation. [automatically created from the codebase]: https://github.com/apache/datafusion/issues/12740 -## General - -- [avg](#avg) -- [bool_and](#bool_and) -- [bool_or](#bool_or) -- [count](#count) -- [max](#max) -- [mean](#mean) -- [median](#median) -- [min](#min) -- [sum](#sum) -- [array_agg](#array_agg) -- [first_value](#first_value) -- [last_value](#last_value) - -### `avg` - -Returns the average of numeric values in the specified column. - -``` -avg(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -#### Aliases - -- `mean` - -### `bool_and` - -Returns true if all non-null input values are true, otherwise false. - -``` -bool_and(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `bool_or` - -Returns true if any non-null input value is true, otherwise false. - -``` -bool_or(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `count` - -Returns the number of non-null values in the specified column. - -To include _null_ values in the total count, use `count(*)`. - -``` -count(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `max` - -Returns the maximum value in the specified column. - -``` -max(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `mean` - -_Alias of [avg](#avg)._ - -### `median` - -Returns the median value in the specified column. - -``` -median(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `min` - -Returns the minimum value in the specified column. - -``` -min(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `sum` - -Returns the sum of all values in the specified column. - -``` -sum(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `array_agg` - -Returns an array created from the expression elements. If ordering requirement is given, elements are inserted in the order of required ordering. - -``` -array_agg(expression [ORDER BY expression]) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `first_value` - -Returns the first element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group. - -``` -first_value(expression [ORDER BY expression]) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `last_value` - -Returns the last element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group. - -``` -last_value(expression [ORDER BY expression]) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - ## Statistical -- [corr](#corr) - [covar](#covar) -- [covar_pop](#covar_pop) -- [covar_samp](#covar_samp) -- [stddev](#stddev) -- [stddev_pop](#stddev_pop) -- [stddev_samp](#stddev_samp) - [regr_avgx](#regr_avgx) - [regr_avgy](#regr_avgy) - [regr_count](#regr_count) @@ -214,21 +40,6 @@ last_value(expression [ORDER BY expression]) - [regr_syy](#regr_syy) - [regr_sxy](#regr_sxy) -### `corr` - -Returns the coefficient of correlation between two numeric values. - -``` -corr(expression1, expression2) -``` - -#### Arguments - -- **expression1**: First expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression2**: Second expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - ### `covar` Returns the covariance of a set of number pairs. @@ -244,75 +55,6 @@ covar(expression1, expression2) - **expression2**: Second expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. -### `covar_pop` - -Returns the population covariance of a set of number pairs. - -``` -covar_pop(expression1, expression2) -``` - -#### Arguments - -- **expression1**: First expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression2**: Second expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `covar_samp` - -Returns the sample covariance of a set of number pairs. - -``` -covar_samp(expression1, expression2) -``` - -#### Arguments - -- **expression1**: First expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression2**: Second expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `stddev` - -Returns the standard deviation of a set of numbers. - -``` -stddev(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `stddev_pop` - -Returns the population standard deviation of a set of numbers. - -``` -stddev_pop(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `stddev_samp` - -Returns the sample standard deviation of a set of numbers. - -``` -stddev_samp(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - ### `regr_slope` Returns the slope of the linear regression line for non-null pairs in aggregate columns. @@ -448,74 +190,3 @@ regr_sxy(expression_y, expression_x) Can be a constant, column, or function, and any combination of arithmetic operators. - **expression_x**: Independent variable. Can be a constant, column, or function, and any combination of arithmetic operators. - -## Approximate - -- [approx_distinct](#approx_distinct) -- [approx_median](#approx_median) -- [approx_percentile_cont](#approx_percentile_cont) -- [approx_percentile_cont_with_weight](#approx_percentile_cont_with_weight) - -### `approx_distinct` - -Returns the approximate number of distinct input values calculated using the -HyperLogLog algorithm. - -``` -approx_distinct(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `approx_median` - -Returns the approximate median (50th percentile) of input values. -It is an alias of `approx_percentile_cont(x, 0.5)`. - -``` -approx_median(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `approx_percentile_cont` - -Returns the approximate percentile of input values using the t-digest algorithm. - -``` -approx_percentile_cont(expression, percentile, centroids) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **percentile**: Percentile to compute. Must be a float value between 0 and 1 (inclusive). -- **centroids**: Number of centroids to use in the t-digest algorithm. _Default is 100_. - - If there are this number or fewer unique values, you can expect an exact result. - A higher number of centroids results in a more accurate approximation, but - requires more memory to compute. - -### `approx_percentile_cont_with_weight` - -Returns the weighted approximate percentile of input values using the -t-digest algorithm. - -``` -approx_percentile_cont_with_weight(expression, weight, percentile) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **weight**: Expression to use as weight. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **percentile**: Percentile to compute. Must be a float value between 0 and 1 (inclusive). diff --git a/docs/source/user-guide/sql/aggregate_functions_new.md b/docs/source/user-guide/sql/aggregate_functions_new.md index 08cdb0a9867c4..fc918c3b15eaf 100644 --- a/docs/source/user-guide/sql/aggregate_functions_new.md +++ b/docs/source/user-guide/sql/aggregate_functions_new.md @@ -37,15 +37,80 @@ Aggregate functions operate on a set of values to compute a single result. ## General Functions +- [array_agg](#array_agg) +- [avg](#avg) - [bit_and](#bit_and) - [bit_or](#bit_or) - [bit_xor](#bit_xor) +- [bool_and](#bool_and) +- [bool_or](#bool_or) +- [count](#count) +- [first_value](#first_value) +- [grouping](#grouping) +- [last_value](#last_value) +- [max](#max) +- [mean](#mean) +- [median](#median) +- [min](#min) +- [string_agg](#string_agg) +- [sum](#sum) - [var](#var) - [var_pop](#var_pop) - [var_population](#var_population) - [var_samp](#var_samp) - [var_sample](#var_sample) +### `array_agg` + +Returns an array created from the expression elements. If ordering is required, elements are inserted in the specified order. + +``` +array_agg(expression [ORDER BY expression]) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT array_agg(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| array_agg(column_name ORDER BY other_column) | ++-----------------------------------------------+ +| [element1, element2, element3] | ++-----------------------------------------------+ +``` + +### `avg` + +Returns the average of numeric values in the specified column. + +``` +avg(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **Aliases: **: `mean` + +#### Example + +```sql +> SELECT avg(column_name) FROM table_name; ++---------------------------+ +| avg(column_name) | ++---------------------------+ +| 42.75 | ++---------------------------+ +``` + +#### Aliases + +- mean + ### `bit_and` Computes the bitwise AND of all non-null input values. @@ -82,6 +147,276 @@ bit_xor(expression) - **expression**: Integer expression to operate on. Can be a constant, column, or function, and any combination of operators. +### `bool_and` + +Returns true if all non-null input values are true, otherwise false. + +``` +bool_and(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT bool_and(column_name) FROM table_name; ++----------------------------+ +| bool_and(column_name) | ++----------------------------+ +| true | ++----------------------------+ +``` + +### `bool_or` + +Returns true if all non-null input values are true, otherwise false. + +``` +bool_and(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT bool_and(column_name) FROM table_name; ++----------------------------+ +| bool_and(column_name) | ++----------------------------+ +| true | ++----------------------------+ +``` + +### `count` + +Returns the number of non-null values in the specified column. To include null values in the total count, use `count(*)`. + +``` +count(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT count(column_name) FROM table_name; ++-----------------------+ +| count(column_name) | ++-----------------------+ +| 100 | ++-----------------------+ + +> SELECT count(*) FROM table_name; ++------------------+ +| count(*) | ++------------------+ +| 120 | ++------------------+ +``` + +### `first_value` + +Returns the first element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group. + +``` +first_value(expression [ORDER BY expression]) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT first_value(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| first_value(column_name ORDER BY other_column)| ++-----------------------------------------------+ +| first_element | ++-----------------------------------------------+ +``` + +### `grouping` + +Returns 1 if the data is aggregated across the specified column, or 0 if it is not aggregated in the result set. + +``` +grouping(expression) +``` + +#### Arguments + +- **expression**: Expression to evaluate whether data is aggregated across the specified column. Can be a constant, column, or function. + +#### Example + +```sql +> SELECT column_name, GROUPING(column_name) AS group_column + FROM table_name + GROUP BY GROUPING SETS ((column_name), ()); ++-------------+-------------+ +| column_name | group_column | ++-------------+-------------+ +| value1 | 0 | +| value2 | 0 | +| NULL | 1 | ++-------------+-------------+ +``` + +### `last_value` + +Returns the first element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group. + +``` +first_value(expression [ORDER BY expression]) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT first_value(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| first_value(column_name ORDER BY other_column)| ++-----------------------------------------------+ +| first_element | ++-----------------------------------------------+ +``` + +### `max` + +Returns the maximum value in the specified column. + +``` +max(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT max(column_name) FROM table_name; ++----------------------+ +| max(column_name) | ++----------------------+ +| 150 | ++----------------------+ +``` + +### `mean` + +_Alias of [avg](#avg)._ + +### `median` + +Returns the median value in the specified column. + +``` +median(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT median(column_name) FROM table_name; ++----------------------+ +| median(column_name) | ++----------------------+ +| 45.5 | ++----------------------+ +``` + +### `min` + +Returns the maximum value in the specified column. + +``` +max(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT max(column_name) FROM table_name; ++----------------------+ +| max(column_name) | ++----------------------+ +| 150 | ++----------------------+ +``` + +### `string_agg` + +Concatenates the values of string expressions and places separator values between them. + +``` +string_agg(expression, delimiter) +``` + +#### Arguments + +- **expression**: The string expression to concatenate. Can be a column or any valid string expression. +- **delimiter**: A literal string used as a separator between the concatenated values. + +#### Example + +```sql +> SELECT string_agg(name, ', ') AS names_list + FROM employee; ++--------------------------+ +| names_list | ++--------------------------+ +| Alice, Bob, Charlie | ++--------------------------+ +``` + +### `sum` + +Returns the sum of all values in the specified column. + +``` +sum(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT sum(column_name) FROM table_name; ++-----------------------+ +| sum(column_name) | ++-----------------------+ +| 12345 | ++-----------------------+ +``` + ### `var` Returns the statistical sample variance of a set of numbers. @@ -126,3 +461,280 @@ _Alias of [var](#var)._ ### `var_sample` _Alias of [var](#var)._ + +## Statistical Functions + +- [corr](#corr) +- [covar](#covar) +- [covar_pop](#covar_pop) +- [covar_samp](#covar_samp) +- [nth_value](#nth_value) +- [stddev](#stddev) +- [stddev_pop](#stddev_pop) +- [stddev_samp](#stddev_samp) + +### `corr` + +Returns the coefficient of correlation between two numeric values. + +``` +corr(expression1, expression2) +``` + +#### Arguments + +- **expression1**: First expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **expression2**: Second expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT corr(column1, column2) FROM table_name; ++--------------------------------+ +| corr(column1, column2) | ++--------------------------------+ +| 0.85 | ++--------------------------------+ +``` + +### `covar` + +_Alias of [covar_samp](#covar_samp)._ + +### `covar_pop` + +Returns the sample covariance of a set of number pairs. + +``` +covar_samp(expression1, expression2) +``` + +#### Arguments + +- **expression1**: First expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **expression2**: Second expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT covar_samp(column1, column2) FROM table_name; ++-----------------------------------+ +| covar_samp(column1, column2) | ++-----------------------------------+ +| 8.25 | ++-----------------------------------+ +``` + +### `covar_samp` + +Returns the sample covariance of a set of number pairs. + +``` +covar_samp(expression1, expression2) +``` + +#### Arguments + +- **expression1**: First expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **expression2**: Second expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT covar_samp(column1, column2) FROM table_name; ++-----------------------------------+ +| covar_samp(column1, column2) | ++-----------------------------------+ +| 8.25 | ++-----------------------------------+ +``` + +#### Aliases + +- covar + +### `nth_value` + +Returns the nth value in a group of values. + +``` +nth_value(expression, n ORDER BY expression) +``` + +#### Arguments + +- **expression**: The column or expression to retrieve the nth value from. expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **n**: The position (nth) of the value to retrieve, based on the ordering. + +#### Example + +```sql +> SELECT dept_id, salary, NTH_VALUE(salary, 2) OVER (PARTITION BY dept_id ORDER BY salary ASC) AS second_salary_by_dept + FROM employee; ++---------+--------+-------------------------+ +| dept_id | salary | second_salary_by_dept | ++---------+--------+-------------------------+ +| 1 | 30000 | NULL | +| 1 | 40000 | 40000 | +| 1 | 50000 | 40000 | +| 2 | 35000 | NULL | +| 2 | 45000 | 45000 | ++---------+--------+-------------------------+ +``` + +### `stddev` + +Returns the standard deviation of a set of numbers. + +``` +stddev(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT stddev(column_name) FROM table_name; ++----------------------+ +| stddev(column_name) | ++----------------------+ +| 12.34 | ++----------------------+ +``` + +#### Aliases + +- stddev_samp + +### `stddev_pop` + +Returns the standard deviation of a set of numbers. + +``` +stddev(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT stddev(column_name) FROM table_name; ++----------------------+ +| stddev(column_name) | ++----------------------+ +| 12.34 | ++----------------------+ +``` + +### `stddev_samp` + +_Alias of [stddev](#stddev)._ + +## Approximate Functions + +- [approx_distinct](#approx_distinct) +- [approx_median](#approx_median) +- [approx_percentile_cont](#approx_percentile_cont) +- [approx_percentile_cont_with_weight](#approx_percentile_cont_with_weight) + +### `approx_distinct` + +Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm. + +``` +approx_distinct(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT approx_distinct(column_name) FROM table_name; ++-----------------------------------+ +| approx_distinct(column_name) | ++-----------------------------------+ +| 42 | ++-----------------------------------+ +``` + +### `approx_median` + +Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`. + +``` +approx_median(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT approx_median(column_name) FROM table_name; ++-----------------------------------+ +| approx_median(column_name) | ++-----------------------------------+ +| 23.5 | ++-----------------------------------+ +``` + +### `approx_percentile_cont` + +Returns the approximate percentile of input values using the t-digest algorithm. + +``` +approx_percentile_cont(expression, percentile, centroids) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **percentile**: Percentile to compute. Must be a float value between 0 and 1 (inclusive). +- **centroids**: Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory. + +#### Example + +```sql +> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name; ++-------------------------------------------------+ +| approx_percentile_cont(column_name, 0.75, 100) | ++-------------------------------------------------+ +| 65.0 | ++-------------------------------------------------+ +``` + +### `approx_percentile_cont_with_weight` + +Returns the weighted approximate percentile of input values using the t-digest algorithm. + +``` +approx_percentile_cont_with_weight(expression, weight, percentile) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **weight**: Expression to use as weight. Can be a constant, column, or function, and any combination of arithmetic operators. +- **percentile**: Percentile to compute. Must be a float value between 0 and 1 (inclusive). + +#### Example + +```sql +> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90) FROM table_name; ++----------------------------------------------------------------------+ +| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) | ++----------------------------------------------------------------------+ +| 78.5 | ++----------------------------------------------------------------------+ +```