diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs
index 5e0567eafea2e..4be47754716c9 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -22,11 +22,13 @@ use std::sync::Arc;
 
 use crate::strings::make_and_append_view;
 use arrow::array::{
-    new_null_array, Array, ArrayRef, GenericStringArray, GenericStringBuilder,
-    NullBufferBuilder, OffsetSizeTrait, StringBuilder, StringViewArray,
+    as_run_array, new_null_array, Array, ArrayRef, GenericStringArray,
+    GenericStringBuilder, NullBufferBuilder, OffsetSizeTrait, PrimitiveArray, RunArray,
+    StringBuilder, StringViewArray,
 };
 use arrow::buffer::{Buffer, ScalarBuffer};
-use arrow::datatypes::DataType;
+use arrow::datatypes::{DataType, RunEndIndexType};
+use arrow::datatypes::{Int16Type, Int32Type, Int64Type};
 use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
 use datafusion_common::Result;
 use datafusion_common::{exec_err, ScalarValue};
@@ -342,6 +344,26 @@ where
 
                 Ok(ColumnarValue::Array(Arc::new(string_builder.finish())))
             }
+            DataType::RunEndEncoded(run_index, value_index) => {
+                match value_index.data_type() {
+                    DataType::Utf8 => case_conversion_run_array::<i32, _>(
+                        array,
+                        op,
+                        name,
+                        run_index.data_type(),
+                    ),
+                    DataType::LargeUtf8 => case_conversion_run_array::<i64, _>(
+                        array,
+                        op,
+                        name,
+                        run_index.data_type(),
+                    ),
+                    _ => exec_err!(
+                        "Unsupported data type {:?} for function {name}",
+                        array.data_type()
+                    ),
+                }
+            }
             other => exec_err!("Unsupported data type {other:?} for function {name}"),
         },
         ColumnarValue::Scalar(scalar) => match scalar {
@@ -423,3 +445,47 @@ where
         GenericStringArray::<O>::new_unchecked(offsets, values, nulls)
     }))
 }
+
+/// Applies `op` to the values of a run-end encoded array whose values are strings
+/// with offset size `O`, dispatching on the run-end index type.
+///
+/// Returns an execution error naming `name` when `index_type` is not one of
+/// `Int16`, `Int32` or `Int64`.
+fn case_conversion_run_array<'a, O, F>(
+    array: &'a ArrayRef,
+    op: F,
+    name: &str,
+    index_type: &DataType,
+) -> Result<ColumnarValue>
+where
+    O: OffsetSizeTrait,
+    F: Fn(&'a str) -> String,
+{
+    match index_type {
+        DataType::Int16 => process_run_array::<Int16Type, O, F>(array, &op),
+        DataType::Int32 => process_run_array::<Int32Type, O, F>(array, &op),
+        DataType::Int64 => process_run_array::<Int64Type, O, F>(array, &op),
+        _ => exec_err!("Unsupported data type {index_type:?} for function {name}"),
+    }
+}
+
+/// Converts only the distinct values of a run-end encoded array and re-attaches the
+/// original run ends, so every run is converted once rather than once per row.
+fn process_run_array<'a, T, O, F>(array: &'a ArrayRef, op: &F) -> Result<ColumnarValue>
+where
+    T: RunEndIndexType,
+    O: OffsetSizeTrait,
+    F: Fn(&'a str) -> String,
+{
+    let run_array = as_run_array::<T>(array);
+    let converted_values = case_conversion_array::<O, _>(run_array.values(), op)?;
+
+    // `inner()` and `values()` expose the unsliced buffers, so rebuild the full array
+    // first and then re-apply the logical window of a (possibly sliced) input; the
+    // result must have exactly as many rows as the input.
+    let run_ends = PrimitiveArray::<T>::new(run_array.run_ends().inner().clone(), None);
+    let converted = RunArray::<T>::try_new(&run_ends, &converted_values)?
+        .slice(run_array.offset(), run_array.len());
+
+    Ok(ColumnarValue::Array(Arc::new(converted)))
+}
diff --git a/datafusion/functions/src/string/lower.rs b/datafusion/functions/src/string/lower.rs
index 536c29a7cb253..038f1bffa5e73 100644
--- a/datafusion/functions/src/string/lower.rs
+++ b/datafusion/functions/src/string/lower.rs
@@ -195,4 +195,41 @@ mod tests {
 
         to_lower(input, expected)
     }
+
+    mod ree_lower_test {
+        use super::*;
+        use arrow::array::{Int32Array, RunArray, StringArray};
+        use arrow::datatypes::Int32Type;
+
+        #[test]
+        fn test_lower_on_run_array_all_caps() -> Result<()> {
+            let run_ends = Int32Array::from(vec![3, 6, 9]);
+            let values =
+                StringArray::from(vec![Some("ARROW"), Some("DATA"), Some("XYZ123")]);
+            let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values)?;
+
+            let expected_run_ends = Int32Array::from(vec![3, 6, 9]);
+            let expected_values =
+                StringArray::from(vec![Some("arrow"), Some("data"), Some("xyz123")]);
+            let expected_run_array =
+                RunArray::<Int32Type>::try_new(&expected_run_ends, &expected_values)?;
+
+            to_lower(Arc::new(run_array), Arc::new(expected_run_array))
+        }
+
+        #[test]
+        fn test_lower_on_run_array_with_nulls_and_symbols() -> Result<()> {
+            let run_ends = Int32Array::from(vec![2, 4, 7]);
+            let values = StringArray::from(vec![Some("SNAKE"), None, Some("TeSt!@#")]);
+            let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values)?;
+
+            let expected_run_ends = Int32Array::from(vec![2, 4, 7]);
+            let expected_values =
+                StringArray::from(vec![Some("snake"), None, Some("test!@#")]);
+            let expected_run_array =
+                RunArray::<Int32Type>::try_new(&expected_run_ends, &expected_values)?;
+
+            to_lower(Arc::new(run_array), Arc::new(expected_run_array))
+        }
+    }
 }
diff --git a/datafusion/functions/src/string/upper.rs b/datafusion/functions/src/string/upper.rs
index 882fb45eda4af..402f8795b82a6 100644
--- a/datafusion/functions/src/string/upper.rs
+++ b/datafusion/functions/src/string/upper.rs
@@ -103,7 +103,6 @@ mod tests {
 
     fn to_upper(input: ArrayRef, expected: ArrayRef) -> Result<()> {
         let func = UpperFunc::new();
-
         let arg_field = Field::new("a", input.data_type().clone(), true).into();
         let args = ScalarFunctionArgs {
             number_rows: input.len(),
@@ -194,4 +193,60 @@ mod tests {
 
         to_upper(input, expected)
     }
+
+    mod ree_upper_test {
+        use super::*;
+        use arrow::array::{Int32Array, RunArray, StringArray};
+        use arrow::datatypes::Int32Type;
+
+        #[test]
+        fn test_upper_on_run_array() -> Result<()> {
+            let run_ends = Int32Array::from(vec![4, 6, 9, 11, 15]);
+            let values = StringArray::from(vec![
+                Some("arrow"),
+                None,
+                Some("datafusion"),
+                Some("@_"),
+                Some("0123456789"),
+            ]);
+            let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values)?;
+
+            let expected_run_ends = Int32Array::from(vec![4, 6, 9, 11, 15]);
+            let expected_values = StringArray::from(vec![
+                Some("ARROW"),
+                None,
+                Some("DATAFUSION"),
+                Some("@_"),
+                Some("0123456789"),
+            ]);
+            let expected_run_array =
+                RunArray::<Int32Type>::try_new(&expected_run_ends, &expected_values)?;
+
+            to_upper(Arc::new(run_array), Arc::new(expected_run_array))
+        }
+
+        #[test]
+        fn test_upper_on_run_array_mixed_case() -> Result<()> {
+            let run_ends = Int32Array::from(vec![2, 5, 8, 10]);
+            let values = StringArray::from(vec![
+                Some("Hello"),
+                Some("world"),
+                None,
+                Some("Test123"),
+            ]);
+            let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values)?;
+
+            let expected_run_ends = Int32Array::from(vec![2, 5, 8, 10]);
+            let expected_values = StringArray::from(vec![
+                Some("HELLO"),
+                Some("WORLD"),
+                None,
+                Some("TEST123"),
+            ]);
+            let expected_run_array =
+                RunArray::<Int32Type>::try_new(&expected_run_ends, &expected_values)?;
+
+            to_upper(Arc::new(run_array), Arc::new(expected_run_array))
+        }
+    }
 }