Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 62 additions & 3 deletions datafusion/functions/src/string/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@ use std::sync::Arc;

use crate::strings::make_and_append_view;
use arrow::array::{
new_null_array, Array, ArrayRef, GenericStringArray, GenericStringBuilder,
NullBufferBuilder, OffsetSizeTrait, StringBuilder, StringViewArray,
as_run_array, new_null_array, Array, ArrayRef, GenericStringArray,
GenericStringBuilder, NullBufferBuilder, OffsetSizeTrait, PrimitiveArray, RunArray,
StringBuilder, StringViewArray,
};
use arrow::buffer::{Buffer, ScalarBuffer};
use arrow::datatypes::DataType;
use arrow::datatypes::{DataType, RunEndIndexType};
use arrow::datatypes::{Int16Type, Int32Type, Int64Type};
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
use datafusion_common::Result;
use datafusion_common::{exec_err, ScalarValue};
Expand Down Expand Up @@ -342,6 +344,25 @@ where

Ok(ColumnarValue::Array(Arc::new(string_builder.finish())))
}
DataType::RunEndEncoded(run_index, value_index) => {
if value_index.data_type() == &DataType::Utf8 {
case_conversion_run_array::<i32, _>(
array,
op,
name,
&run_index.data_type(),
)
} else if value_index.data_type() == &DataType::LargeUtf8 {
Comment on lines +348 to +355
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about Utf8View? As I'm thinking about it, it doesn't seem to make sense to have a REE with value data type of Utf8View. I haven't dug deep enough to verify this though. I ask mostly for my own understanding.

case_conversion_run_array::<i64, _>(
array,
op,
name,
&run_index.data_type(),
)
} else {
exec_err!("Unsupported data type")
}
}
other => exec_err!("Unsupported data type {other:?} for function {name}"),
},
ColumnarValue::Scalar(scalar) => match scalar {
Expand Down Expand Up @@ -423,3 +444,41 @@ where
GenericStringArray::<O>::new_unchecked(offsets, values, nulls)
}))
}
/// Dispatches a case conversion over a run-end encoded array to the
/// implementation that matches the array's run-end index type.
///
/// # Arguments
/// * `array` - the run-end encoded input array
/// * `op` - the string conversion to apply
/// * `name` - function name, used only in the error message
/// * `index_type` - data type of the run-ends child (`Int16`, `Int32` or `Int64`)
///
/// `O` is the offset size forwarded to [`process_run_array`] for the values
/// child.
///
/// # Errors
/// Returns an execution error when `index_type` is not one of the three
/// supported integer types, or propagates any error from `process_run_array`.
fn case_conversion_run_array<'a, O, F>(
    array: &'a ArrayRef,
    op: F,
    name: &str,
    index_type: &DataType,
) -> Result<ColumnarValue>
where
    O: OffsetSizeTrait,
    F: Fn(&'a str) -> String,
{
    // The worker borrows the closure, so take the reference once up front.
    let convert = &op;

    if matches!(index_type, DataType::Int16) {
        return process_run_array::<Int16Type, O, _>(array, convert);
    }
    if matches!(index_type, DataType::Int32) {
        return process_run_array::<Int32Type, O, _>(array, convert);
    }
    if matches!(index_type, DataType::Int64) {
        return process_run_array::<Int64Type, O, _>(array, convert);
    }

    exec_err!("Unsupported data type {index_type:?} for function {name}")
}

/// Applies a case conversion to a run-end encoded array with run-end index
/// type `T` and returns a new run-end encoded array.
///
/// Only the `values` child is passed through `case_conversion_array`; the
/// run ends are reused as-is, so the conversion cost is per run rather than
/// per logical row.
///
/// # Type parameters
/// * `T` - run-end index type of `array`
/// * `O` - offset size forwarded to `case_conversion_array` for the values
///   child
///
/// # Errors
/// Propagates any error from `case_conversion_array` and any validation error
/// from `RunArray::try_new`.
///
/// NOTE(review): `as_run_array` is a forced downcast, so callers must pass an
/// array that really is a `RunArray<T>`.
fn process_run_array<'a, T, O, F>(array: &'a ArrayRef, op: &F) -> Result<ColumnarValue>
where
    T: RunEndIndexType,
    O: OffsetSizeTrait,
    F: Fn(&'a str) -> String,
{
    let run_array = as_run_array::<T>(array);

    let converted_values = case_conversion_array::<O, _>(run_array.values(), op)?;

    // `inner()` exposes the full, unsliced run-ends buffer. Rebuild a
    // `PrimitiveArray<T>` (no null buffer) from it so it lines up with the
    // equally unsliced values child.
    let source_run_ends = run_array.run_ends();
    let run_ends: PrimitiveArray<T> =
        PrimitiveArray::new(source_run_ends.inner().clone(), None);

    // `try_new` always yields an array that starts at logical offset 0 and
    // spans every run. Re-apply the input's logical window so that a sliced
    // input produces an output with the same offset and length instead of the
    // whole underlying array.
    let new_run_array = RunArray::<T>::try_new(&run_ends, &converted_values)?
        .slice(source_run_ends.offset(), source_run_ends.len());

    Ok(ColumnarValue::Array(Arc::new(new_run_array)))
}
40 changes: 40 additions & 0 deletions datafusion/functions/src/string/lower.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,44 @@ mod tests {

to_lower(input, expected)
}

mod ree_lower_test {
use super::*;
use arrow::array::{Int32Array, RunArray, StringArray};
use arrow::datatypes::Int32Type;
#[test]
fn test_lower_on_run_array_all_caps() -> Result<()> {
let run_ends = Int32Array::from(vec![3, 6, 9]);
let values =
StringArray::from(vec![Some("ARROW"), Some("DATA"), Some("XYZ123")]);
let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();

let expected_run_ends = Int32Array::from(vec![3, 6, 9]);
let expected_values =
StringArray::from(vec![Some("arrow"), Some("data"), Some("xyz123")]);
let expected_run_array =
RunArray::<Int32Type>::try_new(&expected_run_ends, &expected_values)
.unwrap();

to_lower(Arc::new(run_array), Arc::new(expected_run_array)).unwrap();
Ok(())
}

#[test]
fn test_lower_on_run_array_with_nulls_and_symbols() -> Result<()> {
let run_ends = Int32Array::from(vec![2, 4, 7]);
let values = StringArray::from(vec![Some("SNAKE"), None, Some("TeSt!@#")]);
let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();

let expected_run_ends = Int32Array::from(vec![2, 4, 7]);
let expected_values =
StringArray::from(vec![Some("snake"), None, Some("test!@#")]);
let expected_run_array =
RunArray::<Int32Type>::try_new(&expected_run_ends, &expected_values)
.unwrap();

to_lower(Arc::new(run_array), Arc::new(expected_run_array)).unwrap();
Ok(())
}
}
}
60 changes: 59 additions & 1 deletion datafusion/functions/src/string/upper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ mod tests {

fn to_upper(input: ArrayRef, expected: ArrayRef) -> Result<()> {
let func = UpperFunc::new();

let arg_field = Field::new("a", input.data_type().clone(), true).into();
let args = ScalarFunctionArgs {
number_rows: input.len(),
Expand Down Expand Up @@ -194,4 +193,63 @@ mod tests {

to_upper(input, expected)
}
/// Tests for `upper` on run-end encoded (REE) string arrays.
#[cfg(test)]
mod ree_upper_test {
    use super::*;
    use arrow::array::{Int32Array, RunArray, StringArray};
    use arrow::datatypes::Int32Type;

    /// Builds an `Int32`-indexed run-end encoded Utf8 array from its run ends
    /// and per-run values, propagating construction errors to the test.
    fn ree_array(
        run_ends: Vec<i32>,
        values: Vec<Option<&str>>,
    ) -> Result<RunArray<Int32Type>> {
        let run_ends = Int32Array::from(run_ends);
        let values = StringArray::from(values);
        Ok(RunArray::<Int32Type>::try_new(&run_ends, &values)?)
    }

    #[test]
    fn test_upper_on_run_array() -> Result<()> {
        let input = ree_array(
            vec![4, 6, 9, 11, 15],
            vec![
                Some("arrow"),
                None,
                Some("datafusion"),
                Some("@_"),
                Some("0123456789"),
            ],
        )?;
        let expected = ree_array(
            vec![4, 6, 9, 11, 15],
            vec![
                Some("ARROW"),
                None,
                Some("DATAFUSION"),
                Some("@_"),
                Some("0123456789"),
            ],
        )?;

        to_upper(Arc::new(input), Arc::new(expected))
    }

    #[test]
    fn test_upper_on_run_array_mixed_case() -> Result<()> {
        let input = ree_array(
            vec![2, 5, 8, 10],
            vec![Some("Hello"), Some("world"), None, Some("Test123")],
        )?;
        let expected = ree_array(
            vec![2, 5, 8, 10],
            vec![Some("HELLO"), Some("WORLD"), None, Some("TEST123")],
        )?;

        to_upper(Arc::new(input), Arc::new(expected))
    }
}
}