Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions rust/arrow/src/array/array_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -580,11 +580,11 @@ mod tests {

let sliced_array = list_array.slice(1, 6);
assert_eq!(6, sliced_array.len());
assert_eq!(1, sliced_array.offset());
assert_eq!(0, sliced_array.offset());
assert_eq!(3, sliced_array.null_count());

for i in 0..sliced_array.len() {
if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
if bit_util::get_bit(&null_bits, 1 + i) {
assert!(sliced_array.is_valid(i));
} else {
assert!(sliced_array.is_null(i));
Expand All @@ -594,11 +594,11 @@ mod tests {
// Check offset and length for each non-null value.
let sliced_list_array =
sliced_array.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(2, sliced_list_array.value_offset(2));
assert_eq!(0, sliced_list_array.value_offset(2));
assert_eq!(2, sliced_list_array.value_length(2));
assert_eq!(4, sliced_list_array.value_offset(3));
assert_eq!(2, sliced_list_array.value_offset(3));
assert_eq!(2, sliced_list_array.value_length(3));
assert_eq!(6, sliced_list_array.value_offset(5));
assert_eq!(4, sliced_list_array.value_offset(5));
assert_eq!(3, sliced_list_array.value_length(5));
}

Expand Down Expand Up @@ -645,11 +645,11 @@ mod tests {

let sliced_array = list_array.slice(1, 6);
assert_eq!(6, sliced_array.len());
assert_eq!(1, sliced_array.offset());
assert_eq!(0, sliced_array.offset());
assert_eq!(3, sliced_array.null_count());

for i in 0..sliced_array.len() {
if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
if bit_util::get_bit(&null_bits, 1 + i) {
assert!(sliced_array.is_valid(i));
} else {
assert!(sliced_array.is_null(i));
Expand All @@ -661,11 +661,11 @@ mod tests {
.as_any()
.downcast_ref::<LargeListArray>()
.unwrap();
assert_eq!(2, sliced_list_array.value_offset(2));
assert_eq!(0, sliced_list_array.value_offset(2));
assert_eq!(2, sliced_list_array.value_length(2));
assert_eq!(4, sliced_list_array.value_offset(3));
assert_eq!(2, sliced_list_array.value_offset(3));
assert_eq!(2, sliced_list_array.value_length(3));
assert_eq!(6, sliced_list_array.value_offset(5));
assert_eq!(4, sliced_list_array.value_offset(5));
assert_eq!(3, sliced_list_array.value_length(5));
}

Expand Down
6 changes: 3 additions & 3 deletions rust/arrow/src/array/array_primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ mod tests {

let arr2 = arr.slice(2, 5);
assert_eq!(5, arr2.len());
assert_eq!(2, arr2.offset());
assert_eq!(0, arr2.offset());
assert_eq!(1, arr2.null_count());

for i in 0..arr2.len() {
Expand All @@ -683,7 +683,7 @@ mod tests {

let arr3 = arr2.slice(2, 3);
assert_eq!(3, arr3.len());
assert_eq!(4, arr3.offset());
assert_eq!(0, arr3.offset());
assert_eq!(0, arr3.null_count());

let int_arr = arr3.as_any().downcast_ref::<Int32Array>().unwrap();
Expand Down Expand Up @@ -713,7 +713,7 @@ mod tests {

let arr2 = arr.slice(3, 5);
assert_eq!(5, arr2.len());
assert_eq!(3, arr2.offset());
assert_eq!(0, arr2.offset());
assert_eq!(1, arr2.null_count());

let bool_arr = arr2.as_any().downcast_ref::<BooleanArray>().unwrap();
Expand Down
16 changes: 8 additions & 8 deletions rust/arrow/src/array/array_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,11 @@ impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
len = Some(child_datum_len)
}
child_data.push(child_datum.clone());
fields.push(Field::new(
field_name,
array.data_type().clone(),
child_datum.null_buffer().is_some(),
));
let null_buffer = child_datum.null_buffer();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// A field is nullable if the null buffer is Some and not all bits are set.
let nullable = null_buffer.is_some()
&& null_buffer.unwrap().count_set_bits() < child_datum.len();
fields.push(Field::new(field_name, array.data_type().clone(), nullable));

if let Some(child_null_buffer) = child_datum.null_buffer() {
let child_datum_offset = child_datum.offset();
Expand Down Expand Up @@ -485,7 +485,7 @@ mod tests {
let sliced_array = struct_array.slice(2, 3);
let sliced_array = sliced_array.as_any().downcast_ref::<StructArray>().unwrap();
assert_eq!(3, sliced_array.len());
assert_eq!(2, sliced_array.offset());
assert_eq!(0, sliced_array.offset());
assert_eq!(1, sliced_array.null_count());
assert!(sliced_array.is_valid(0));
assert!(sliced_array.is_null(1));
Expand All @@ -494,7 +494,7 @@ mod tests {
let sliced_c0 = sliced_array.column(0);
let sliced_c0 = sliced_c0.as_any().downcast_ref::<BooleanArray>().unwrap();
assert_eq!(3, sliced_c0.len());
assert_eq!(2, sliced_c0.offset());
assert_eq!(0, sliced_c0.offset());
assert!(sliced_c0.is_null(0));
assert!(sliced_c0.is_null(1));
assert!(sliced_c0.is_valid(2));
Expand All @@ -503,7 +503,7 @@ mod tests {
let sliced_c1 = sliced_array.column(1);
let sliced_c1 = sliced_c1.as_any().downcast_ref::<Int32Array>().unwrap();
assert_eq!(3, sliced_c1.len());
assert_eq!(2, sliced_c1.offset());
assert_eq!(0, sliced_c1.offset());
assert!(sliced_c1.is_valid(0));
assert_eq!(42, sliced_c1.value(0));
assert!(sliced_c1.is_null(1));
Expand Down
49 changes: 32 additions & 17 deletions rust/arrow/src/array/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use crate::buffer::Buffer;
use crate::datatypes::DataType;
use crate::{bitmap::Bitmap, datatypes::ArrowNativeType};

use super::equal::equal;
use super::{equal::equal, MutableArrayData};

#[inline]
pub(crate) fn count_nulls(
Expand Down Expand Up @@ -219,17 +219,15 @@ impl ArrayData {
///
/// Panics if `offset + length > self.len()`.
pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
assert!((offset + length) <= self.len());

let mut new_data = self.clone();

new_data.len = length;
new_data.offset = offset + self.offset;

new_data.null_count =
count_nulls(new_data.null_buffer(), new_data.offset, new_data.len);
self.copy_range(offset, length)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the attempt to implement slice with MutableArrayData, but triggers ARROW-11263

}

new_data
fn copy_range(&self, offset: usize, length: usize) -> ArrayData {
assert!((offset + length) <= self.len());
let arrays = vec![self];
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps this hints the right direction:

  • Possible performance speedup because no longer clone all array data and use only part of it (think about the total length is large and the slice length very small)
  • Make the resulting ArrayData self-contained, thus less chances to mystery bugs. In this case, perhaps it's better to rename slice as copy_range to avoid mis-understanding.

The problem is: there are still 5 failures due to "not yet implemented" or "not supported"

Copy link
Contributor Author

@mqy mqy Jan 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possible performance speedup

Totally wrong, the benchmark of array_slice shows 4x - 11x regression.

let mut mutable = MutableArrayData::new(arrays, false, length);
mutable.extend(0, offset, offset + length);
mutable.freeze()
}

/// Returns the `buffer` as a slice of type `T` starting at self.offset
Expand Down Expand Up @@ -428,28 +426,45 @@ mod tests {
bit_util::set_bit(&mut bit_v, 0);
bit_util::set_bit(&mut bit_v, 3);
bit_util::set_bit(&mut bit_v, 10);
let data_vec = vec![0_u8; 64]; // align to 64 bytes
let data = ArrayData::builder(DataType::Int32)
.len(16)
.null_bit_buffer(Buffer::from(bit_v))
.add_buffer(Buffer::from(data_vec))
.build();
let data = data.as_ref();
let new_data = data.slice(1, 15);
assert_eq!(data.len() - 1, new_data.len());
assert_eq!(1, new_data.offset());
assert_eq!(0, new_data.offset());
assert_eq!(data.null_count(), new_data.null_count());

// slice of a slice (removes one null)
let new_data = new_data.slice(1, 14);
assert_eq!(data.len() - 2, new_data.len());
assert_eq!(2, new_data.offset());
assert_eq!(0, new_data.offset());
assert_eq!(data.null_count() - 1, new_data.null_count());
}

#[test]
fn test_equality() {
let int_data = ArrayData::builder(DataType::Int32).build();
let float_data = ArrayData::builder(DataType::Float32).build();
assert_ne!(int_data, float_data);
#[should_panic(expected = "index out of bounds: the len is 0 but the index is 0")]
fn test_slice_equality() {
let mut bit_v: [u8; 1] = [0; 1];
bit_util::set_bit(&mut bit_v, 1);
let data = ArrayData::builder(DataType::Int32)
.len(2)
.null_bit_buffer(Buffer::from(bit_v))
.build();
let data_slice = data.slice(1, 1);

let mut bit_v: [u8; 1] = [0; 1];
bit_util::set_bit(&mut bit_v, 0);
let arc_data = ArrayData::builder(DataType::Int32)
.len(1)
.null_bit_buffer(Buffer::from(bit_v))
.build();
let expected = arc_data.as_ref();

assert_eq!(&data_slice, expected);
}

#[test]
Expand Down
22 changes: 22 additions & 0 deletions rust/arrow/src/array/equal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,28 @@ mod tests {
test_equal(a.as_ref(), b.as_ref(), false);
}

#[test]
// Code created by @alamb, copied from https://github.com/apache/arrow/pull/9211
fn test_string_offset_larger() {
let a =
StringArray::from(vec![Some("a"), None, Some("b"), None, Some("c")]).data();
let b = StringArray::from(vec![None, Some("b"), None, Some("c")]).data();

test_equal(&a.slice(2, 2), &b.slice(0, 2), false);
test_equal(&a.slice(2, 2), &b.slice(1, 2), true);
test_equal(&a.slice(2, 2), &b.slice(2, 2), false);
}

#[test]
// Code created by @jhorstman copied from https://issues.apache.org/jira/browse/ARROW-11267
fn test_list_different_offsets() {
let a = create_list_array(&[Some(&[0, 0]), Some(&[1, 2]), Some(&[3, 4])]);
let b = create_list_array(&[Some(&[1, 2]), Some(&[3, 4]), Some(&[5, 6])]);
let a_slice = a.slice(1, 2);
let b_slice = b.slice(0, 2);
test_equal(&a_slice, &b_slice, true);
}

// Test the case where null_count > 0
#[test]
fn test_list_null() {
Expand Down
2 changes: 1 addition & 1 deletion rust/arrow/src/array/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ mod tests {
let array2 = array1.slice(8, 16);
assert_eq!(array2.len(), 16);
assert_eq!(array2.null_count(), 16);
assert_eq!(array2.offset(), 8);
assert_eq!(array2.offset(), 0);
}

#[test]
Expand Down
104 changes: 103 additions & 1 deletion rust/arrow/src/array/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -814,14 +814,116 @@ mod tests {
}

#[test]
fn test_struct() {
// should fail with assert error: not equal.
fn array_bug_1() {
let strings: ArrayRef = Arc::new(StringArray::from(vec![None, Some("b")]));
// ArrayData::slice() is not bug-free
let strings_slice = strings.slice(1, 1);
let strings2: ArrayRef = Arc::new(StringArray::from(vec![Some("b")]));

assert_eq!(
&strings2,
&strings_slice,
"\n##1: {:?}\n##2: {:?}",
strings2.data(),
strings_slice.data()
);
}

#[test]
// should fail with panic: index out of range.
fn array_bug_2() {
let strings: ArrayRef =
Arc::new(StringArray::from(vec![Some("a"), None, Some("b")]));
// ArrayData::slice() is not bug-free
let strings_slice = strings.slice(2, 1);
let strings2: ArrayRef = Arc::new(StringArray::from(vec![Some("b")]));

assert_eq!(
&strings2,
&strings_slice,
"\n##1: {:?}\n##2: {:?}",
strings2.data(),
strings_slice.data()
);
}

// should pass.
#[test]
fn array_bug_3() {
let strings: ArrayRef =
Arc::new(StringArray::from(vec![Some("a"), None, Some("b")]));

// use copy_range(), see bellow.
let strings_slice = copy_range(&strings, 2, 1);
let strings2: ArrayRef = Arc::new(StringArray::from(vec![Some("b")]));

// failed: index out of range
assert_eq!(&strings2, &strings_slice);
}

fn copy_range(array_ref: &ArrayRef, offset: usize, length: usize) -> ArrayRef {
let data_ref = array_ref.data();
let arrays = vec![&*data_ref];
let mut mutable = MutableArrayData::new(arrays, false, length);
mutable.extend(0, offset, offset + length);
let new_array = mutable.freeze();
crate::array::make_array(Arc::new(new_array))
}

#[test]
fn struct_array_bug() {
let strings: ArrayRef = Arc::new(StringArray::from(vec![
Some("joe"),
None,
None,
Some("mark"),
Some("doe"),
]));

let array = StructArray::try_from(vec![("f1", strings.clone())])
.unwrap()
.data();
let arrays = vec![array.as_ref()];
let mut mutable = MutableArrayData::new(arrays, false, 0);

mutable.extend(0, 1, 3);
let data = mutable.freeze();
let array = StructArray::from(Arc::new(data));

let expected = StructArray::try_from(vec![("f1", strings.slice(1, 2))]).unwrap();
assert_eq!(array, expected)
}

#[test]
fn test_struct_bug_2() {
let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)]));

let array = StructArray::try_from(vec![("f1", ints.clone())])
.unwrap()
.data();

let arrays = vec![array.as_ref()];
let mut mutable = MutableArrayData::new(arrays, false, 0);

mutable.extend(0, 1, 3);
let data = mutable.freeze();
let array = StructArray::from(Arc::new(data));

let expected = StructArray::try_from(vec![("f1", ints.slice(1, 2))]).unwrap();

assert_eq!(array, expected);
}

#[test]
fn test_struct() {
let strings: ArrayRef = Arc::new(StringArray::from(vec![
None,
Some("joe"),
None,
Some("mark"),
Some("doe"),
]));
let ints: ArrayRef = Arc::new(Int32Array::from(vec![
Some(1),
Some(2),
Expand Down
2 changes: 1 addition & 1 deletion rust/arrow/src/compute/kernels/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1225,7 +1225,7 @@ mod tests {
let array = Arc::new(a) as ArrayRef;
assert_eq!(0, array.offset());
let array = array.slice(2, 3);
assert_eq!(2, array.offset());
assert_eq!(0, array.offset());
let b = cast(&array, &DataType::UInt8).unwrap();
assert_eq!(3, b.len());
assert_eq!(0, b.offset());
Expand Down