From b86d27db1eeaf23bde170c60f6eeecc5427ebb5c Mon Sep 17 00:00:00 2001 From: Anastasios Bakogiannis Date: Thu, 16 Apr 2026 15:26:32 +0200 Subject: [PATCH] Use ListArray nullability instead of offsets for `array_element`, `array_any_value`. (#21672) ## Which issue does this PR close? - Closes #. https://github.com/apache/datafusion/issues/21671 ## Rationale for this change Align the implementation of `array_element`, `array_any_value` with Arrow's spec concerning Null rows. Specifically allow the below from [Arrow's spec](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) > It should be noted that a null value may have a positive slot length. That is, a null value may occupy a non-empty memory space in the data buffer. When this is true, the content of the corresponding memory space is undefined. ## Are these changes tested? Unit tests included. (cherry picked from commit 4c7bb08f9339c890cca8b0ba59fff702924b4225) --- datafusion/functions-nested/src/extract.rs | 63 +++++++++++++++++++--- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs index 0f7246c8589cf..6e4b3d971a566 100644 --- a/datafusion/functions-nested/src/extract.rs +++ b/datafusion/functions-nested/src/extract.rs @@ -263,7 +263,7 @@ where let len = end - start; // array is null - if len == O::usize_as(0) { + if array.is_null(row_index) { mutable.extend_nulls(1); continue; } @@ -1090,11 +1090,9 @@ where for (row_index, offset_window) in array.offsets().windows(2).enumerate() { let start = offset_window[0]; - let end = offset_window[1]; - let len = end - start; // array is null - if len == O::usize_as(0) { + if array.is_null(row_index) { mutable.extend_nulls(1); continue; } @@ -1127,14 +1125,18 @@ where #[cfg(test)] mod tests { - use super::{array_element_udf, general_list_view_array_slice}; + use super::{ + array_element_udf, general_array_any_value, general_array_element, + general_list_view_array_slice, + }; use arrow::array::{ Array, ArrayRef, GenericListViewArray, Int32Array, Int64Array, ListViewArray, cast::AsArray, }; - use arrow::buffer::ScalarBuffer; + use arrow::array::{ListArray, RecordBatch}; + use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow::datatypes::{DataType, Field}; - use datafusion_common::{Column, DFSchema, Result}; + use datafusion_common::{Column, DFSchema, Result, assert_batches_eq}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::{Expr, ExprSchemable}; use std::collections::HashMap; @@ -1195,6 +1197,53 @@ mod tests { ); } + #[test] + fn test_array_element_null_handling() -> Result<()> { + let values = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 4, 5])); + let nulls = NullBuffer::from(vec![true, false, true]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + + let list_array = ListArray::new(field, offsets, values, Some(nulls)); + let indexes = Int64Array::from(vec![1, 1, 1]); + + let result = general_array_element(&list_array, &indexes)?; + + let expected = [ + "+--------+", + "| result |", + "+--------+", + "| 1 |", + "| |", + "| 5 |", + "+--------+", + ]; + + let batch = RecordBatch::try_from_iter([("result", result)])?; + + assert_batches_eq!(expected, &[batch]); + + Ok(()) + } + + #[test] + fn test_array_any_null_handling() -> Result<()> { + let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 4, 5])); + let nulls = NullBuffer::from(vec![true, false, true]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + + let list_array = ListArray::new(field, offsets, values, Some(nulls)); + + let result = general_array_any_value(&list_array)?; + + assert!(!result.is_null(0)); + assert!(result.is_null(1)); + assert!(!result.is_null(2)); + + Ok(()) + } + #[test] fn test_array_slice_list_view_basic() -> Result<()> { let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]));