Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-35289: [Python] Support large variable width types in numpy conversion #36701

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
75 changes: 59 additions & 16 deletions python/pyarrow/src/arrow/python/numpy_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -223,10 +223,16 @@ class NumPyConverter {
Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); }

// NumPy ascii string arrays
template <typename T>
Status VisitBinary(T* builder);
Status Visit(const BinaryType& type);
Status Visit(const LargeBinaryType& type);

// NumPy unicode arrays
template <typename T>
Status VisitString(T* builder);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Small nitpick, but could you move those declarations of the helpers into the protected: section (e.g., just below the VisitNative definition)?

(I don't think anyone outside of pyarrow is using this, but just to keep it consistent)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just the VisitBinary and VisitString declarations specifically?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, exactly as you did.

Status Visit(const StringType& type);
Status Visit(const LargeStringType& type);

Status Visit(const StructType& type);

Expand Down Expand Up @@ -553,24 +559,23 @@ inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* d
// Create 16MB chunks for binary data
constexpr int32_t kBinaryChunksize = 1 << 24;

Status NumPyConverter::Visit(const BinaryType& type) {
::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_);

template <typename T>
Status NumPyConverter::VisitBinary(T* builder) {
auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

auto AppendNotNull = [&builder, this](const uint8_t* data) {
auto AppendNotNull = [builder, this](const uint8_t* data) {
// This is annoying. NumPy allows strings to have nul-terminators, so
// we must check for them here
const size_t item_size =
strnlen(reinterpret_cast<const char*>(data), static_cast<size_t>(itemsize_));
return builder.Append(data, static_cast<int32_t>(item_size));
return builder->Append(data, static_cast<int32_t>(item_size));
};

if (mask_ != nullptr) {
Ndarray1DIndexer<uint8_t> mask_values(mask_);
for (int64_t i = 0; i < length_; ++i) {
if (mask_values[i]) {
RETURN_NOT_OK(builder.AppendNull());
RETURN_NOT_OK(builder->AppendNull());
} else {
RETURN_NOT_OK(AppendNotNull(data));
}
Expand All @@ -583,6 +588,14 @@ Status NumPyConverter::Visit(const BinaryType& type) {
}
}

return Status::OK();
}

Status NumPyConverter::Visit(const BinaryType& type) {
::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_);

RETURN_NOT_OK(VisitBinary(&builder));

ArrayVector result;
RETURN_NOT_OK(builder.Finish(&result));
for (auto arr : result) {
Expand All @@ -591,6 +604,16 @@ Status NumPyConverter::Visit(const BinaryType& type) {
return Status::OK();
}

Status NumPyConverter::Visit(const LargeBinaryType& type) {
  // Large binary uses 64-bit offsets, so the whole NumPy array fits into a
  // single Arrow array; no 16MB chunking (ChunkedBinaryBuilder) is needed
  // the way it is for 32-bit-offset BinaryType.
  ::arrow::LargeBinaryBuilder builder(pool_);

  RETURN_NOT_OK(VisitBinary(&builder));

  // Finish produces exactly one array; hand its internal data to the output.
  std::shared_ptr<Array> out;
  RETURN_NOT_OK(builder.Finish(&out));
  return PushArray(out->data());
}

Status NumPyConverter::Visit(const FixedSizeBinaryType& type) {
auto byte_width = type.byte_width();

Expand Down Expand Up @@ -630,8 +653,8 @@ namespace {
// NumPy unicode is UCS4/UTF32 always
constexpr int kNumPyUnicodeSize = 4;

Status AppendUTF32(const char* data, int64_t itemsize, int byteorder,
::arrow::internal::ChunkedStringBuilder* builder) {
template <typename T>
Status AppendUTF32(const char* data, int64_t itemsize, int byteorder, T* builder) {
// The binary \x00\x00\x00\x00 indicates a nul terminator in NumPy unicode,
// so we need to detect that here to truncate if necessary. Yep.
Py_ssize_t actual_length = 0;
Expand Down Expand Up @@ -659,11 +682,8 @@ Status AppendUTF32(const char* data, int64_t itemsize, int byteorder,

} // namespace

Status NumPyConverter::Visit(const StringType& type) {
util::InitializeUTF8();

::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);

template <typename T>
Status NumPyConverter::VisitString(T* builder) {
auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

char numpy_byteorder = dtype_->byteorder;
Expand Down Expand Up @@ -707,23 +727,23 @@ Status NumPyConverter::Visit(const StringType& type) {
auto AppendNonNullValue = [&](const uint8_t* data) {
if (is_binary_type) {
if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) {
return builder.Append(data, static_cast<int32_t>(itemsize_));
return builder->Append(data, static_cast<int32_t>(itemsize_));
} else {
return Status::Invalid("Encountered non-UTF8 binary value: ",
HexEncode(data, itemsize_));
}
} else {
// is_unicode_type case
return AppendUTF32(reinterpret_cast<const char*>(data), itemsize_, byteorder,
&builder);
builder);
}
};

if (mask_ != nullptr) {
Ndarray1DIndexer<uint8_t> mask_values(mask_);
for (int64_t i = 0; i < length_; ++i) {
if (mask_values[i]) {
RETURN_NOT_OK(builder.AppendNull());
RETURN_NOT_OK(builder->AppendNull());
} else {
RETURN_NOT_OK(AppendNonNullValue(data));
}
Expand All @@ -736,6 +756,16 @@ Status NumPyConverter::Visit(const StringType& type) {
}
}

return Status::OK();
}

Status NumPyConverter::Visit(const StringType& type) {
util::InitializeUTF8();

::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);

RETURN_NOT_OK(VisitString(&builder));

ArrayVector result;
RETURN_NOT_OK(builder.Finish(&result));
for (auto arr : result) {
Expand All @@ -744,6 +774,19 @@ Status NumPyConverter::Visit(const StringType& type) {
return Status::OK();
}

Status NumPyConverter::Visit(const LargeStringType& type) {
  // Validating UTF-8 below requires the lookup tables to be initialized.
  util::InitializeUTF8();

  // Large strings use 64-bit offsets, so a single array suffices — no 16MB
  // chunking (ChunkedStringBuilder) as in the 32-bit-offset StringType path.
  ::arrow::LargeStringBuilder builder(pool_);

  RETURN_NOT_OK(VisitString(&builder));

  std::shared_ptr<Array> result;
  RETURN_NOT_OK(builder.Finish(&result));
  // Return PushArray's status directly, matching the LargeBinaryType visitor.
  return PushArray(result->data());
}

Status NumPyConverter::Visit(const StructType& type) {
std::vector<NumPyConverter> sub_converters;
std::vector<OwnedRefNoGIL> sub_arrays;
Expand Down
39 changes: 25 additions & 14 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -3113,8 +3113,9 @@ def test_array_from_numpy_str_utf8():

@pytest.mark.slow
@pytest.mark.large_memory
def test_numpy_binary_overflow_to_chunked():
# ARROW-3762, ARROW-5966
@pytest.mark.parametrize('large_types', [False, True])
def test_numpy_binary_overflow_to_chunked(large_types):
# ARROW-3762, ARROW-5966, GH-35289

# 2^31 + 1 bytes
values = [b'x']
Expand All @@ -3131,24 +3132,34 @@ def test_numpy_binary_overflow_to_chunked():
unicode_values += [unicode_unique_strings[i % 10]
for i in range(1 << 11)]

for case, ex_type in [(values, pa.binary()),
(unicode_values, pa.utf8())]:
binary_type = pa.large_binary() if large_types else pa.binary()
string_type = pa.large_utf8() if large_types else pa.utf8()
for case, ex_type in [(values, binary_type),
(unicode_values, string_type)]:
arr = np.array(case)
arrow_arr = pa.array(arr)
arrow_arr = pa.array(arr, ex_type)
arr = None

assert isinstance(arrow_arr, pa.ChunkedArray)
assert arrow_arr.type == ex_type
if large_types:
# Large types shouldn't be chunked
assert isinstance(arrow_arr, pa.Array)

for i in range(len(arrow_arr)):
val = arrow_arr[i]
assert val.as_py() == case[i]
else:
assert isinstance(arrow_arr, pa.ChunkedArray)

# Split up into 16MB chunks. 128 * 16 = 2048, so 129
assert arrow_arr.num_chunks == 129
# Split up into 16MB chunks. 128 * 16 = 2048, so 129
assert arrow_arr.num_chunks == 129

value_index = 0
for i in range(arrow_arr.num_chunks):
chunk = arrow_arr.chunk(i)
for val in chunk:
assert val.as_py() == case[value_index]
value_index += 1
value_index = 0
for i in range(arrow_arr.num_chunks):
chunk = arrow_arr.chunk(i)
for val in chunk:
assert val.as_py() == case[value_index]
value_index += 1


@pytest.mark.large_memory
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,10 @@ def test_type_to_pandas_dtype():
(pa.date64(), M8),
(pa.timestamp('ms'), M8),
(pa.binary(), np.object_),
(pa.large_binary(), np.object_),
(pa.binary(12), np.object_),
(pa.string(), np.object_),
(pa.large_string(), np.object_),
(pa.list_(pa.int8()), np.object_),
# (pa.list_(pa.int8(), 2), np.object_), # TODO needs pandas conversion
(pa.map_(pa.int64(), pa.float64()), np.object_),
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ cdef dict _pandas_type_map = {
'ns': np.dtype('timedelta64[ns]'),
},
_Type_BINARY: np.object_,
_Type_LARGE_BINARY: np.object_,
_Type_FIXED_SIZE_BINARY: np.object_,
_Type_STRING: np.object_,
_Type_LARGE_STRING: np.object_,
_Type_LIST: np.object_,
_Type_MAP: np.object_,
_Type_DECIMAL128: np.object_,
Expand Down
Loading