Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix[RowContainer]: Fix bug in RowContainer::extractValuesWithNulls #12301

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions velox/exec/RowContainer.h
Original file line number Diff line number Diff line change
Expand Up @@ -828,7 +828,7 @@ class RowContainer {
/// invalidated. Any row erase operations will invalidate column stats.
std::optional<RowColumn::Stats> columnStats(int32_t columnIndex) const;

uint32_t columnNullCount(int32_t columnIndex) {
uint32_t columnNullCount(int32_t columnIndex) const {
return rowColumnsStats_[columnIndex].nullCount();
}

Expand Down Expand Up @@ -1108,7 +1108,7 @@ class RowContainer {
auto maxRows = numRows + resultOffset;
VELOX_DCHECK_LE(maxRows, result->size());

BufferPtr& nullBuffer = result->mutableNulls(maxRows);
BufferPtr& nullBuffer = result->mutableNulls(maxRows, true);
auto nulls = nullBuffer->asMutable<uint64_t>();
BufferPtr valuesBuffer = result->mutableValues(maxRows);
[[maybe_unused]] auto values = valuesBuffer->asMutableRange<T>();
Expand Down
94 changes: 94 additions & 0 deletions velox/exec/tests/RowContainerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "velox/expression/VectorReaders.h"
#include "velox/type/tests/utils/CustomTypesForTesting.h"
#include "velox/vector/fuzzer/VectorFuzzer.h"
#include "velox/vector/tests/utils/VectorMaker.h"

using namespace facebook::velox;
using namespace facebook::velox::exec;
Expand Down Expand Up @@ -934,6 +935,99 @@ static int32_t sign(int32_t n) {
}
} // namespace

TEST_F(RowContainerTest, extractWithNullsAndTargetOffset) {
constexpr int32_t kNumRows = 100;
// The second column must have no nulls in the first batch.
auto rowVector1 = makeRowVector({
makeFlatVector<bool>(
kNumRows, [](auto row) { return row % 11 + 1; }, nullEvery(17)),
makeFlatVector<StringView>(
kNumRows, [](auto /* row */) { return StringView("abcd"); }),
makeFlatVector<int8_t>(
kNumRows, [](auto /* row */) { return 4; }, nullEvery(1)),
});
// The second column must have at least one null in the second batch.
auto rowVector2 = makeRowVector({
makeFlatVector<bool>(
kNumRows, [](auto row) { return row % 11 + 1; }, nullEvery(23)),
makeFlatVector<StringView>(
kNumRows,
[](auto /* row */) { return StringView(""); },
nullEvery(3)),
makeFlatVector<int8_t>(
kNumRows, [](auto /* row */) { return 5; }, nullEvery(5)),
});

// Create and fill up two row containers from two row vectors.
std::vector<TypePtr> vecTypes = {BOOLEAN(), VARCHAR(), TINYINT()};
RowTypePtr rowType = VectorMaker::rowType({BOOLEAN(), VARCHAR(), TINYINT()});
auto data1 = makeRowContainer({}, vecTypes);
auto data2 = makeRowContainer({}, vecTypes);
for (auto i = 0; i < kNumRows; i++) {
data1->newRow();
data2->newRow();
}

std::vector<char*> rows1(kNumRows);
RowContainerIterator iter1;
EXPECT_EQ(data1->listRows(&iter1, kNumRows, rows1.data()), kNumRows);
SelectivityVector allRows1(kNumRows);
for (int i = 0; i < rowVector1->childrenSize(); i++) {
DecodedVector decoded(*rowVector1->childAt(i), allRows1);
for (auto j = 0; j < kNumRows; ++j) {
data1->store(decoded, j, rows1[j], i);
}
}

std::vector<char*> rows2(kNumRows);
RowContainerIterator iter2;
EXPECT_EQ(data2->listRows(&iter2, kNumRows, rows2.data()), kNumRows);
SelectivityVector allRows2(kNumRows);
for (int i = 0; i < rowVector2->childrenSize(); i++) {
DecodedVector decoded(*rowVector2->childAt(i), allRows2);
for (auto j = 0; j < kNumRows; ++j) {
data2->store(decoded, j, rows2[j], i);
}
}

// Now create the result row vector and extract two row containers into it.
auto result =
BaseVector::create<RowVector>(rowType, kNumRows * 2, pool_.get());

for (int32_t col = 0; col < 3; col++) {
RowContainer::extractColumn(
rows1.data(),
kNumRows,
data1->columnAt(col),
data1->columnHasNulls(col),
0,
result->childAt(col));
}
for (int32_t col = 0; col < 3; col++) {
RowContainer::extractColumn(
rows2.data(),
kNumRows,
data2->columnAt(col),
data2->columnHasNulls(col),
kNumRows,
result->childAt(col));
}

// Check we have all nulls correct.
ASSERT_EQ(result->size(), kNumRows * 2);
for (int32_t col = 0; col < 3; col++) {
auto resultColVector = result->childAt(col);
auto batch1ColVector = rowVector1->childAt(col);
auto batch2ColVector = rowVector2->childAt(col);
for (int32_t row = 0; row < kNumRows; row++) {
ASSERT_EQ(resultColVector->isNullAt(row), batch1ColVector->isNullAt(row));
ASSERT_EQ(
resultColVector->isNullAt(row + kNumRows),
batch2ColVector->isNullAt(row));
}
}
}

TEST_F(RowContainerTest, storeExtractArrayOfVarchar) {
// Make a string vector with two rows each having 2 elements.
// Here it is important, that one of the 1st row's elements has more than 12
Expand Down
8 changes: 6 additions & 2 deletions velox/vector/BaseVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,12 @@ class BaseVector {
return const_cast<uint64_t*>(rawNulls_);
}

BufferPtr& mutableNulls(vector_size_t size) {
ensureNullsCapacity(size);
/// Ensures the vector has capacity for the nulls and returns the shared
/// pointer to the buffer containing them.
/// Optional parameter 'setNotNull' is passed to ensureNullsCapacity() and is
/// used to ensure all the rows will be 'not nulls' if set to true.
BufferPtr& mutableNulls(vector_size_t size, bool setNotNull = false) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's oddly inconsistent that mutableRawNulls calls ensureNulls() which always sets "setNotNull" to true and mutableNulls (prior to this change) calls ensureNullsCapacity directly which defaults "setNotNull" to false.

This is probably the safest approach to addressing it, given how core this is to Vectors, but it seems like it was an accident waiting to happen.

Copy link
Contributor Author

@spershin spershin Feb 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, in this PR I strove to make as small change as possible to avoid breaking something else or introduce an overhead where we don't want.

Our bug occurs in the situation when we preallocate a vector and then can initialize its rows with multiple batches.
I don't know how common this case is compared to "allocate and fully initialize vector" where we probably don't want to fill the buffer upfront.

Most optimizations introduce a risk, it is a trade off.

If we want to streamline this oddness you mention, we should probably do it in a separate change.

ensureNullsCapacity(size, setNotNull);
return nulls_;
}

Expand Down
Loading