From e4f31462dbd668c3bcb6ce96442f3c1632c4d8c8 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 30 Apr 2024 06:38:40 +0200 Subject: [PATCH] GH-41317: [C++] Fix crash on invalid Parquet file (#41366) ### Rationale for this change Fixes the crash detailed in #41317 in TableBatchReader::ReadNext() on a corrupted Parquet file ### What changes are included in this PR? Add a validation that all read columns have the same size ### Are these changes tested? I've tested on the reproducer I provided in #41317 that it now triggers a clean error: ``` Traceback (most recent call last): File "test.py", line 3, in [_ for _ in parquet_file.iter_batches()] File "test.py", line 3, in [_ for _ in parquet_file.iter_batches()] File "pyarrow/_parquet.pyx", line 1587, in iter_batches File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status pyarrow.lib.ArrowInvalid: columns do not have the same size ``` I'm not sure if/how unit tests for corrupted datasets should be added ### Are there any user-facing changes? No **This PR contains a "Critical Fix".** * GitHub Issue: #41317 Authored-by: Even Rouault Signed-off-by: mwish --- cpp/src/arrow/table.cc | 2 ++ cpp/src/arrow/table.h | 2 ++ cpp/src/parquet/arrow/reader.cc | 10 ++++++++++ 3 files changed, 14 insertions(+) diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 967e78f6b4db1..5dc5e4c1a9a8c 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -619,6 +619,7 @@ TableBatchReader::TableBatchReader(const Table& table) for (int i = 0; i < table.num_columns(); ++i) { column_data_[i] = table.column(i).get(); } + DCHECK(table_.Validate().ok()); } TableBatchReader::TableBatchReader(std::shared_ptr<Table> table) @@ -632,6 +633,7 @@ TableBatchReader::TableBatchReader(std::shared_ptr
<Table> table) for (int i = 0; i < owned_table_->num_columns(); ++i) { column_data_[i] = owned_table_->column(i).get(); } + DCHECK(table_.Validate().ok()); } std::shared_ptr<Schema> TableBatchReader::schema() const { return table_.schema(); } diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index a7508430c132b..79675fa92b1f3 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -241,6 +241,8 @@ class ARROW_EXPORT Table { /// /// The conversion is zero-copy: each record batch is a view over a slice /// of the table's columns. +/// +/// The table is expected to be valid prior to using it with the batch reader. class ARROW_EXPORT TableBatchReader : public RecordBatchReader { public: /// \brief Construct a TableBatchReader for the given table diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index d6ad7c25bc7c1..285e2a597389d 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1043,6 +1043,16 @@ Status FileReaderImpl::GetRecordBatchReader(const std::vector<int>& row_groups, } } + // Check all columns has same row-size + if (!columns.empty()) { int64_t row_size = columns[0]->length(); for (size_t i = 1; i < columns.size(); ++i) { if (columns[i]->length() != row_size) { return ::arrow::Status::Invalid("columns do not have the same size"); } } } + auto table = ::arrow::Table::Make(batch_schema, std::move(columns)); auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table);