Skip to content

Commit

Permalink
apacheGH-44042: [C++][Parquet] Limit num-of row-groups when building …
Browse files Browse the repository at this point in the history
…parquet for encrypted file (# 44043)

### Rationale for this change

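Limit the number of row groups when building an encrypted Parquet file: the row group ordinal used for encryption is stored as an int16, so it cannot exceed 32767.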

### What changes are included in this PR?

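Limit the number of row groups when building an encrypted Parquet file. The writer now throws a `ParquetException` instead of silently truncating the row group ordinal to int16, the reader checks both the row group ordinal and the column ordinal against the limit before constructing the `CryptoContext`, and the ordinal is only written to the row group metadata when it is set (non-negative).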

### Are these changes tested?

No

### Are there any user-facing changes?

No

* GitHub Issue: apache#44042

Lead-authored-by: mwish <maplewish117@gmail.com>
Co-authored-by: mwish <1506118561@qq.com>
Co-authored-by: Antoine Pitrou <pitrou@free.fr>
Signed-off-by: mwish <maplewish117@gmail.com>
  • Loading branch information
mapleFU and pitrou authored Jan 24, 2025
1 parent 17a0ff5 commit f4a63d4
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 18 deletions.
12 changes: 8 additions & 4 deletions cpp/src/parquet/file_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -266,13 +266,17 @@ class SerializedRowGroup : public RowGroupReader::Contents {
ARROW_DCHECK_NE(meta_decryptor, nullptr);
ARROW_DCHECK_NE(data_decryptor, nullptr);

constexpr auto kEncryptedRowGroupsLimit = 32767;
if (i > kEncryptedRowGroupsLimit) {
constexpr auto kEncryptedOrdinalLimit = 32767;
if (ARROW_PREDICT_FALSE(row_group_ordinal_ > kEncryptedOrdinalLimit)) {
throw ParquetException("Encrypted files cannot contain more than 32767 row groups");
}
if (ARROW_PREDICT_FALSE(i > kEncryptedOrdinalLimit)) {
throw ParquetException("Encrypted files cannot contain more than 32767 columns");
}

CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
static_cast<int16_t>(i), meta_decryptor, data_decryptor);
CryptoContext ctx(col->has_dictionary_page(),
static_cast<int16_t>(row_group_ordinal_), static_cast<int16_t>(i),
meta_decryptor, data_decryptor);
return PageReader::Open(stream, col->num_values(), col->compression(), properties_,
always_compressed, &ctx);
}
Expand Down
15 changes: 13 additions & 2 deletions cpp/src/parquet/file_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -359,14 +359,25 @@ class FileSerializer : public ParquetFileWriter::Contents {
if (row_group_writer_) {
row_group_writer_->Close();
}
int16_t row_group_ordinal = -1; // row group ordinal not set
if (file_encryptor_ != nullptr) {
// The Parquet Thrift definition uses int16 for the row group ordinal, so an
// encrypted file cannot have more than 32767 row groups.
if (num_row_groups_ <= std::numeric_limits<int16_t>::max()) {
row_group_ordinal = static_cast<int16_t>(num_row_groups_);
} else {
throw ParquetException(
"Cannot write more than 32767 row groups in an encrypted file");
}
}
num_row_groups_++;
auto rg_metadata = metadata_->AppendRowGroup();
if (page_index_builder_) {
page_index_builder_->AppendRowGroup();
}
std::unique_ptr<RowGroupWriter::Contents> contents(new RowGroupSerializer(
sink_, rg_metadata, static_cast<int16_t>(num_row_groups_ - 1), properties_.get(),
buffered_row_group, file_encryptor_.get(), page_index_builder_.get()));
sink_, rg_metadata, row_group_ordinal, properties_.get(), buffered_row_group,
file_encryptor_.get(), page_index_builder_.get()));
row_group_writer_ = std::make_unique<RowGroupWriter>(std::move(contents));
return row_group_writer_.get();
}
Expand Down
27 changes: 15 additions & 12 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -219,30 +219,29 @@ const std::string& ColumnCryptoMetaData::key_metadata() const {
// ColumnChunk metadata
class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
public:
explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column,
const ColumnDescriptor* descr,
int16_t row_group_ordinal, int16_t column_ordinal,
const ReaderProperties& properties,
const ApplicationVersion* writer_version,
std::shared_ptr<InternalFileDecryptor> file_decryptor)
explicit ColumnChunkMetaDataImpl(
const format::ColumnChunk* column, const ColumnDescriptor* descr,
int16_t row_group_ordinal, int16_t column_ordinal,
const ReaderProperties& properties, const ApplicationVersion* writer_version,
const std::shared_ptr<InternalFileDecryptor>& file_decryptor)
: column_(column),
descr_(descr),
properties_(properties),
writer_version_(writer_version) {
column_metadata_ = &column->meta_data;
if (column->__isset.crypto_metadata) { // column metadata is encrypted
format::ColumnCryptoMetaData ccmd = column->crypto_metadata;
const format::ColumnCryptoMetaData& ccmd = column->crypto_metadata;

if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) {
if (file_decryptor != nullptr && file_decryptor->properties() != nullptr) {
// should decrypt metadata
std::shared_ptr<schema::ColumnPath> path = std::make_shared<schema::ColumnPath>(
ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
const std::string& key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata;

std::string aad_column_metadata = encryption::CreateModuleAad(
file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal,
column_ordinal, static_cast<int16_t>(-1));
column_ordinal, /*page_ordinal=*/static_cast<int16_t>(-1));
auto decryptor = file_decryptor->GetColumnMetaDecryptor(
path->ToDotString(), key_metadata, aad_column_metadata);
auto len = static_cast<uint32_t>(column->encrypted_column_metadata.size());
Expand Down Expand Up @@ -565,9 +564,11 @@ class RowGroupMetaData::RowGroupMetaDataImpl {

std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) {
if (i >= 0 && i < num_columns()) {
int16_t row_group_ordinal =
row_group_->__isset.ordinal ? row_group_->ordinal : static_cast<int16_t>(-1);
return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i),
properties_, writer_version_, row_group_->ordinal,
i, file_decryptor_);
properties_, writer_version_, row_group_ordinal, i,
file_decryptor_);
}
throw ParquetException("The file only has ", num_columns(),
" columns, requested metadata for column: ", i);
Expand Down Expand Up @@ -1854,7 +1855,9 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
row_group_->__set_file_offset(file_offset);
row_group_->__set_total_compressed_size(total_compressed_size);
row_group_->__set_total_byte_size(total_bytes_written);
row_group_->__set_ordinal(row_group_ordinal);
if (row_group_ordinal >= 0) {
row_group_->__set_ordinal(row_group_ordinal);
}
}

void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; }
Expand Down

0 comments on commit f4a63d4

Please sign in to comment.