diff --git a/cpp/src/parquet/column/levels-test.cc b/cpp/src/parquet/column/levels-test.cc
index b15c0af9bf482..cb7cc3863cdf6 100644
--- a/cpp/src/parquet/column/levels-test.cc
+++ b/cpp/src/parquet/column/levels-test.cc
@@ -183,4 +183,27 @@ TEST(TestLevels, TestLevelsDecodeMultipleSetData) {
   }
 }
 
+TEST(TestLevelEncoder, MinimumBufferSize) {
+  // PARQUET-676, PARQUET-698
+  const int kNumToEncode = 1024;
+
+  std::vector<int16_t> levels;
+  for (int i = 0; i < kNumToEncode; ++i) {
+    if (i % 9 == 0) {
+      levels.push_back(0);
+    } else {
+      levels.push_back(1);
+    }
+  }
+
+  std::vector<uint8_t> output(
+      LevelEncoder::MaxBufferSize(Encoding::RLE, 1, kNumToEncode));
+
+  LevelEncoder encoder;
+  encoder.Init(Encoding::RLE, 1, kNumToEncode, output.data(), output.size());
+  int encode_count = encoder.Encode(kNumToEncode, levels.data());
+
+  ASSERT_EQ(kNumToEncode, encode_count);
+}
+
 }  // namespace parquet
diff --git a/cpp/src/parquet/file/reader.cc b/cpp/src/parquet/file/reader.cc
index b6de168c2583e..50db1c0f2f105 100644
--- a/cpp/src/parquet/file/reader.cc
+++ b/cpp/src/parquet/file/reader.cc
@@ -162,8 +162,7 @@ void ParquetFileReader::DebugPrint(
       const ColumnStatistics stats = column_chunk->statistics();
 
       const ColumnDescriptor* descr = file_metadata->schema_descriptor()->Column(i);
-      stream << "Column " << i << std::endl
-             << ", values: " << column_chunk->num_values();
+      stream << "Column " << i << std::endl << ", values: " << column_chunk->num_values();
       if (column_chunk->is_stats_set()) {
         stream << ", null values: " << stats.null_count
                << ", distinct values: " << stats.distinct_count << std::endl
@@ -174,17 +173,15 @@ void ParquetFileReader::DebugPrint(
         stream << " Statistics Not Set";
       }
       stream << std::endl
-             << " compression: "
-             << compression_to_string(column_chunk->compression())
+             << " compression: " << compression_to_string(column_chunk->compression())
              << ", encodings: ";
       for (auto encoding : column_chunk->encodings()) {
         stream << encoding_to_string(encoding) << " ";
       }
       stream << std::endl
-             << " uncompressed size: "
-             << column_chunk->total_uncompressed_size()
-             << ", compressed size: "
-             << column_chunk->total_compressed_size() << std::endl;
+             << " uncompressed size: " << column_chunk->total_uncompressed_size()
+             << ", compressed size: " << column_chunk->total_compressed_size()
+             << std::endl;
     }
 
     if (!print_values) { continue; }
diff --git a/cpp/src/parquet/util/rle-encoding.h b/cpp/src/parquet/util/rle-encoding.h
index b0fb8d1395e68..15fd5504ba17f 100644
--- a/cpp/src/parquet/util/rle-encoding.h
+++ b/cpp/src/parquet/util/rle-encoding.h
@@ -174,7 +174,14 @@ class RleEncoder {
     int bytes_per_run = BitUtil::Ceil(bit_width * MAX_VALUES_PER_LITERAL_RUN, 8.0);
     int num_runs = BitUtil::Ceil(num_values, MAX_VALUES_PER_LITERAL_RUN);
     int literal_max_size = num_runs + num_runs * bytes_per_run;
-    return std::max(MinBufferSize(bit_width), literal_max_size);
+
+    // In the very worst case scenario, the data is a concatenation of repeated
+    // runs of 8 values. Repeated run has a 1 byte varint followed by the
+    // bit-packed repeated value
+    int min_repeated_run_size = 1 + BitUtil::Ceil(bit_width, 8);
+    int repeated_max_size = BitUtil::Ceil(num_values, 8) * min_repeated_run_size;
+
+    return std::max(literal_max_size, repeated_max_size);
   }
 
   /// Encode value. Returns true if the value fits in buffer, false otherwise.