From 5dcab58ba04fc160ae69d93ca7a40a67c80ca667 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 21 Jan 2025 16:18:19 +0800 Subject: [PATCH 1/3] GH-3133: Fix SizeStatistics to handle omitted histogram (#3134) --- .../column/statistics/SizeStatistics.java | 6 ++++-- .../column/statistics/TestSizeStatistics.java | 16 ++++++++++++++++ .../converter/ParquetMetadataConverter.java | 10 ++++++++-- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java index 97a49be652..0dbb20e660 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java @@ -136,8 +136,10 @@ public SizeStatistics( List definitionLevelHistogram) { this.type = type; this.unencodedByteArrayDataBytes = unencodedByteArrayDataBytes; - this.repetitionLevelHistogram = repetitionLevelHistogram; - this.definitionLevelHistogram = definitionLevelHistogram; + this.repetitionLevelHistogram = + repetitionLevelHistogram == null ? Collections.emptyList() : repetitionLevelHistogram; + this.definitionLevelHistogram = + definitionLevelHistogram == null ? Collections.emptyList() : definitionLevelHistogram; } /** diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java index 6c166b0e7f..ba50745f40 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java @@ -124,4 +124,20 @@ public void testCopyStatistics() { Assert.assertEquals(Arrays.asList(1L, 1L, 1L), copy.getRepetitionLevelHistogram()); Assert.assertEquals(Arrays.asList(1L, 1L, 1L), copy.getDefinitionLevelHistogram()); } + + @Test + public void testOmittedHistogram() { + PrimitiveType type = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.stringType()) + .named("a"); + SizeStatistics statistics = new SizeStatistics(type, 1024L, null, null); + Assert.assertEquals(Optional.of(1024L), statistics.getUnencodedByteArrayDataBytes()); + Assert.assertEquals(Collections.emptyList(), statistics.getRepetitionLevelHistogram()); + Assert.assertEquals(Collections.emptyList(), statistics.getDefinitionLevelHistogram()); + + SizeStatistics copy = statistics.copy(); + Assert.assertEquals(Optional.of(1024L), copy.getUnencodedByteArrayDataBytes()); + Assert.assertEquals(Collections.emptyList(), copy.getRepetitionLevelHistogram()); + Assert.assertEquals(Collections.emptyList(), copy.getDefinitionLevelHistogram()); + } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index d1c6b01c93..e72f2c33a2 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -2382,8 +2382,14 @@ public static SizeStatistics toParquetSizeStatistics(org.apache.parquet.column.s formatStats.setUnencoded_byte_array_data_bytes( stats.getUnencodedByteArrayDataBytes().get()); } - formatStats.setRepetition_level_histogram(stats.getRepetitionLevelHistogram()); - formatStats.setDefinition_level_histogram(stats.getDefinitionLevelHistogram()); + List repLevelHistogram = stats.getRepetitionLevelHistogram(); + if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) { + formatStats.setRepetition_level_histogram(repLevelHistogram); + } + List defLevelHistogram = stats.getDefinitionLevelHistogram(); + if (defLevelHistogram != null && !defLevelHistogram.isEmpty()) { + formatStats.setDefinition_level_histogram(defLevelHistogram); + } return formatStats; } } From 9d9f0ca34bc3a0c868b0c8a56a2ed8fd1a7c2a9a Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 21 Jan 2025 16:30:24 +0800 Subject: [PATCH 2/3] fix import --- .../org/apache/parquet/column/statistics/TestSizeStatistics.java | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java index ba50745f40..6e2b68167d 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java @@ -19,6 +19,7 @@ package org.apache.parquet.column.statistics; import java.util.Arrays; +import java.util.Collections; import java.util.Optional; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.LogicalTypeAnnotation; From 2551de08ddee2098446400adfeb432699e46beb4 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 21 Jan 2025 17:10:03 +0800 Subject: [PATCH 3/3] fix parquet-plugins/ --- parquet-plugins/parquet-encoding-vector/pom.xml | 2 +- parquet-plugins/parquet-plugins-benchmarks/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-plugins/parquet-encoding-vector/pom.xml b/parquet-plugins/parquet-encoding-vector/pom.xml index 4b79efdeb6..390ac88007 100644 --- a/parquet-plugins/parquet-encoding-vector/pom.xml +++ b/parquet-plugins/parquet-encoding-vector/pom.xml @@ -22,7 +22,7 @@ org.apache.parquet parquet - 1.15.0-SNAPSHOT + 1.15.1-SNAPSHOT ../../pom.xml diff --git a/parquet-plugins/parquet-plugins-benchmarks/pom.xml b/parquet-plugins/parquet-plugins-benchmarks/pom.xml index 19e500bdc3..99f779c8fa 100644 --- a/parquet-plugins/parquet-plugins-benchmarks/pom.xml +++ b/parquet-plugins/parquet-plugins-benchmarks/pom.xml @@ -22,7 +22,7 @@ org.apache.parquet parquet - 1.15.0-SNAPSHOT + 1.15.1-SNAPSHOT ../../pom.xml