From c4afd0b7e53899ded94fa1fbfe06c9e4c6e2a5c7 Mon Sep 17 00:00:00 2001 From: Zac Blanco Date: Thu, 28 Mar 2024 21:27:31 -0700 Subject: [PATCH] [Iceberg] Add histogram statistic support Utilizes the sketch_kll function to generate histograms and store them into the Iceberg table's puffin files for table-level statistic storage. Histograms are always collected by ANALYZE, but they are not used by the cost calculator unless enabled via optimizer.use-histograms --- .../com/facebook/presto/common/Utils.java | 70 +++++- .../presto/common/predicate/Marker.java | 5 + .../presto/common/predicate/Range.java | 10 + .../common/predicate/SortedRangeSet.java | 22 ++ .../com/facebook/presto/common/TestUtils.java | 148 ++++++++++++ .../presto/common/predicate/TestMarker.java | 10 + .../common/predicate/TestSortedRangeSet.java | 62 +++++ presto-iceberg/pom.xml | 9 +- .../iceberg/IcebergAbstractMetadata.java | 4 +- .../presto/iceberg/IcebergConfig.java | 17 ++ .../iceberg/IcebergSessionProperties.java | 10 + .../presto/iceberg/TableStatisticsMaker.java | 115 +++++++-- .../iceberg/statistics/KllHistogram.java | 210 +++++++++++++++++ .../iceberg/IcebergDistributedTestBase.java | 220 ++++++++++++++++++ .../presto/iceberg/TestIcebergConfig.java | 7 +- .../iceberg/statistics/TestKllHistogram.java | 166 +++++++++++++ .../cost/ComparisonStatsCalculator.java | 7 +- ...ConnectorFilterStatsCalculatorService.java | 5 + .../facebook/presto/cost/JoinStatsRule.java | 6 +- .../cost/PlanNodeStatsEstimateMath.java | 9 +- .../facebook/presto/cost/StatisticRange.java | 28 +-- .../sql/planner/BasePlanFragmenter.java | 3 +- .../presto/sql/planner/LogicalPlanner.java | 3 +- .../planner/StatisticsAggregationPlanner.java | 5 +- .../presto/sql/rewrite/ShowStatsRewrite.java | 5 + .../presto/cost/TestHistogramCalculator.java | 100 -------- .../cost/TestPlanNodeStatsEstimateMath.java | 18 +- .../cost/TestVariableStatsEstimate.java | 4 +- ...ApproximateStatsOutputRowCountMatcher.java | 54 +++++ .../planner/assertions/PlanMatchPattern.java | 6 + presto-spi/pom.xml | 6 + .../spi/statistics/ColumnStatistics.java | 7 + .../DisjointRangeDomainHistogram.java | 139 +++++------ .../spi/statistics}/HistogramCalculator.java | 65 +++--- .../UniformDistributionHistogram.java | 20 +- .../TestDisjointRangeDomainHistogram.java | 84 ++++--- .../presto/spi/statistics}/TestHistogram.java | 3 +- .../statistics/TestHistogramCalculator.java | 101 ++++++++ .../spi/statistics}/TestUniformHistogram.java | 7 +- 39 files changed, 1441 insertions(+), 329 deletions(-) create mode 100644 presto-common/src/test/java/com/facebook/presto/common/TestUtils.java create mode 100644 presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/KllHistogram.java create mode 100644 presto-iceberg/src/test/java/com/facebook/presto/iceberg/statistics/TestKllHistogram.java delete mode 100644 presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java create mode 100644 presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/ApproximateStatsOutputRowCountMatcher.java rename {presto-main/src/main/java/com/facebook/presto/cost => presto-spi/src/main/java/com/facebook/presto/spi/statistics}/DisjointRangeDomainHistogram.java (74%) rename {presto-main/src/main/java/com/facebook/presto/cost => presto-spi/src/main/java/com/facebook/presto/spi/statistics}/HistogramCalculator.java (68%) rename {presto-main/src/main/java/com/facebook/presto/cost => presto-spi/src/main/java/com/facebook/presto/spi/statistics}/UniformDistributionHistogram.java (85%) rename {presto-main/src/test/java/com/facebook/presto/cost => presto-spi/src/test/java/com/facebook/presto/spi/statistics}/TestDisjointRangeDomainHistogram.java (81%) rename {presto-main/src/test/java/com/facebook/presto/cost => presto-spi/src/test/java/com/facebook/presto/spi/statistics}/TestHistogram.java (97%) create mode 100644 presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogramCalculator.java rename {presto-main/src/test/java/com/facebook/presto/cost => presto-spi/src/test/java/com/facebook/presto/spi/statistics}/TestUniformHistogram.java (93%) diff --git a/presto-common/src/main/java/com/facebook/presto/common/Utils.java b/presto-common/src/main/java/com/facebook/presto/common/Utils.java index a2b88be85bfca..dc31b1778252f 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/Utils.java +++ b/presto-common/src/main/java/com/facebook/presto/common/Utils.java @@ -18,8 +18,14 @@ import com.facebook.presto.common.predicate.Primitives; import com.facebook.presto.common.type.Type; +import javax.annotation.Nullable; + +import java.util.function.Supplier; + import static com.facebook.presto.common.type.TypeUtils.readNativeValue; import static com.facebook.presto.common.type.TypeUtils.writeNativeValue; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; public final class Utils { @@ -30,7 +36,7 @@ private Utils() public static Block nativeValueToBlock(Type type, Object object) { if (object != null && !Primitives.wrap(type.getJavaType()).isInstance(object)) { - throw new IllegalArgumentException(String.format("Object '%s' does not match type %s", object, type.getJavaType())); + throw new IllegalArgumentException(format("Object '%s' does not match type %s", object, type.getJavaType())); } BlockBuilder blockBuilder = type.createBlockBuilder(null, 1); writeNativeValue(type, blockBuilder, object); @@ -49,10 +55,68 @@ public static void checkArgument(boolean expression) } } - public static void checkArgument(boolean expression, String errorMessage) + public static void checkArgument(boolean expression, String message, Object... args) { if (!expression) { - throw new IllegalArgumentException(errorMessage); + throw new IllegalArgumentException(format(message, args)); + } + } + + /** + * Returns a supplier which caches the instance retrieved during the first call to {@code get()} + * and returns that value on subsequent calls to {@code get()}. + */ + public static Supplier memoizedSupplier(Supplier delegate) + { + if (delegate instanceof MemoizingSupplier) { + return delegate; + } + return new MemoizingSupplier<>(delegate); + } + + /** + * Vendored from Guava + */ + static class MemoizingSupplier + implements Supplier + { + volatile Supplier delegate; + volatile boolean initialized; + // "value" does not need to be volatile; visibility piggy-backs + // on volatile read of "initialized". + @Nullable T value; + + MemoizingSupplier(Supplier delegate) + { + this.delegate = requireNonNull(delegate); + } + + @Override + public T get() + { + // A 2-field variant of Double Checked Locking. + if (!initialized) { + synchronized (this) { + if (!initialized) { + T t = delegate.get(); + value = t; + initialized = true; + // Release the delegate to GC. + delegate = null; + return t; + } + } + } + return value; + } + + @Override + public String toString() + { + Supplier delegate = this.delegate; + return "Suppliers.memoize(" + + (delegate == null ? "" : delegate) + + ")"; } } diff --git a/presto-common/src/main/java/com/facebook/presto/common/predicate/Marker.java b/presto-common/src/main/java/com/facebook/presto/common/predicate/Marker.java index f20a87065bcfa..76a58d147a281 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/predicate/Marker.java +++ b/presto-common/src/main/java/com/facebook/presto/common/predicate/Marker.java @@ -129,6 +129,11 @@ public Object getValue() return Utils.blockToNativeValue(type, valueBlock.get()); } + public Optional getObjectValue() + { + return valueBlock.map(block -> Utils.blockToNativeValue(type, block)); + } + public Object getPrintableValue(SqlFunctionProperties properties) { if (!valueBlock.isPresent()) { diff --git a/presto-common/src/main/java/com/facebook/presto/common/predicate/Range.java b/presto-common/src/main/java/com/facebook/presto/common/predicate/Range.java index d00a3c77df0e2..501996cc6aa82 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/predicate/Range.java +++ b/presto-common/src/main/java/com/facebook/presto/common/predicate/Range.java @@ -247,6 +247,16 @@ public boolean equals(Object obj) Objects.equals(this.high, other.high); } + @Override + public String toString() + { + return (low.getBound() == Marker.Bound.EXACTLY ? "[" : "(") + + low.getObjectValue().orElse(Double.NEGATIVE_INFINITY) + + ".." + + high.getObjectValue().orElse(Double.POSITIVE_INFINITY) + + (high.getBound() == Marker.Bound.EXACTLY ? "]" : ")"); + } + private void appendQuotedValue(StringBuilder buffer, Marker marker, SqlFunctionProperties properties) { buffer.append('"'); diff --git a/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java b/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java index 5f1988be005d1..4af54a8e2a685 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java +++ b/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java @@ -168,6 +168,28 @@ public Object getSingleValue() return lowIndexedRanges.values().iterator().next().getSingleValue(); } + /** + * Build a new {@link SortedRangeSet} that contains ranges which lie within the argument range + * + * @param span the range which the new set should span + * @return a new range set + */ + public SortedRangeSet subRangeSet(Range span) + { + Builder builder = new Builder(type); + + for (Range range : getOrderedRanges()) { + if (span.contains(range)) { + builder.add(range); + } + else if (span.overlaps(range)) { + builder.add(range.intersect(span)); + } + } + + return builder.build(); + } + @Override public boolean containsValue(Object value) { diff --git a/presto-common/src/test/java/com/facebook/presto/common/TestUtils.java b/presto-common/src/test/java/com/facebook/presto/common/TestUtils.java new file mode 100644 index 0000000000000..646a44fc919ce --- /dev/null +++ b/presto-common/src/test/java/com/facebook/presto/common/TestUtils.java @@ -0,0 +1,148 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.common; + +import org.testng.annotations.Test; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Function; +import java.util.function.Supplier; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertSame; +import static org.testng.Assert.fail; + +public class TestUtils +{ + @Test + public void testCheckArgumentFailWithMessage() + { + try { + Utils.checkArgument(false, "test %s", "test"); + fail(); + } + catch (IllegalArgumentException e) { + assertEquals(e.getMessage(), "test test"); + } + } + + @Test + public void testCheckArgumentPassWithMessage() + { + try { + Utils.checkArgument(true, "test %s", "test"); + } + catch (IllegalArgumentException e) { + fail(); + } + } + + @Test + public void testMemoizedSupplierThreadSafe() + throws Throwable + { + Function, Supplier> memoizer = + supplier -> Utils.memoizedSupplier(supplier); + testSupplierThreadSafe(memoizer); + } + + /** + * Vendored from Guava + */ + private void testSupplierThreadSafe(Function, Supplier> memoizer) + throws Throwable + { + final AtomicInteger count = new AtomicInteger(0); + final AtomicReference thrown = new AtomicReference<>(null); + final int numThreads = 3; + final Thread[] threads = new Thread[numThreads]; + final long timeout = TimeUnit.SECONDS.toNanos(60); + + final Supplier supplier = + new Supplier() + { + boolean isWaiting(Thread thread) + { + switch (thread.getState()) { + case BLOCKED: + case WAITING: + case TIMED_WAITING: + return true; + default: + return false; + } + } + + int waitingThreads() + { + int waitingThreads = 0; + for (Thread thread : threads) { + if (isWaiting(thread)) { + waitingThreads++; + } + } + return waitingThreads; + } + + @Override + @SuppressWarnings("ThreadPriorityCheck") // doing our best to test for races + public Boolean get() + { + // Check that this method is called exactly once, by the first + // thread to synchronize. + long t0 = System.nanoTime(); + while (waitingThreads() != numThreads - 1) { + if (System.nanoTime() - t0 > timeout) { + thrown.set( + new TimeoutException( + "timed out waiting for other threads to block" + + " synchronizing on supplier")); + break; + } + Thread.yield(); + } + count.getAndIncrement(); + return Boolean.TRUE; + } + }; + + final Supplier memoizedSupplier = memoizer.apply(supplier); + + for (int i = 0; i < numThreads; i++) { + threads[i] = + new Thread() + { + @Override + public void run() + { + assertSame(Boolean.TRUE, memoizedSupplier.get()); + } + }; + } + for (Thread t : threads) { + t.start(); + } + for (Thread t : threads) { + t.join(); + } + + if (thrown.get() != null) { + throw thrown.get(); + } + assertEquals(1, count.get()); + } +} diff --git a/presto-common/src/test/java/com/facebook/presto/common/predicate/TestMarker.java b/presto-common/src/test/java/com/facebook/presto/common/predicate/TestMarker.java index 362fc8f29c976..1eb4085e7222a 100644 --- a/presto-common/src/test/java/com/facebook/presto/common/predicate/TestMarker.java +++ b/presto-common/src/test/java/com/facebook/presto/common/predicate/TestMarker.java @@ -191,6 +191,16 @@ public void testCanonicalize() assertDifferentMarker(Marker.upperUnbounded(BIGINT), Marker.lowerUnbounded(BIGINT), true); } + @Test + public void testGetValue() + { + assertTrue(Marker.exactly(BIGINT, 1L).getObjectValue().isPresent()); + assertTrue(Marker.above(BIGINT, 1L).getObjectValue().isPresent()); + assertTrue(Marker.below(BIGINT, 1L).getObjectValue().isPresent()); + assertFalse(Marker.upperUnbounded(BIGINT).getObjectValue().isPresent()); + assertFalse(Marker.lowerUnbounded(BIGINT).getObjectValue().isPresent()); + } + private void assertSameMarker(Marker marker1, Marker marker2, boolean removeConstants) throws Exception { diff --git a/presto-common/src/test/java/com/facebook/presto/common/predicate/TestSortedRangeSet.java b/presto-common/src/test/java/com/facebook/presto/common/predicate/TestSortedRangeSet.java index f754681359a5a..087073c432bd5 100644 --- a/presto-common/src/test/java/com/facebook/presto/common/predicate/TestSortedRangeSet.java +++ b/presto-common/src/test/java/com/facebook/presto/common/predicate/TestSortedRangeSet.java @@ -26,6 +26,9 @@ import com.google.common.collect.Iterables; import org.testng.annotations.Test; +import java.util.Arrays; +import java.util.stream.Collectors; + import static com.facebook.presto.common.type.BigintType.BIGINT; import static com.facebook.presto.common.type.BooleanType.BOOLEAN; import static com.facebook.presto.common.type.DoubleType.DOUBLE; @@ -500,6 +503,65 @@ public void testCanonicalize() assertDifferentSet(SortedRangeSet.all(BIGINT), SortedRangeSet.all(BOOLEAN), true); } + @Test + public void testSubRangeSet() + { + // test subrange no overlap below and above + assertEquals(SortedRangeSet.of(Range.lessThan(BIGINT, 10L)) + .subRangeSet(Range.greaterThan(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 0); + assertEquals(SortedRangeSet.of(Range.greaterThan(BIGINT, 10L)) + .subRangeSet(Range.lessThan(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 0); + assertEquals(SortedRangeSet.of(Range.greaterThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.lessThan(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 0); + assertEquals(SortedRangeSet.of(Range.lessThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.greaterThan(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 0); + + // test with equal bounds + assertEquals(SortedRangeSet.of(Range.lessThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.greaterThanOrEqual(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 1); + assertEquals(SortedRangeSet.of(Range.greaterThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.lessThanOrEqual(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 1); + assertEquals(SortedRangeSet.of(Range.lessThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.greaterThanOrEqual(BIGINT, 10L)) + .getOrderedRanges().get(0), Range.range(BIGINT, 10L, true, 10L, true)); + // two ranges + assertEquals(SortedRangeSet.of(Range.lessThan(BIGINT, -10L), Range.greaterThan(BIGINT, 10L)) + .subRangeSet(Range.range(BIGINT, -20L, true, 20L, true)).getOrderedRanges(), + Arrays.stream(new Range[] { + Range.range(BIGINT, -20L, true, -10L, false), + Range.range(BIGINT, 10L, false, 20L, true)}) + .collect(Collectors.toList())); + // range entirely contained + assertEquals(SortedRangeSet.of( + Range.lessThan(BIGINT, -10L), + Range.greaterThan(BIGINT, 10L), + Range.range(BIGINT, -5L, true, 5L, true)) + .subRangeSet(Range.range(BIGINT, -20L, true, 20L, true)).getOrderedRanges(), + Arrays.stream(new Range[] { + Range.range(BIGINT, -20L, true, -10L, false), + Range.range(BIGINT, -5L, true, 5L, true), + Range.range(BIGINT, 10L, false, 20L, true)}) + .collect(Collectors.toList())); + } + private void assertSameSet(SortedRangeSet set1, SortedRangeSet set2, boolean removeSafeConstants) throws Exception { diff --git a/presto-iceberg/pom.xml b/presto-iceberg/pom.xml index 7372a8956f93f..36cbe08db3b03 100644 --- a/presto-iceberg/pom.xml +++ b/presto-iceberg/pom.xml @@ -510,19 +510,16 @@ presto-cache compile - com.facebook.presto presto-main test - com.facebook.presto presto-parser test - com.facebook.presto presto-analyzer @@ -604,7 +601,7 @@ org.apache.iceberg iceberg-core - 1.5.0 + ${dep.iceberg.version} tests test @@ -634,6 +631,10 @@ + + org.apache.commons + commons-math3 + diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java index c98e726b09493..842bf00f8f199 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java @@ -657,7 +657,7 @@ public TableStatisticsMetadata getStatisticsCollectionMetadata(ConnectorSession MetricsConfig metricsConfig = MetricsConfig.forTable(table); Set columnStatistics = tableMetadata.getColumns().stream() .filter(column -> !column.isHidden() && metricsConfig.columnMode(column.getName()) != None.get()) - .flatMap(meta -> getSupportedColumnStatistics(meta.getName(), meta.getType()).stream()) + .flatMap(meta -> getSupportedColumnStatistics(session, meta.getName(), meta.getType()).stream()) .collect(toImmutableSet()); Set tableStatistics = ImmutableSet.of(ROW_COUNT); @@ -675,7 +675,7 @@ public void finishStatisticsCollection(ConnectorSession session, ConnectorTableH { IcebergTableHandle icebergTableHandle = (IcebergTableHandle) tableHandle; Table icebergTable = getIcebergTable(session, icebergTableHandle.getSchemaTableName()); - TableStatisticsMaker.writeTableStatistics(nodeVersion, icebergTableHandle, icebergTable, session, computedStatistics); + TableStatisticsMaker.writeTableStatistics(nodeVersion, typeManager, icebergTableHandle, icebergTable, session, computedStatistics); } public void rollback() diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java index fe11f4b2fefbe..42328ddb4e7f4 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java @@ -24,6 +24,7 @@ import javax.validation.constraints.DecimalMax; import javax.validation.constraints.DecimalMin; +import javax.validation.constraints.Max; import javax.validation.constraints.Min; import javax.validation.constraints.NotNull; @@ -62,6 +63,7 @@ public class IcebergConfig private int metadataPreviousVersionsMax = METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT; private boolean metadataDeleteAfterCommit = METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT; private int metricsMaxInferredColumn = METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT; + private int statisticsKllSketchKParameter = 1024; private EnumSet hiveStatisticsMergeFlags = EnumSet.noneOf(ColumnStatisticType.class); private String fileIOImpl = HadoopFileIO.class.getName(); @@ -412,4 +414,19 @@ public IcebergConfig setMaxStatisticsFileCacheSize(DataSize maxStatisticsFileCac this.maxStatisticsFileCacheSize = maxStatisticsFileCacheSize; return this; } + + public int getStatisticsKllSketchKParameter() + { + return this.statisticsKllSketchKParameter; + } + + @Config("iceberg.statistics-kll-sketch-k-parameter") + @Min(8) + @Max(65535) + @ConfigDescription("K parameter for KLL sketches when generating histogram statistics") + public IcebergConfig setStatisticsKllSketchKParameter(int kllSketchKParameter) + { + this.statisticsKllSketchKParameter = kllSketchKParameter; + return this; + } } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java index 5a597d97051b4..57f954801f2f7 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java @@ -64,6 +64,7 @@ public final class IcebergSessionProperties public static final String HIVE_METASTORE_STATISTICS_MERGE_STRATEGY = "hive_statistics_merge_strategy"; public static final String STATISTIC_SNAPSHOT_RECORD_DIFFERENCE_WEIGHT = "statistic_snapshot_record_difference_weight"; public static final String ROWS_FOR_METADATA_OPTIMIZATION_THRESHOLD = "rows_for_metadata_optimization_threshold"; + public static final String STATISTICS_KLL_SKETCH_K_PARAMETER = "statistics_kll_sketch_k_parameter"; private final List> sessionProperties; @@ -184,6 +185,10 @@ public IcebergSessionProperties( "of an Iceberg table exceeds this threshold, metadata optimization would be skipped for " + "the table. A value of 0 means skip metadata optimization directly.", icebergConfig.getRowsForMetadataOptimizationThreshold(), + false)) + .add(integerProperty(STATISTICS_KLL_SKETCH_K_PARAMETER, + "The K parameter for the Apache DataSketches KLL sketch when computing histogram statistics", + icebergConfig.getStatisticsKllSketchKParameter(), false)); nessieConfig.ifPresent((config) -> propertiesBuilder @@ -313,4 +318,9 @@ public static String getNessieReferenceHash(ConnectorSession session) { return session.getProperty(NESSIE_REFERENCE_HASH, String.class); } + + public static int getStatisticsKllSketchKParameter(ConnectorSession session) + { + return session.getProperty(STATISTICS_KLL_SKETCH_K_PARAMETER, Integer.class); + } } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java index 238eb09095f2e..153c511be0f60 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java @@ -16,10 +16,14 @@ import com.facebook.airlift.log.Logger; import com.facebook.presto.common.RuntimeUnit; import com.facebook.presto.common.block.Block; +import com.facebook.presto.common.predicate.Range; import com.facebook.presto.common.predicate.TupleDomain; +import com.facebook.presto.common.type.DecimalType; import com.facebook.presto.common.type.FixedWidthType; +import com.facebook.presto.common.type.KllSketchType; import com.facebook.presto.common.type.TypeManager; import com.facebook.presto.hive.NodeVersion; +import com.facebook.presto.iceberg.statistics.KllHistogram; import com.facebook.presto.iceberg.statistics.StatisticsFileCache; import com.facebook.presto.iceberg.statistics.StatisticsFileCacheKey; import com.facebook.presto.spi.ConnectorSession; @@ -29,12 +33,15 @@ import com.facebook.presto.spi.statistics.ColumnStatisticType; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.ComputedStatistics; +import com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram; import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.theta.CompactSketch; import org.apache.iceberg.ContentFile; @@ -62,6 +69,8 @@ import org.apache.iceberg.types.Types; import org.apache.iceberg.util.Pair; +import javax.annotation.Nullable; + import java.io.IOException; import java.io.UncheckedIOException; import java.nio.ByteBuffer; @@ -80,6 +89,7 @@ import static com.facebook.presto.common.type.BigintType.BIGINT; import static com.facebook.presto.common.type.DateType.DATE; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static com.facebook.presto.common.type.TimestampType.TIMESTAMP; import static com.facebook.presto.common.type.TimestampWithTimeZoneType.TIMESTAMP_WITH_TIME_ZONE; import static com.facebook.presto.common.type.TypeUtils.isNumericType; @@ -89,10 +99,14 @@ import static com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR; import static com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; import static com.facebook.presto.iceberg.IcebergSessionProperties.getStatisticSnapshotRecordDifferenceWeight; +import static com.facebook.presto.iceberg.IcebergSessionProperties.getStatisticsKllSketchKParameter; import static com.facebook.presto.iceberg.IcebergUtil.getIdentityPartitions; import static com.facebook.presto.iceberg.Partition.toMap; +import static com.facebook.presto.iceberg.TypeConverter.toPrestoType; +import static com.facebook.presto.iceberg.statistics.KllHistogram.isKllHistogramSupportedType; import static com.facebook.presto.iceberg.util.StatisticsUtil.calculateAndSetTableSize; import static com.facebook.presto.iceberg.util.StatisticsUtil.formatIdentifier; +import static com.facebook.presto.spi.statistics.ColumnStatisticType.HISTOGRAM; import static com.facebook.presto.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; import static com.facebook.presto.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; import static com.facebook.presto.spi.statistics.SourceInfo.ConfidenceLevel.HIGH; @@ -100,9 +114,11 @@ import static com.google.common.collect.ImmutableMap.toImmutableMap; import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Iterables.getOnlyElement; +import static com.google.common.collect.Iterators.getOnlyElement; import static java.lang.Long.parseLong; import static java.lang.Math.abs; import static java.lang.String.format; +import static java.util.Objects.requireNonNull; import static java.util.UUID.randomUUID; import static java.util.stream.Collectors.toSet; import static org.apache.iceberg.SnapshotSummary.TOTAL_RECORDS_PROP; @@ -112,29 +128,34 @@ public class TableStatisticsMaker private static final Logger log = Logger.get(TableStatisticsMaker.class); private static final String ICEBERG_THETA_SKETCH_BLOB_TYPE_ID = "apache-datasketches-theta-v1"; private static final String ICEBERG_DATA_SIZE_BLOB_TYPE_ID = "presto-sum-data-size-bytes-v1"; + private static final String ICEBERG_KLL_SKETCH_BLOB_TYPE_ID = "presto-kll-sketch-bytes-v1"; private static final String ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY = "ndv"; private static final String ICEBERG_DATA_SIZE_BLOB_PROPERTY_KEY = "data_size"; private final Table icebergTable; private final ConnectorSession session; + private final TypeManager typeManager; private static final String STATISITCS_CACHE_METRIC_FILE_SIZE_FORMAT = "StatisticsFileCache/PuffinFileSize/%s/%s"; private static final String STATISITCS_CACHE_METRIC_FILE_COLUMN_COUNT_FORMAT = "StatisticsFileCache/ColumnCount/%s/%s"; private static final String STATISITCS_CACHE_METRIC_PARTIAL_MISS_FORMAT = "StatisticsFileCache/PartialMiss/%s/%s"; - private TableStatisticsMaker(Table icebergTable, ConnectorSession session) + private TableStatisticsMaker(Table icebergTable, ConnectorSession session, TypeManager typeManager) { - this.icebergTable = icebergTable; - this.session = session; + this.icebergTable = requireNonNull(icebergTable, "icebergTable is null"); + this.session = requireNonNull(session, "session is null"); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); } private static final Map puffinStatWriters = ImmutableMap.builder() .put(NUMBER_OF_DISTINCT_VALUES, TableStatisticsMaker::generateNDVBlob) .put(TOTAL_SIZE_IN_BYTES, TableStatisticsMaker::generateStatSizeBlob) + .put(HISTOGRAM, TableStatisticsMaker::generateKllSketchBlob) .build(); private static final Map puffinStatReaders = ImmutableMap.builder() .put(ICEBERG_THETA_SKETCH_BLOB_TYPE_ID, TableStatisticsMaker::readNDVBlob) .put(ICEBERG_DATA_SIZE_BLOB_TYPE_ID, TableStatisticsMaker::readDataSizeBlob) + .put(ICEBERG_KLL_SKETCH_BLOB_TYPE_ID, TableStatisticsMaker::readKllSketchBlob) .build(); public static TableStatistics getTableStatistics( @@ -147,7 +168,7 @@ public static TableStatistics getTableStatistics( Table icebergTable, List columns) { - return new TableStatisticsMaker(icebergTable, session).makeTableStatistics(statisticsFileCache, tableHandle, currentPredicate, constraint, columns); + return new TableStatisticsMaker(icebergTable, session, typeManager).makeTableStatistics(statisticsFileCache, tableHandle, currentPredicate, constraint, columns); } private TableStatistics makeTableStatistics(StatisticsFileCache statisticsFileCache, @@ -235,7 +256,18 @@ private TableStatistics makeTableStatistics(StatisticsFileCache statisticsFileCa Object min = summary.getMinValues().get(fieldId); Object max = summary.getMaxValues().get(fieldId); if (min instanceof Number && max instanceof Number) { - columnBuilder.setRange(Optional.of(new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue()))); + DoubleRange range = new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue()); + columnBuilder.setRange(Optional.of(range)); + + // the histogram is generated by scanning the entire dataset. It is possible that + // the constraint prevents scanning portions of the table. Given that we know the + // range that the scan provides for a particular column, bound the histogram to the + // scanned range. + + final DoubleRange histRange = range; + columnBuilder.setHistogram(columnBuilder.getHistogram() + .map(histogram -> DisjointRangeDomainHistogram + .addConjunction(histogram, Range.range(DOUBLE, histRange.getMin(), true, histRange.getMax(), true)))); } result.setColumnStatistics(columnHandle, columnBuilder.build()); } @@ -310,12 +342,12 @@ private Partition getSummaryFromFiles(CloseableIterable> files, return summary; } - public static void writeTableStatistics(NodeVersion nodeVersion, IcebergTableHandle tableHandle, Table icebergTable, ConnectorSession session, Collection computedStatistics) + public static void writeTableStatistics(NodeVersion nodeVersion, TypeManager typeManager, IcebergTableHandle tableHandle, Table icebergTable, ConnectorSession session, Collection computedStatistics) { - new TableStatisticsMaker(icebergTable, session).writeTableStatistics(nodeVersion, tableHandle, computedStatistics); + new TableStatisticsMaker(icebergTable, session, typeManager).writeTableStatistics(nodeVersion, typeManager, tableHandle, computedStatistics); } - private void writeTableStatistics(NodeVersion nodeVersion, IcebergTableHandle tableHandle, Collection computedStatistics) + private void writeTableStatistics(NodeVersion nodeVersion, TypeManager typeManager, IcebergTableHandle tableHandle, Collection computedStatistics) { Snapshot snapshot = tableHandle.getIcebergTableName().getSnapshotId().map(icebergTable::snapshot).orElseGet(icebergTable::currentSnapshot); if (snapshot == null) { @@ -335,9 +367,8 @@ private void writeTableStatistics(NodeVersion nodeVersion, IcebergTableHandle ta .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)) .forEach((key, value) -> { Optional.ofNullable(puffinStatWriters.get(key.getStatisticType())) - .ifPresent(generator -> { - writer.add(generator.generate(key, value, icebergTable, snapshot)); - }); + .flatMap(generator -> Optional.ofNullable(generator.generate(key, value, icebergTable, snapshot, typeManager))) + .ifPresent(writer::add); }); writer.finish(); icebergTable.updateStatistics().setStatistics( @@ -362,7 +393,8 @@ private void writeTableStatistics(NodeVersion nodeVersion, IcebergTableHandle ta @FunctionalInterface private interface PuffinBlobGenerator { - Blob generate(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot); + @Nullable + Blob generate(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot, TypeManager typeManager); } @FunctionalInterface @@ -371,12 +403,12 @@ private interface PuffinBlobReader /** * Reads the stats from the blob and then updates the stats builder argument. */ - void read(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder stats); + void read(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder stats, Table icebergTable, TypeManager typeManager); } - private static Blob generateNDVBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot) + private static Blob generateNDVBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot, TypeManager typeManager) { - int id = getFieldId(metadata, icebergTable); + int id = getField(metadata, icebergTable, snapshot).fieldId(); ByteBuffer raw = VARBINARY.getSlice(value, 0).toByteBuffer(); CompactSketch sketch = CompactSketch.wrap(Memory.wrap(raw, ByteOrder.nativeOrder())); return new Blob( @@ -389,9 +421,9 @@ private static Blob generateNDVBlob(ColumnStatisticMetadata metadata, Block valu ImmutableMap.of(ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY, Long.toString((long) sketch.getEstimate()))); } - private static Blob generateStatSizeBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot) + private static Blob generateStatSizeBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot, TypeManager typeManager) { - int id = getFieldId(metadata, icebergTable); + int id = getField(metadata, icebergTable, snapshot).fieldId(); long size = BIGINT.getLong(value, 0); return new Blob( ICEBERG_DATA_SIZE_BLOB_TYPE_ID, @@ -403,7 +435,26 @@ private static Blob generateStatSizeBlob(ColumnStatisticMetadata metadata, Block ImmutableMap.of(ICEBERG_DATA_SIZE_BLOB_PROPERTY_KEY, Long.toString(size))); } - private static void readNDVBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics) + private static Blob generateKllSketchBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot, TypeManager typeManager) + { + Types.NestedField field = getField(metadata, icebergTable, snapshot); + KllSketchType sketchType = new KllSketchType(toPrestoType(field.type(), typeManager)); + Slice sketchSlice = sketchType.getSlice(value, 0); + if (value.isNull(0)) { + // this can occur when all inputs to the sketch are null + return null; + } + return new Blob( + ICEBERG_KLL_SKETCH_BLOB_TYPE_ID, + ImmutableList.of(field.fieldId()), + snapshot.snapshotId(), + snapshot.sequenceNumber(), + sketchSlice.toByteBuffer(), + null, + ImmutableMap.of()); + } + + private static void readNDVBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics, Table icebergTable, TypeManager typeManager) { Optional.ofNullable(metadata.properties().get(ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY)) .ifPresent(ndvProp -> { @@ -418,7 +469,7 @@ private static void readNDVBlob(BlobMetadata metadata, ByteBuffer blob, ColumnSt }); } - private static void readDataSizeBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics) + private static void readDataSizeBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics, Table icebergTable, TypeManager typeManager) { Optional.ofNullable(metadata.properties().get(ICEBERG_DATA_SIZE_BLOB_PROPERTY_KEY)) .ifPresent(sizeProp -> { @@ -433,9 +484,17 @@ private static void readDataSizeBlob(BlobMetadata metadata, ByteBuffer blob, Col }); } - private static int getFieldId(ColumnStatisticMetadata metadata, Table icebergTable) + private static void readKllSketchBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics, Table icebergTable, TypeManager typeManager) + { + statistics.setHistogram(Optional.ofNullable(icebergTable.schemas().get(icebergTable.snapshot(metadata.snapshotId()).schemaId())) + .map(schema -> toPrestoType(schema.findType(getOnlyElement(metadata.inputFields().iterator())), typeManager)) + .map(prestoType -> new KllHistogram(Slices.wrappedBuffer(blob), prestoType))); + } + + private static Types.NestedField getField(ColumnStatisticMetadata metadata, Table icebergTable, Snapshot snapshot) { - return Optional.ofNullable(icebergTable.schema().findField(metadata.getColumnName())).map(Types.NestedField::fieldId) + return Optional.ofNullable(icebergTable.schemas().get(snapshot.schemaId())) + .map(schema -> schema.findField(metadata.getColumnName())) .orElseThrow(() -> { log.warn("failed to find column name %s in schema of table %s", metadata.getColumnName(), icebergTable.name()); return new PrestoException(ICEBERG_INVALID_METADATA, format("failed to find column name %s in schema of table %s", metadata.getColumnName(), icebergTable.name())); @@ -577,7 +636,7 @@ private Map loadStatisticsFile(IcebergTableHandle tab if (value == null) { value = ColumnStatistics.builder(); } - statReader.read(metadata, blob, value); + statReader.read(metadata, blob, value, icebergTable, typeManager); return value; }); }); @@ -602,7 +661,7 @@ private Map loadStatisticsFile(IcebergTableHandle tab return finalResult.build(); } - public static List getSupportedColumnStatistics(String columnName, com.facebook.presto.common.type.Type type) + public static List getSupportedColumnStatistics(ConnectorSession session, String columnName, com.facebook.presto.common.type.Type type) { ImmutableList.Builder supportedStatistics = ImmutableList.builder(); // all types which support being passed to the sketch_theta function @@ -613,6 +672,16 @@ public static List getSupportedColumnStatistics(String columnName, format("RETURN sketch_theta(%s)", formatIdentifier(columnName)), ImmutableList.of(columnName))); } + if (isKllHistogramSupportedType(type)) { + String histogramFunctionFmt = "RETURN sketch_kll_with_k(%s, CAST(%s as bigint))"; + if (type instanceof DecimalType) { + histogramFunctionFmt = "RETURN sketch_kll_with_k(CAST(%s as double), CAST(%s as bigint))"; + } + supportedStatistics.add(HISTOGRAM.getColumnStatisticMetadataWithCustomFunction(columnName, + format(histogramFunctionFmt, formatIdentifier(columnName), getStatisticsKllSketchKParameter(session)), + ImmutableList.of(columnName))); + } + if (!(type instanceof FixedWidthType)) { supportedStatistics.add(TOTAL_SIZE_IN_BYTES.getColumnStatisticMetadata(columnName)); } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/KllHistogram.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/KllHistogram.java new file mode 100644 index 0000000000000..0aab4cc0d1753 --- /dev/null +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/KllHistogram.java @@ -0,0 +1,210 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.iceberg.statistics; + +import com.facebook.presto.common.type.AbstractIntType; +import com.facebook.presto.common.type.AbstractLongType; +import com.facebook.presto.common.type.AbstractVarcharType; +import com.facebook.presto.common.type.Type; +import com.facebook.presto.spi.PrestoException; +import com.facebook.presto.spi.statistics.ConnectorHistogram; +import com.facebook.presto.spi.statistics.Estimate; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.annotations.VisibleForTesting; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.apache.datasketches.common.ArrayOfBooleansSerDe; +import org.apache.datasketches.common.ArrayOfDoublesSerDe; +import org.apache.datasketches.common.ArrayOfItemsSerDe; +import org.apache.datasketches.common.ArrayOfLongsSerDe; +import org.apache.datasketches.common.ArrayOfStringsSerDe; +import org.apache.datasketches.kll.KllItemsSketch; +import org.apache.datasketches.memory.Memory; +import org.openjdk.jol.info.ClassLayout; + +import java.util.Comparator; +import java.util.function.Function; + +import static com.facebook.presto.common.type.BooleanType.BOOLEAN; +import static com.facebook.presto.common.type.Decimals.isLongDecimal; +import static com.facebook.presto.common.type.Decimals.isShortDecimal; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.common.type.RealType.REAL; +import static com.facebook.presto.common.type.SmallintType.SMALLINT; +import static com.facebook.presto.common.type.TinyintType.TINYINT; +import static com.facebook.presto.common.type.TypeUtils.isNumericType; +import static com.facebook.presto.spi.StandardErrorCode.INVALID_ARGUMENTS; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Verify.verify; +import static java.nio.ByteOrder.LITTLE_ENDIAN; +import static java.util.Objects.requireNonNull; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +public class KllHistogram + implements ConnectorHistogram +{ + private static final long INSTANCE_SIZE = ClassLayout.parseClass(KllHistogram.class).instanceSize(); + // since the actual type parameter is only known at runtime, we can't concretely specify it + private final KllItemsSketch sketch; + private final Type type; + private final Function toDouble; + private final Function fromDouble; + + @SuppressWarnings({"unchecked", "rawtypes"}) + @JsonCreator + public KllHistogram(@JsonProperty("sketch") Slice bytes, @JsonProperty("type") Type type) + { + verify(isKllHistogramSupportedType(type), "histograms do not currently support type " + type.getDisplayName()); + this.type = requireNonNull(type, "type is null"); + SketchParameters parameters = getSketchParameters(type); + // the actual sketch can only accept the same object types which generated it + // however, the API can only accept or generate double types. We cast the inputs + // and results to/from double to satisfy the underlying sketch type. + if (parameters.getSerde().getClassOfT().equals(Double.class)) { + toDouble = x -> (double) x; + fromDouble = x -> x; + } + else if (parameters.getSerde().getClassOfT().equals(Long.class)) { + // dual cast to auto-box/unbox from Double/Long for sketch + toDouble = x -> (double) (long) x; + fromDouble = x -> (long) (double) x; + } + else { + throw new PrestoException(INVALID_ARGUMENTS, "can't create kll sketch from type: " + type); + } + sketch = KllItemsSketch.wrap(Memory.wrap(bytes.toByteBuffer(), LITTLE_ENDIAN), parameters.getComparator(), parameters.getSerde()); + } + + public static boolean isKllHistogramSupportedType(Type type) + { + try { + return isNumericType(type) || + type instanceof AbstractIntType; + } + catch (PrestoException e) { + return false; + } + } + + @JsonProperty + public Slice getSketch() + { + return Slices.wrappedBuffer(sketch.toByteArray()); + } + + @JsonProperty + public Type getType() + { + return type; + } + + @VisibleForTesting + @SuppressWarnings("rawtypes") + public KllItemsSketch getKllSketch() + { + return sketch; + } + + @Override + public Estimate cumulativeProbability(double value, boolean inclusive) + { + return Estimate.of(sketch.getRank(fromDouble.apply(value), inclusive ? INCLUSIVE : EXCLUSIVE)); + } + + @Override + public Estimate inverseCumulativeProbability(double percentile) + { + return Estimate.of(toDouble.apply(sketch.getQuantile(percentile))); + } + + /** + * The memory utilization is dominated by the size of the sketch. This estimate + * doesn't account for the other fields in the class. + */ + @Override + public long getEstimatedSize() + { + return INSTANCE_SIZE + sketch.getSerializedSizeBytes(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("type", type) + .add("k", this.sketch.getK()) + .add("N", this.sketch.getN()) + .add("retained", this.sketch.getNumRetained()) + .add("mingetSerialized", this.sketch.getMinItem()) + .add("max", this.sketch.getMaxItem()) + .add("p50", sketch.getQuantile(0.5)) + .add("p75", sketch.getQuantile(0.75)) + .add("p90", sketch.getQuantile(0.90)) + .add("p99", sketch.getQuantile(0.99)) + .add("p99.9", sketch.getQuantile(0.999)) + .toString(); + } + + private static class SketchParameters + { + private final Comparator comparator; + private final ArrayOfItemsSerDe serde; + + public SketchParameters(Comparator comparator, ArrayOfItemsSerDe serde) + { + this.comparator = comparator; + this.serde = serde; + } + + public Comparator getComparator() + { + return comparator; + } + + public ArrayOfItemsSerDe getSerde() + { + return serde; + } + } + + private static SketchParameters getSketchParameters(Type type) + { + if (type.equals(REAL)) { + return new SketchParameters<>(Double::compareTo, new ArrayOfDoublesSerDe()); + } + else if (isShortDecimal(type)) { + return new SketchParameters<>(Double::compareTo, new ArrayOfDoublesSerDe()); + } + else if (isLongDecimal(type)) { + return new SketchParameters<>(Double::compareTo, new ArrayOfDoublesSerDe()); + } + else if (type.equals(DOUBLE)) { + return new SketchParameters<>(Double::compareTo, new ArrayOfDoublesSerDe()); + } + else if (type.equals(BOOLEAN)) { + return new SketchParameters<>(Boolean::compareTo, new ArrayOfBooleansSerDe()); + } + else if (type instanceof AbstractIntType || type instanceof AbstractLongType || type.equals(SMALLINT) || type.equals(TINYINT)) { + return new SketchParameters<>(Long::compareTo, new ArrayOfLongsSerDe()); + } + else if (type instanceof AbstractVarcharType) { + return new SketchParameters<>(String::compareTo, new ArrayOfStringsSerDe()); + } + else { + throw new PrestoException(INVALID_ARGUMENTS, "Unsupported type for KLL sketch: " + type); + } + } +} diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java index 95e35fbcece0f..ebc9814c774b4 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java @@ -21,6 +21,9 @@ import com.facebook.presto.common.transaction.TransactionId; import com.facebook.presto.common.type.FixedWidthType; import com.facebook.presto.common.type.TimeZoneKey; +import com.facebook.presto.common.type.Type; +import com.facebook.presto.common.type.TypeParameter; +import com.facebook.presto.hive.BaseHiveColumnHandle; import com.facebook.presto.hive.HdfsConfiguration; import com.facebook.presto.hive.HdfsConfigurationInitializer; import com.facebook.presto.hive.HdfsContext; @@ -41,6 +44,8 @@ import com.facebook.presto.spi.connector.classloader.ClassLoaderSafeConnectorMetadata; import com.facebook.presto.spi.security.AllowAllAccessControl; import com.facebook.presto.spi.statistics.ColumnStatistics; +import com.facebook.presto.spi.statistics.ConnectorHistogram; +import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.testing.MaterializedResult; @@ -54,6 +59,7 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Maps; +import org.apache.commons.math3.distribution.NormalDistribution; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -101,17 +107,23 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.UUID; import java.util.function.BiConsumer; +import java.util.function.Consumer; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; import static com.facebook.presto.SystemSessionProperties.LEGACY_TIMESTAMP; +import static com.facebook.presto.SystemSessionProperties.OPTIMIZER_USE_HISTOGRAMS; import static com.facebook.presto.common.type.BigintType.BIGINT; +import static com.facebook.presto.common.type.DateType.DATE; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static com.facebook.presto.common.type.IntegerType.INTEGER; +import static com.facebook.presto.common.type.RealType.REAL; import static com.facebook.presto.common.type.TimeZoneKey.UTC_KEY; import static com.facebook.presto.common.type.VarcharType.VARCHAR; import static com.facebook.presto.hive.BaseHiveColumnHandle.ColumnType.SYNTHESIZED; @@ -135,15 +147,20 @@ import static com.facebook.presto.testing.TestingConnectorSession.SESSION; import static com.facebook.presto.testing.assertions.Assert.assertEquals; import static com.facebook.presto.tests.sql.TestTable.randomTableSuffix; +import static com.facebook.presto.type.DecimalParametricType.DECIMAL; +import static com.google.common.collect.ImmutableMap.toImmutableMap; import static com.google.common.io.Files.createTempDir; import static java.lang.String.format; import static java.util.Objects.requireNonNull; import static java.util.UUID.randomUUID; +import static java.util.function.Function.identity; import static org.apache.iceberg.SnapshotSummary.TOTAL_DATA_FILES_PROP; import static org.apache.iceberg.SnapshotSummary.TOTAL_DELETE_FILES_PROP; import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0; import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotEquals; +import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertTrue; @Test(singleThreaded = true) @@ -1026,6 +1043,11 @@ private TableStatistics getTableStats(String name, Optional snapshot) return getTableStats(name, snapshot, getSession(), Optional.empty()); } + private TableStatistics getTableStats(String name, Optional snapshot, Session session) + { + return getTableStats(name, snapshot, session, Optional.empty()); + } + private TableStatistics getTableStats(String name, Optional snapshot, Session session, Optional> columns) { TransactionId transactionId = getQueryRunner().getTransactionManager().beginTransaction(false); @@ -1453,6 +1475,59 @@ public void testMetadataDeleteOnUnPartitionedTableWithDeleteFiles() } } + @DataProvider(name = "validHistogramTypes") + public Object[][] validHistogramTypesDataProvider() + { + return new Object[][] { + // types not supported in Iceberg connector, but that histogram could support + // {TINYINT, new String[]{"1", "2", "10"}}, + // {SMALLINT, new String[]{"1", "2", "10"}}, + // {TIMESTAMP_WITH_TIME_ZONE, new String[]{"now() + interval '1' hour", "now() + interval '2' hour"}}, + // iceberg stores microsecond precision but presto calculates on millisecond precision + // need a fix to properly convert for the optimizer. + // {TIMESTAMP, new String[] {"localtimestamp + interval '1' hour", "localtimestamp + interval '2' hour"}}, + // {TIME, new String[] {"localtime", "localtime + interval '1' hour"}}, + // supported types + {INTEGER, new String[] {"1", "5", "9"}}, + {BIGINT, new String[] {"2", "4", "6"}}, + {DOUBLE, new String[] {"1.0", "3.1", "4.6"}}, + // short decimal + {DECIMAL.createType(ImmutableList.of(TypeParameter.of(2L), TypeParameter.of(1L))), new String[] {"0.0", "3.0", "4.0"}}, + // long decimal + {DECIMAL.createType(ImmutableList.of(TypeParameter.of(38L), TypeParameter.of(1L))), new String[] {"0.0", "3.0", "4.0"}}, + {DATE, new String[] {"date '2024-01-01'", "date '2024-03-30'", "date '2024-05-30'"}}, + {REAL, new String[] {"1.0", "2.0", "3.0"}}, + }; + } + + /** + * Verifies that the histogram is returned after ANALYZE for a variety of types + */ + @Test(dataProvider = "validHistogramTypes") + public void testHistogramStorage(Type type, Object[] values) + { + try { + Session session = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + assertQuerySucceeds("DROP TABLE IF EXISTS create_histograms"); + assertQuerySucceeds(String.format("CREATE TABLE create_histograms (c %s)", type.getDisplayName())); + assertQuerySucceeds(String.format("INSERT INTO create_histograms VALUES %s", Joiner.on(", ").join(values))); + assertQuerySucceeds(session, "ANALYZE create_histograms"); + TableStatistics tableStatistics = getTableStats("create_histograms"); + Map nameToHandle = tableStatistics.getColumnStatistics().keySet() + .stream().map(IcebergColumnHandle.class::cast) + .collect(Collectors.toMap(BaseHiveColumnHandle::getName, identity())); + assertNotNull(nameToHandle.get("c")); + IcebergColumnHandle handle = nameToHandle.get("c"); + ColumnStatistics statistics = tableStatistics.getColumnStatistics().get(handle); + assertTrue(statistics.getHistogram().isPresent()); + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS create_histograms"); + } + } + @Test public void testMetadataDeleteOnPartitionedTableWithDeleteFiles() { @@ -1998,6 +2073,151 @@ public void testBatchReadOnTimeType(WriterVersion writerVersion) assertQuerySucceeds("DROP TABLE time_batch_read"); } + public void testAllNullHistogramColumn() + { + try { + Session session = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + assertQuerySucceeds("DROP TABLE IF EXISTS histogram_all_nulls"); + assertQuerySucceeds("CREATE TABLE histogram_all_nulls (c bigint)"); + TableStatistics stats = getTableStats("histogram_all_nulls", Optional.empty(), session); + assertFalse(stats.getColumnStatistics().values().stream().findFirst().isPresent()); + assertUpdate("INSERT INTO histogram_all_nulls VALUES NULL, NULL, NULL, NULL, NULL", 5); + stats = getTableStats("histogram_all_nulls", Optional.empty(), session); + assertFalse(stats.getColumnStatistics().values().stream().findFirst() + .get().getHistogram().isPresent()); + assertQuerySucceeds(session, "ANALYZE histogram_all_nulls"); + stats = getTableStats("histogram_all_nulls", Optional.empty(), session); + assertFalse(stats.getColumnStatistics().values().stream().findFirst() + .get().getHistogram().isPresent()); + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS histogram_all_nulls"); + } + } + + @Test(dataProvider = "validHistogramTypes") + public void testHistogramShowStats(Type type, Object[] values) + { + try { + Session session = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + assertQuerySucceeds("DROP TABLE IF EXISTS create_histograms"); + assertQuerySucceeds(String.format("CREATE TABLE show_histograms (c %s)", type.getDisplayName())); + assertQuerySucceeds(String.format("INSERT INTO show_histograms VALUES %s", Joiner.on(", ").join(values))); + assertQuerySucceeds(session, "ANALYZE show_histograms"); + TableStatistics tableStatistics = getTableStats("show_histograms", Optional.empty(), session, Optional.empty()); + Map> histogramByColumnName = tableStatistics.getColumnStatistics() + .entrySet() + .stream() + .collect(toImmutableMap( + entry -> ((IcebergColumnHandle) entry.getKey()).getName(), + entry -> entry.getValue().getHistogram())); + MaterializedResult stats = getQueryRunner().execute("SHOW STATS for show_histograms"); + stats.getMaterializedRows() + .forEach(row -> { + String name = (String) row.getField(0); + String histogram = (String) row.getField(7); + assertEquals(Optional.ofNullable(histogramByColumnName.get(name)) + .flatMap(identity()) + .map(Objects::toString).orElse(null), + histogram); + }); + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS show_histograms"); + } + } + + /** + * Verifies that when the users opts-in to using histograms that the + * optimizer estimates reflect the actual dataset for a variety of filter + * types (LTE, GT, EQ, NE) on a non-uniform data distribution + */ + @Test + public void testHistogramsUsedInOptimization() + { + Session histogramSession = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + // standard-normal distribution should have vastly different estimates than uniform at the tails (e.g. -3, +3) + NormalDistribution dist = new NormalDistribution(0, 1); + double[] values = dist.sample(1000); + Arrays.sort(values); + + try { + assertQuerySucceeds("DROP TABLE IF EXISTS histogram_validation"); + assertQuerySucceeds("CREATE TABLE histogram_validation (c double)"); + assertQuerySucceeds(String.format("INSERT INTO histogram_validation VALUES %s", Joiner.on(", ").join(Arrays.stream(values).iterator()))); + assertQuerySucceeds(histogramSession, "ANALYZE histogram_validation"); + Consumer assertFilters = (value) -> { + // use Math.abs because if the value isn't found, the returned value of binary + // search is (- insert index). The absolute value index tells us roughly how + // many records would have been returned regardless of if the actual value is in the + // dataset + double estimatedRowCount = Math.abs(Arrays.binarySearch(values, value)); + assertPlan(histogramSession, "SELECT * FROM histogram_validation WHERE c <= " + value, + output(anyTree(tableScan("histogram_validation"))).withApproximateOutputRowCount(estimatedRowCount, 25)); + // check that inverse filter equals roughly the inverse number of rows + assertPlan(histogramSession, "SELECT * FROM histogram_validation WHERE c > " + value, + output(anyTree(tableScan("histogram_validation"))).withApproximateOutputRowCount(Math.max(0.0, values.length - estimatedRowCount), 25)); + // having an exact random double value from the distribution exist more than once is exceedingly rare. + // the histogram calculation should return 1 (and the inverse) in both situations + assertPlan(histogramSession, "SELECT * FROM histogram_validation WHERE c = " + value, + output(anyTree(tableScan("histogram_validation"))).withApproximateOutputRowCount(1.0, 25)); + assertPlan(histogramSession, "SELECT * FROM histogram_validation WHERE c != " + value, + output(anyTree(tableScan("histogram_validation"))).withApproximateOutputRowCount(values.length - 1, 25)); + }; + + assertFilters.accept(values[1]); // choose 1 greater than the min value + assertFilters.accept(-2.0); // should be very unlikely to generate a distribution where all values > -2.0 + assertFilters.accept(-1.0); + assertFilters.accept(0.0); + assertFilters.accept(1.0); + assertFilters.accept(2.0); // should be very unlikely to generate a distribution where all values < 2.0 + assertFilters.accept(values[values.length - 2]); // choose 1 less than the max value + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS histogram_validation"); + } + } + + /** + * Verifies that the data in the histogram matches the mins/maxs of the values + * in the table when created + */ + @Test(dataProvider = "validHistogramTypes") + public void testHistogramReconstruction(Type type, Object[] values) + { + try { + Session session = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + assertQuerySucceeds("DROP TABLE IF EXISTS verify_histograms"); + assertQuerySucceeds(String.format("CREATE TABLE verify_histograms (c %s)", type.getDisplayName())); + assertQuerySucceeds(String.format("INSERT INTO verify_histograms VALUES %s", Joiner.on(", ").join(values))); + assertQuerySucceeds(session, "ANALYZE verify_histograms"); + TableStatistics tableStatistics = getTableStats("verify_histograms", Optional.empty(), session, Optional.empty()); + Map nameToHandle = tableStatistics.getColumnStatistics().keySet() + .stream().map(IcebergColumnHandle.class::cast) + .collect(Collectors.toMap(BaseHiveColumnHandle::getName, identity())); + assertNotNull(nameToHandle.get("c")); + IcebergColumnHandle handle = nameToHandle.get("c"); + ColumnStatistics statistics = tableStatistics.getColumnStatistics().get(handle); + ConnectorHistogram histogram = statistics.getHistogram().get(); + DoubleRange range = statistics.getRange().get(); + double min = range.getMin(); + double max = range.getMax(); + assertEquals(histogram.inverseCumulativeProbability(0.0).getValue(), min); + assertEquals(histogram.inverseCumulativeProbability(1.0).getValue(), max); + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS verify_histograms"); + } + } + private void testCheckDeleteFiles(Table icebergTable, int expectedSize, List expectedFileContent) { // check delete file list diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java index d28503a27d316..588b7273d44c5 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java @@ -69,7 +69,8 @@ public void testDefaults() .setMetadataPreviousVersionsMax(METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT) .setMetadataDeleteAfterCommit(METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT) .setMetricsMaxInferredColumn(METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT) - .setMaxStatisticsFileCacheSize(succinctDataSize(256, MEGABYTE))); + .setMaxStatisticsFileCacheSize(succinctDataSize(256, MEGABYTE)) + .setStatisticsKllSketchKParameter(1024)); } @Test @@ -101,6 +102,7 @@ public void testExplicitPropertyMappings() .put("iceberg.metadata-delete-after-commit", "true") .put("iceberg.metrics-max-inferred-column", "16") .put("iceberg.max-statistics-file-cache-size", "512MB") + .put("iceberg.statistics-kll-sketch-k-parameter", "4096") .build(); IcebergConfig expected = new IcebergConfig() @@ -128,7 +130,8 @@ public void testExplicitPropertyMappings() .setMetadataPreviousVersionsMax(1) .setMetadataDeleteAfterCommit(true) .setMetricsMaxInferredColumn(16) - .setMaxStatisticsFileCacheSize(succinctDataSize(512, MEGABYTE)); + .setMaxStatisticsFileCacheSize(succinctDataSize(512, MEGABYTE)) + .setStatisticsKllSketchKParameter(4096); assertFullMapping(properties, expected); } diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/statistics/TestKllHistogram.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/statistics/TestKllHistogram.java new file mode 100644 index 0000000000000..2db83a1b8f0a6 --- /dev/null +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/statistics/TestKllHistogram.java @@ -0,0 +1,166 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.iceberg.statistics; + +import com.facebook.presto.common.type.CharType; +import com.facebook.presto.common.type.Type; +import com.facebook.presto.common.type.VarcharType; +import com.google.common.base.VerifyException; +import io.airlift.slice.Slices; +import org.apache.datasketches.common.ArrayOfDoublesSerDe; +import org.apache.datasketches.common.ArrayOfLongsSerDe; +import org.apache.datasketches.kll.KllItemsSketch; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.nio.ByteBuffer; +import java.util.stream.DoubleStream; +import java.util.stream.LongStream; + +import static com.facebook.presto.common.type.BigintType.BIGINT; +import static com.facebook.presto.common.type.BooleanType.BOOLEAN; +import static com.facebook.presto.common.type.DateType.DATE; +import static com.facebook.presto.common.type.DecimalType.createDecimalType; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.common.type.IntegerType.INTEGER; +import static com.facebook.presto.common.type.RealType.REAL; +import static com.facebook.presto.common.type.TimeType.TIME; +import static com.facebook.presto.common.type.TimestampType.TIMESTAMP; +import static com.facebook.presto.common.type.TimestampWithTimeZoneType.TIMESTAMP_WITH_TIME_ZONE; +import static com.facebook.presto.common.type.VarcharType.VARCHAR; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertThrows; + +public class TestKllHistogram +{ + @SuppressWarnings("unchecked") + @Test + public void testSimpleCreation() + { + KllItemsSketch sketch = KllItemsSketch.newHeapInstance(Double::compareTo, new ArrayOfDoublesSerDe()); + DoubleStream.iterate(0.0, i -> i + 1).limit(100).forEach(sketch::update); + KllHistogram histogram = new KllHistogram(Slices.wrappedBuffer(ByteBuffer.wrap(sketch.toByteArray())), DOUBLE); + assertSketchesEqual(histogram.getKllSketch(), sketch); + } + + @Test + public void smokeTestHistogram() + { + // a histogram with a uniform distribution from 0.0 to 99.0 + KllHistogram basicHistogram = generateDoublesHistogram(); + // inverse cumulative probability + assertEquals(basicHistogram.inverseCumulativeProbability(0.0).getValue(), 0.0, 1E-8); + assertEquals(basicHistogram.inverseCumulativeProbability(1.0).getValue(), 99.0, 1E-8); + assertEquals(basicHistogram.inverseCumulativeProbability(0.5).getValue(), 49.0, 1E-8); + + // cumulative probability w/ inclusivities + assertEquals(basicHistogram.cumulativeProbability(0.0, true).getValue(), 0.01, 1E-8); + assertEquals(basicHistogram.cumulativeProbability(0.0, false).getValue(), 0.0, 1E-8); + + assertEquals(basicHistogram.cumulativeProbability(49.0, false).getValue(), 0.49, 1E-8); + assertEquals(basicHistogram.cumulativeProbability(49.0, true).getValue(), 0.5, 1E-8); + + assertEquals(basicHistogram.cumulativeProbability(99.0, false).getValue(), 0.99, 1E-8); + assertEquals(basicHistogram.cumulativeProbability(99.0, true).getValue(), 1.0, 1E-8); + } + + @DataProvider(name = "kllSupportedTypes") + public static Object[][] kllHistogramTypeDataProvider() + { + return new Object[][] { + // long decimal (represented by Slice.class), currently not supported + // {createDecimalType(), TestKllHistogram.generateLongSketch()}, + // time and timestamp types need additional changes because iceberg stores them in + // microsecond format but Presto always processes in milliseconds + // {TIMESTAMP_WITH_TIME_ZONE, generateLongSketch()}, + // {TIMESTAMP_MICROSECONDS, generateLongSketch()}, + // {TIMESTAMP, generateLongSketch()}, + // {TIME, generateLongSketch()}, + {INTEGER, TestKllHistogram.generateLongSketch()}, + {BIGINT, TestKllHistogram.generateLongSketch()}, + {DOUBLE, TestKllHistogram.generateDoubleSketch()}, + {createDecimalType(3, 1), TestKllHistogram.generateDoubleSketch()}, + {DATE, TestKllHistogram.generateLongSketch()}, + {createDecimalType(38, 0), TestKllHistogram.generateDoubleSketch()}, + {REAL, generateDoubleSketch()}, + }; + } + + @DataProvider(name = "kllUnsupportedTypes") + public static Object[][] unsupportedKllHistogramTypes() + { + return new Object[][] { + // long decimal (represented by Slice.class), currently not supported + {CharType.createCharType(0)}, + {CharType.createCharType(100)}, + {BOOLEAN}, + {VARCHAR}, + {VarcharType.createVarcharType(10)}, + {TIMESTAMP}, + {TIMESTAMP_WITH_TIME_ZONE}, + {TIME}, + }; + } + + @SuppressWarnings("rawtypes") + @Test(dataProvider = "kllSupportedTypes") + public void testTypeCreation(Type type, KllItemsSketch sketch) + { + KllHistogram histogram = new KllHistogram(Slices.wrappedBuffer(sketch.toByteArray()), type); + double value = histogram.inverseCumulativeProbability(0.5).getValue(); + double probability = histogram.cumulativeProbability(49.0, true).getValue(); + assertEquals(probability, 0.5); + assertEquals(value, 49.0); + } + + @Test(dataProvider = "kllUnsupportedTypes") + public void testUnsupportedKllTypes(Type type) + { + assertThrows(VerifyException.class, () -> { + new KllHistogram(null, type); + }); + } + + /** + * @return generates a histogram of doubles from [0.0, 99.9] in intervals of 1.0 + */ + private static KllHistogram generateDoublesHistogram() + { + return new KllHistogram(Slices.wrappedBuffer(ByteBuffer.wrap(generateDoubleSketch().toByteArray())), DOUBLE); + } + + private static KllItemsSketch generateLongSketch() + { + KllItemsSketch sketch = KllItemsSketch.newHeapInstance(Long::compareTo, new ArrayOfLongsSerDe()); + LongStream.iterate(0, i -> i + 1).limit(100).forEach(sketch::update); + return sketch; + } + + private static KllItemsSketch generateDoubleSketch() + { + KllItemsSketch sketch = KllItemsSketch.newHeapInstance(Double::compareTo, new ArrayOfDoublesSerDe()); + DoubleStream.iterate(0.0, i -> i + 1).limit(100).forEach(sketch::update); + return sketch; + } + + private static void assertSketchesEqual(KllItemsSketch sketch, KllItemsSketch other) + { + assertEquals(other.getK(), sketch.getK()); + assertEquals(other.getN(), sketch.getN()); + assertEquals(other.getMinItem(), sketch.getMinItem()); + assertEquals(other.getMaxItem(), sketch.getMaxItem()); + assertEquals(other.getSortedView().getCumulativeWeights(), sketch.getSortedView().getCumulativeWeights()); + assertEquals(other.getSortedView().getQuantiles(), sketch.getSortedView().getQuantiles()); + } +} diff --git a/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java b/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java index 306e00cfb9079..efc05bf4a20a6 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java @@ -17,7 +17,10 @@ import com.facebook.presto.Session; import com.facebook.presto.SystemSessionProperties; import com.facebook.presto.spi.relation.VariableReferenceExpression; +import com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram; import com.facebook.presto.spi.statistics.Estimate; +import com.facebook.presto.spi.statistics.HistogramCalculator; +import com.facebook.presto.spi.statistics.UniformDistributionHistogram; import com.facebook.presto.sql.tree.ComparisonExpression; import java.util.Optional; @@ -156,7 +159,7 @@ private PlanNodeStatsEstimate estimateFilterRange( .setStatisticsRange(intersectRange) .setNullsFraction(0.0); if (useHistograms) { - symbolNewEstimate.setHistogram(expressionStatistics.getHistogram().map(expressionHistogram -> DisjointRangeDomainHistogram.addConjunction(expressionHistogram, intersectRange))); + symbolNewEstimate.setHistogram(expressionStatistics.getHistogram().map(expressionHistogram -> DisjointRangeDomainHistogram.addConjunction(expressionHistogram, intersectRange.toPrestoRange()))); } estimate = estimate.mapVariableColumnStatistics(expressionVariable.get(), oldStats -> symbolNewEstimate.build()); @@ -171,7 +174,7 @@ private double calculateFilterFactor(VariableStatsEstimate variableStatistics, S Estimate filterEstimate; if (useHistograms) { Estimate distinctEstimate = isNaN(variableStatistics.getDistinctValuesCount()) ? Estimate.unknown() : Estimate.of(variableRange.getDistinctValuesCount()); - filterEstimate = HistogramCalculator.calculateFilterFactor(intersectRange, variableStatistics.getHistogram().orElse(new UniformDistributionHistogram(variableStatistics.getLowValue(), variableStatistics.getHighValue())), distinctEstimate, true); + filterEstimate = HistogramCalculator.calculateFilterFactor(intersectRange.toPrestoRange(), intersectRange.getDistinctValuesCount(), variableStatistics.getHistogram().orElse(new UniformDistributionHistogram(variableStatistics.getLowValue(), variableStatistics.getHighValue())), distinctEstimate, true); if (log.isDebugEnabled()) { double expressionFilter = variableRange.overlapPercentWith(intersectRange); if (!Double.isNaN(expressionFilter) && diff --git a/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java b/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java index 7a2e804aa7e17..85a51c2b6bb51 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java @@ -141,6 +141,11 @@ private static ColumnStatistics toColumnStatistics(VariableStatsEstimate variabl if (!Double.isNaN(variableStatsEstimate.getLowValue()) && !Double.isNaN(variableStatsEstimate.getHighValue())) { builder.setRange(new DoubleRange(variableStatsEstimate.getLowValue(), variableStatsEstimate.getHighValue())); } + + if (variableStatsEstimate.getHistogram().isPresent()) { + builder.setHistogram(variableStatsEstimate.getHistogram()); + } + return builder.build(); } } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java b/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java index 77e0b487a79d1..b9f340b623efb 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java @@ -34,10 +34,10 @@ import static com.facebook.presto.SystemSessionProperties.getDefaultJoinSelectivityCoefficient; import static com.facebook.presto.SystemSessionProperties.shouldOptimizerUseHistograms; -import static com.facebook.presto.cost.DisjointRangeDomainHistogram.addConjunction; import static com.facebook.presto.cost.FilterStatsCalculator.UNKNOWN_FILTER_COEFFICIENT; import static com.facebook.presto.cost.VariableStatsEstimate.buildFrom; import static com.facebook.presto.expressions.LogicalRowExpressions.extractConjuncts; +import static com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram.addConjunction; import static com.facebook.presto.sql.analyzer.ExpressionTreeUtils.getNodeLocation; import static com.facebook.presto.sql.planner.plan.Patterns.join; import static com.facebook.presto.sql.tree.ComparisonExpression.Operator.EQUAL; @@ -250,7 +250,7 @@ private PlanNodeStatsEstimate filterByAuxiliaryClause(PlanNodeStatsEstimate stat .setStatisticsRange(intersect) .setDistinctValuesCount(retainedNdv); if (useHistograms) { - newLeftStats.setHistogram(leftStats.getHistogram().map(leftHistogram -> addConjunction(leftHistogram, intersect))); + newLeftStats.setHistogram(leftStats.getHistogram().map(leftHistogram -> addConjunction(leftHistogram, intersect.toPrestoRange()))); } VariableStatsEstimate.Builder newRightStats = buildFrom(rightStats) @@ -258,7 +258,7 @@ private PlanNodeStatsEstimate filterByAuxiliaryClause(PlanNodeStatsEstimate stat .setStatisticsRange(intersect) .setDistinctValuesCount(retainedNdv); if (useHistograms) { - newRightStats.setHistogram(rightStats.getHistogram().map(rightHistogram -> addConjunction(rightHistogram, intersect))); + newRightStats.setHistogram(rightStats.getHistogram().map(rightHistogram -> addConjunction(rightHistogram, intersect.toPrestoRange()))); } PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.buildFrom(stats) diff --git a/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java b/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java index 1b2797e18a8af..2a280ae2524ba 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java @@ -14,10 +14,11 @@ package com.facebook.presto.cost; import com.facebook.presto.spi.statistics.ConnectorHistogram; +import com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram; import java.util.Optional; -import static com.facebook.presto.cost.DisjointRangeDomainHistogram.addConjunction; +import static com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram.addConjunction; import static com.google.common.base.Preconditions.checkArgument; import static java.lang.Double.NaN; import static java.lang.Double.isNaN; @@ -139,7 +140,7 @@ public PlanNodeStatsEstimate capStats(PlanNodeStatsEstimate stats, PlanNodeStats double cappedNullsFraction = cappedRowCount == 0 ? 1 : cappedNumberOfNulls / cappedRowCount; newSymbolStats.setNullsFraction(cappedNullsFraction); if (shouldUseHistograms) { - newSymbolStats.setHistogram(symbolStats.getHistogram().map(symbolHistogram -> addConjunction(symbolHistogram, new StatisticRange(newLow, newHigh, 0)))); + newSymbolStats.setHistogram(symbolStats.getHistogram().map(symbolHistogram -> addConjunction(symbolHistogram, new StatisticRange(newLow, newHigh, 0).toPrestoRange()))); } result.addVariableStatistics(symbol, newSymbolStats.build()); @@ -296,8 +297,8 @@ private VariableStatsEstimate addColumnStats( .setNullsFraction(newNullsFraction); if (shouldUseHistograms) { Optional newHistogram = RangeAdditionStrategy.INTERSECT == strategy ? - leftStats.getHistogram().map(leftHistogram -> DisjointRangeDomainHistogram.addConjunction(leftHistogram, rightRange)) : - leftStats.getHistogram().map(leftHistogram -> DisjointRangeDomainHistogram.addDisjunction(leftHistogram, rightRange)); + leftStats.getHistogram().map(leftHistogram -> DisjointRangeDomainHistogram.addConjunction(leftHistogram, rightRange.toPrestoRange())) : + leftStats.getHistogram().map(leftHistogram -> DisjointRangeDomainHistogram.addDisjunction(leftHistogram, rightRange.toPrestoRange())); statistics.setHistogram(newHistogram); } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java b/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java index 4d80e13cb92cc..060e02bc8b07b 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java @@ -13,19 +13,19 @@ */ package com.facebook.presto.cost; +import com.facebook.presto.common.predicate.Range; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; -import com.google.common.collect.BoundType; -import com.google.common.collect.Range; import java.util.Objects; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; +import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; import static com.facebook.presto.util.MoreMath.nearlyEqual; import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkArgument; -import static java.lang.Double.NEGATIVE_INFINITY; import static java.lang.Double.NaN; -import static java.lang.Double.POSITIVE_INFINITY; import static java.lang.Double.isFinite; import static java.lang.Double.isInfinite; import static java.lang.Double.isNaN; @@ -36,9 +36,6 @@ public class StatisticRange { - protected static final double INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.25; - protected static final double INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.5; - // TODO unify field and method names with SymbolStatsEstimate /** * {@code NaN} represents empty range ({@code high} must be {@code NaN} too) @@ -222,19 +219,12 @@ public StatisticRange addAndCollapseDistinctValues(StatisticRange other) return expandRangeWithNewDistinct(newDistinctValues, other); } - public Range toRange() + public Range toPrestoRange() { - return Range.range(low, openLow ? BoundType.OPEN : BoundType.CLOSED, high, openHigh ? BoundType.OPEN : BoundType.CLOSED); - } - - public static StatisticRange fromRange(Range range) - { - return new StatisticRange( - range.hasLowerBound() ? range.lowerEndpoint() : NEGATIVE_INFINITY, - !range.hasLowerBound() || range.lowerBoundType() == BoundType.OPEN, - range.hasUpperBound() ? range.upperEndpoint() : POSITIVE_INFINITY, - !range.hasUpperBound() || range.upperBoundType() == BoundType.OPEN, - NaN); + if (low == high) { + return Range.equal(DOUBLE, low); + } + return Range.range(DOUBLE, low, !openLow, high, !openHigh); } private StatisticRange expandRangeWithNewDistinct(double newDistinctValues, StatisticRange other) diff --git a/presto-main/src/main/java/com/facebook/presto/sql/planner/BasePlanFragmenter.java b/presto-main/src/main/java/com/facebook/presto/sql/planner/BasePlanFragmenter.java index ceb0186ffb413..190271f3e39ae 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/planner/BasePlanFragmenter.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/planner/BasePlanFragmenter.java @@ -62,7 +62,6 @@ import static com.facebook.presto.SystemSessionProperties.isForceSingleNodeOutput; import static com.facebook.presto.SystemSessionProperties.isSingleNodeExecutionEnabled; -import static com.facebook.presto.SystemSessionProperties.shouldOptimizerUseHistograms; import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED; import static com.facebook.presto.sql.TemporaryTableUtil.assignPartitioningVariables; import static com.facebook.presto.sql.TemporaryTableUtil.assignTemporaryTableColumnNames; @@ -122,7 +121,7 @@ public BasePlanFragmenter( this.idAllocator = requireNonNull(idAllocator, "idAllocator is null"); this.variableAllocator = requireNonNull(variableAllocator, "variableAllocator is null"); this.outputTableWriterNodeIds = ImmutableSet.copyOf(requireNonNull(outputTableWriterNodeIds, "outputTableWriterNodeIds is null")); - this.statisticsAggregationPlanner = new StatisticsAggregationPlanner(variableAllocator, metadata.getFunctionAndTypeManager(), session, shouldOptimizerUseHistograms(session)); + this.statisticsAggregationPlanner = new StatisticsAggregationPlanner(variableAllocator, metadata.getFunctionAndTypeManager(), session); } public SubPlan buildRootFragment(PlanNode root, FragmentProperties properties) diff --git a/presto-main/src/main/java/com/facebook/presto/sql/planner/LogicalPlanner.java b/presto-main/src/main/java/com/facebook/presto/sql/planner/LogicalPlanner.java index 2b68e642dec97..9d0d1aaaf1804 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/planner/LogicalPlanner.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/planner/LogicalPlanner.java @@ -86,7 +86,6 @@ import java.util.Optional; import java.util.Set; -import static com.facebook.presto.SystemSessionProperties.shouldOptimizerUseHistograms; import static com.facebook.presto.common.type.BigintType.BIGINT; import static com.facebook.presto.common.type.VarbinaryType.VARBINARY; import static com.facebook.presto.metadata.MetadataUtil.getConnectorIdOrThrow; @@ -135,7 +134,7 @@ public LogicalPlanner( this.idAllocator = requireNonNull(idAllocator, "idAllocator is null"); this.metadata = requireNonNull(metadata, "metadata is null"); this.variableAllocator = requireNonNull(variableAllocator, "variableAllocator is null"); - this.statisticsAggregationPlanner = new StatisticsAggregationPlanner(this.variableAllocator, metadata.getFunctionAndTypeManager(), session, shouldOptimizerUseHistograms(session)); + this.statisticsAggregationPlanner = new StatisticsAggregationPlanner(this.variableAllocator, metadata.getFunctionAndTypeManager(), session); this.sqlParser = requireNonNull(sqlParser, "sqlParser is null"); } diff --git a/presto-main/src/main/java/com/facebook/presto/sql/planner/StatisticsAggregationPlanner.java b/presto-main/src/main/java/com/facebook/presto/sql/planner/StatisticsAggregationPlanner.java index 01da076c68078..dab0770f4788d 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/planner/StatisticsAggregationPlanner.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/planner/StatisticsAggregationPlanner.java @@ -43,6 +43,7 @@ import java.util.Optional; import java.util.stream.Collectors; +import static com.facebook.presto.SystemSessionProperties.shouldOptimizerUseHistograms; import static com.facebook.presto.common.type.BigintType.BIGINT; import static com.facebook.presto.common.type.UnknownType.UNKNOWN; import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED; @@ -62,13 +63,13 @@ public class StatisticsAggregationPlanner private final Session session; private final FunctionAndTypeManager functionAndTypeManager; - public StatisticsAggregationPlanner(VariableAllocator variableAllocator, FunctionAndTypeManager functionAndTypeManager, Session session, boolean useHistograms) + public StatisticsAggregationPlanner(VariableAllocator variableAllocator, FunctionAndTypeManager functionAndTypeManager, Session session) { this.variableAllocator = requireNonNull(variableAllocator, "variableAllocator is null"); this.session = requireNonNull(session, "session is null"); this.functionAndTypeManager = requireNonNull(functionAndTypeManager, "functionAndTypeManager is null"); this.functionAndTypeResolver = functionAndTypeManager.getFunctionAndTypeResolver(); - this.useHistograms = useHistograms; + this.useHistograms = shouldOptimizerUseHistograms(session); } public TableStatisticAggregation createStatisticsAggregation(TableStatisticsMetadata statisticsMetadata, Map columnToVariableMap) diff --git a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java index 7a7396824e091..801cb6f337732 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java @@ -21,6 +21,7 @@ import com.facebook.presto.common.type.IntegerType; import com.facebook.presto.common.type.RealType; import com.facebook.presto.common.type.SmallintType; +import com.facebook.presto.common.type.SqlTime; import com.facebook.presto.common.type.SqlTimestamp; import com.facebook.presto.common.type.TinyintType; import com.facebook.presto.common.type.Type; @@ -80,6 +81,7 @@ import static com.facebook.presto.common.type.SqlTimestamp.MICROSECONDS_PER_MILLISECOND; import static com.facebook.presto.common.type.StandardTypes.DOUBLE; import static com.facebook.presto.common.type.StandardTypes.VARCHAR; +import static com.facebook.presto.common.type.TimeType.TIME; import static com.facebook.presto.common.type.TimestampType.TIMESTAMP; import static com.facebook.presto.metadata.MetadataUtil.createQualifiedObjectName; import static com.facebook.presto.sql.QueryUtil.aliased; @@ -373,6 +375,9 @@ private Expression toStringLiteral(Type type, double value) if (type.equals(TIMESTAMP)) { return new StringLiteral(new SqlTimestamp(round(value) / MICROSECONDS_PER_MILLISECOND, session.getSqlFunctionProperties().getTimeZoneKey(), MILLISECONDS).toString()); } + if (type.equals(TIME)) { + return new StringLiteral(new SqlTime(round(value)).toString()); + } throw new IllegalArgumentException("Unexpected type: " + type); } } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java b/presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java deleted file mode 100644 index ddccfdfe3c065..0000000000000 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.facebook.presto.cost; - -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import org.testng.annotations.Test; - -import static com.facebook.presto.cost.HistogramCalculator.calculateFilterFactor; -import static java.lang.Double.NEGATIVE_INFINITY; -import static java.lang.Double.NaN; -import static java.lang.Double.POSITIVE_INFINITY; -import static org.testng.Assert.assertEquals; - -public class TestHistogramCalculator -{ - @Test - public void testCalculateFilterFactor() - { - StatisticRange zeroToTen = range(0, 10, 10); - StatisticRange empty = StatisticRange.empty(); - - // Equal ranges - assertFilterFactor(Estimate.of(1.0), zeroToTen, uniformHist(0, 10), 5); - assertFilterFactor(Estimate.of(1.0), zeroToTen, uniformHist(0, 10), 20); - - // Some overlap - assertFilterFactor(Estimate.of(0.5), range(5, 3000, 5), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // Single value overlap - assertFilterFactor(Estimate.of(1.0 / zeroToTen.getDistinctValuesCount()), range(3, 3, 1), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - assertFilterFactor(Estimate.of(1.0 / zeroToTen.getDistinctValuesCount()), range(10, 100, 357), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // No overlap - assertFilterFactor(Estimate.zero(), range(20, 30, 10), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // Empty ranges - assertFilterFactor(Estimate.zero(), zeroToTen, uniformHist(empty), empty.getDistinctValuesCount()); - assertFilterFactor(Estimate.zero(), empty, uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // no test for (empty, empty) since any return value is correct - assertFilterFactor(Estimate.zero(), unboundedRange(10), uniformHist(empty), empty.getDistinctValuesCount()); - assertFilterFactor(Estimate.zero(), empty, uniformHist(unboundedRange(10)), 10); - - // Unbounded (infinite), NDV-based - assertFilterFactor(Estimate.of(0.5), unboundedRange(10), uniformHist(unboundedRange(20)), 20); - assertFilterFactor(Estimate.of(1.0), unboundedRange(20), uniformHist(unboundedRange(10)), 10); - - // NEW TESTS (TPC-H Q2) - // unbounded ranges - assertFilterFactor(Estimate.of(.5), unboundedRange(0.5), uniformHist(unboundedRange(NaN)), NaN); - // unbounded ranges with limited distinct values - assertFilterFactor(Estimate.of(0.2), unboundedRange(1.0), - domainConstrained(unboundedRange(5.0), uniformHist(unboundedRange(7.0))), 5.0); - } - - private static StatisticRange range(double low, double high, double distinctValues) - { - return new StatisticRange(low, high, distinctValues); - } - - private static StatisticRange unboundedRange(double distinctValues) - { - return new StatisticRange(NEGATIVE_INFINITY, POSITIVE_INFINITY, distinctValues); - } - - private static void assertFilterFactor(Estimate expected, StatisticRange range, ConnectorHistogram histogram, double totalDistinctValues) - { - assertEquals( - calculateFilterFactor(range, histogram, Estimate.estimateFromDouble(totalDistinctValues), true), - expected); - } - - private static ConnectorHistogram uniformHist(StatisticRange range) - { - return uniformHist(range.getLow(), range.getHigh()); - } - - private static ConnectorHistogram uniformHist(double low, double high) - { - return new UniformDistributionHistogram(low, high); - } - - private static ConnectorHistogram domainConstrained(StatisticRange range, ConnectorHistogram source) - { - return DisjointRangeDomainHistogram.addDisjunction(source, range); - } -} diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java b/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java index dc2e60bb46e8d..79e47477bc129 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java +++ b/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java @@ -15,6 +15,8 @@ import com.facebook.presto.spi.relation.VariableReferenceExpression; import com.facebook.presto.spi.statistics.ConnectorHistogram; +import com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram; +import com.facebook.presto.spi.statistics.UniformDistributionHistogram; import org.testng.annotations.Test; import java.util.Optional; @@ -374,31 +376,31 @@ public void testAddHistograms() assertEquals(calculator.addStatsAndCollapseDistinctValues(unknownRowCount, unknownRowCount).getVariableStatistics(VARIABLE).getHistogram(), Optional.empty()); // check when rows are available histograms are added properly. - ConnectorHistogram addedSameRange = DisjointRangeDomainHistogram.addDisjunction(unknownNullsFraction.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToTen); + ConnectorHistogram addedSameRange = DisjointRangeDomainHistogram.addDisjunction(unknownNullsFraction.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToTen.toPrestoRange()); assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, calculator::addStatsAndSumDistinctValues, addedSameRange); assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, calculator::addStatsAndCollapseDistinctValues, addedSameRange); assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, calculator::addStatsAndMaxDistinctValues, addedSameRange); assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, calculator::addStatsAndIntersect, addedSameRange); // check when only a sub-range is added, that the histogram still represents the full range - ConnectorHistogram fullRangeFirst = DisjointRangeDomainHistogram.addDisjunction(first.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToTen); - ConnectorHistogram intersectedRangeSecond = DisjointRangeDomainHistogram.addConjunction(first.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToFive); + ConnectorHistogram fullRangeFirst = DisjointRangeDomainHistogram.addDisjunction(first.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToTen.toPrestoRange()); + ConnectorHistogram intersectedRangeSecond = DisjointRangeDomainHistogram.addConjunction(first.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToFive.toPrestoRange()); assertAddStatsHistogram(first, second, calculator::addStatsAndSumDistinctValues, fullRangeFirst); assertAddStatsHistogram(first, second, calculator::addStatsAndCollapseDistinctValues, fullRangeFirst); assertAddStatsHistogram(first, second, calculator::addStatsAndMaxDistinctValues, fullRangeFirst); assertAddStatsHistogram(first, second, calculator::addStatsAndIntersect, intersectedRangeSecond); // check when two ranges overlap, the new stats span both ranges - ConnectorHistogram fullRangeSecondThird = DisjointRangeDomainHistogram.addDisjunction(second.getVariableStatistics(VARIABLE).getHistogram().get(), fiveToTen); - ConnectorHistogram intersectedRangeSecondThird = DisjointRangeDomainHistogram.addConjunction(second.getVariableStatistics(VARIABLE).getHistogram().get(), fiveToTen); + ConnectorHistogram fullRangeSecondThird = DisjointRangeDomainHistogram.addDisjunction(second.getVariableStatistics(VARIABLE).getHistogram().get(), fiveToTen.toPrestoRange()); + ConnectorHistogram intersectedRangeSecondThird = DisjointRangeDomainHistogram.addConjunction(second.getVariableStatistics(VARIABLE).getHistogram().get(), fiveToTen.toPrestoRange()); assertAddStatsHistogram(second, third, calculator::addStatsAndSumDistinctValues, fullRangeSecondThird); assertAddStatsHistogram(second, third, calculator::addStatsAndCollapseDistinctValues, fullRangeSecondThird); assertAddStatsHistogram(second, third, calculator::addStatsAndMaxDistinctValues, fullRangeSecondThird); assertAddStatsHistogram(second, third, calculator::addStatsAndIntersect, intersectedRangeSecondThird); // check when two ranges partially overlap, the addition/intersection is applied correctly - ConnectorHistogram fullRangeThirdFourth = DisjointRangeDomainHistogram.addDisjunction(third.getVariableStatistics(VARIABLE).getHistogram().get(), threeToSeven); - ConnectorHistogram intersectedRangeThirdFourth = DisjointRangeDomainHistogram.addConjunction(third.getVariableStatistics(VARIABLE).getHistogram().get(), threeToSeven); + ConnectorHistogram fullRangeThirdFourth = DisjointRangeDomainHistogram.addDisjunction(third.getVariableStatistics(VARIABLE).getHistogram().get(), threeToSeven.toPrestoRange()); + ConnectorHistogram intersectedRangeThirdFourth = DisjointRangeDomainHistogram.addConjunction(third.getVariableStatistics(VARIABLE).getHistogram().get(), threeToSeven.toPrestoRange()); assertAddStatsHistogram(third, fourth, calculator::addStatsAndSumDistinctValues, fullRangeThirdFourth); assertAddStatsHistogram(third, fourth, calculator::addStatsAndCollapseDistinctValues, fullRangeThirdFourth); assertAddStatsHistogram(third, fourth, calculator::addStatsAndMaxDistinctValues, fullRangeThirdFourth); @@ -419,7 +421,7 @@ private static PlanNodeStatsEstimate statistics(double rowCount, double totalSiz .setNullsFraction(nullsFraction) .setAverageRowSize(averageRowSize) .setStatisticsRange(range) - .setHistogram(Optional.of(DisjointRangeDomainHistogram.addConjunction(new UniformDistributionHistogram(range.getLow(), range.getHigh()), range))) + .setHistogram(Optional.of(DisjointRangeDomainHistogram.addConjunction(new UniformDistributionHistogram(range.getLow(), range.getHigh()), range.toPrestoRange()))) .build()) .build(); } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestVariableStatsEstimate.java b/presto-main/src/test/java/com/facebook/presto/cost/TestVariableStatsEstimate.java index b26665eb30af4..b0f364d1f34e1 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestVariableStatsEstimate.java +++ b/presto-main/src/test/java/com/facebook/presto/cost/TestVariableStatsEstimate.java @@ -15,7 +15,7 @@ package com.facebook.presto.cost; import com.facebook.airlift.json.JsonCodec; -import com.google.common.collect.Range; +import com.facebook.presto.spi.statistics.UniformDistributionHistogram; import org.testng.annotations.Test; import java.util.Optional; @@ -33,7 +33,7 @@ public void testSkipHistogramSerialization() VariableStatsEstimate estimate = VariableStatsEstimate.builder() .setAverageRowSize(100) .setDistinctValuesCount(100) - .setStatisticsRange(StatisticRange.fromRange(Range.open(1.0d, 2.0d))) + .setStatisticsRange(new StatisticRange(55, 65, 100)) .setHistogram(Optional.of(new UniformDistributionHistogram(55, 65))) .setNullsFraction(0.1) .build(); diff --git a/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/ApproximateStatsOutputRowCountMatcher.java b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/ApproximateStatsOutputRowCountMatcher.java new file mode 100644 index 0000000000000..720adc0a0bf5b --- /dev/null +++ b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/ApproximateStatsOutputRowCountMatcher.java @@ -0,0 +1,54 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.sql.planner.assertions; + +import com.facebook.presto.Session; +import com.facebook.presto.cost.StatsProvider; +import com.facebook.presto.metadata.Metadata; +import com.facebook.presto.spi.plan.PlanNode; + +import static com.google.common.base.Verify.verify; + +public class ApproximateStatsOutputRowCountMatcher + implements Matcher +{ + private final double expectedOutputRowCount; + private final double error; + + ApproximateStatsOutputRowCountMatcher(double expectedOutputRowCount, double error) + { + verify(error >= 0.0, "error must be >= 0.0"); + verify(expectedOutputRowCount >= 0.0, "expectedOutputRowCount must be >= 0.0"); + this.expectedOutputRowCount = expectedOutputRowCount; + this.error = error; + } + + @Override + public boolean shapeMatches(PlanNode node) + { + return true; + } + + @Override + public MatchResult detailMatches(PlanNode node, StatsProvider stats, Session session, Metadata metadata, SymbolAliases symbolAliases) + { + return new MatchResult(Math.abs(stats.getStats(node).getOutputRowCount() - expectedOutputRowCount) < error); + } + + @Override + public String toString() + { + return "approximateExpectedOutputRowCount(" + expectedOutputRowCount + ", " + error + ")"; + } +} diff --git a/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/PlanMatchPattern.java b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/PlanMatchPattern.java index e059c42e89a90..d06a36ae81279 100644 --- a/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/PlanMatchPattern.java +++ b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/PlanMatchPattern.java @@ -815,6 +815,12 @@ public PlanMatchPattern withOutputRowCount(boolean exactMatch, String expectedSo return this; } + public PlanMatchPattern withApproximateOutputRowCount(double expectedOutputRowCount, double error) + { + matchers.add(new ApproximateStatsOutputRowCountMatcher(expectedOutputRowCount, error)); + return this; + } + public PlanMatchPattern withOutputSize(double expectedOutputSize) { matchers.add(new StatsOutputSizeMatcher(expectedOutputSize)); diff --git a/presto-spi/pom.xml b/presto-spi/pom.xml index c1099d4e8597c..9a06db72ca290 100644 --- a/presto-spi/pom.xml +++ b/presto-spi/pom.xml @@ -126,5 +126,11 @@ assertj-core test + + + org.apache.commons + commons-math3 + test + diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java index 97b1ac7fd4c1a..255c72835099c 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java @@ -29,6 +29,8 @@ public final class ColumnStatistics private static final long COLUMN_STATISTICS_SIZE = ClassLayout.parseClass(ColumnStatistics.class).instanceSize(); private static final long OPTION_SIZE = ClassLayout.parseClass(Optional.class).instanceSize(); + public static final double INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.25; + public static final double INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.5; private static final ColumnStatistics EMPTY = new ColumnStatistics(Estimate.unknown(), Estimate.unknown(), Estimate.unknown(), Optional.empty(), Optional.empty()); private final Estimate nullsFraction; @@ -224,6 +226,11 @@ public Builder setHistogram(Optional histogram) return this; } + public Optional getHistogram() + { + return histogram; + } + public Builder mergeWith(Builder other) { if (nullsFraction.isUnknown()) { diff --git a/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DisjointRangeDomainHistogram.java similarity index 74% rename from presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java rename to presto-spi/src/main/java/com/facebook/presto/spi/statistics/DisjointRangeDomainHistogram.java index 553cf84d07bf9..2d5f44550b2d7 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DisjointRangeDomainHistogram.java @@ -12,36 +12,33 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; +import com.facebook.presto.common.predicate.Marker; +import com.facebook.presto.common.predicate.Range; +import com.facebook.presto.common.predicate.SortedRangeSet; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; -import com.google.common.base.Suppliers; -import com.google.common.collect.BoundType; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Range; -import com.google.common.collect.RangeSet; -import com.google.common.collect.TreeRangeSet; import org.openjdk.jol.info.ClassLayout; -import java.util.Collection; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; import java.util.NoSuchElementException; import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.function.Supplier; -import static com.facebook.presto.cost.HistogramCalculator.calculateFilterFactor; -import static com.facebook.presto.util.MoreMath.max; -import static com.facebook.presto.util.MoreMath.min; -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.facebook.presto.common.Utils.checkArgument; +import static com.facebook.presto.common.Utils.memoizedSupplier; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static java.lang.Double.NEGATIVE_INFINITY; +import static java.lang.Double.NaN; import static java.lang.Double.POSITIVE_INFINITY; import static java.lang.Double.isFinite; +import static java.lang.Math.max; +import static java.lang.Math.min; import static java.util.Objects.hash; import static java.util.Objects.requireNonNull; @@ -75,31 +72,27 @@ public class DisjointRangeDomainHistogram private final ConnectorHistogram source; // use RangeSet as the internal representation of the ranges, but the constructor arguments // use StatisticRange to support serialization and deserialization. - private final Supplier> rangeSet; - private final Set> ranges; + private final Supplier rangeSet; + private final Set ranges; @JsonCreator - public DisjointRangeDomainHistogram(@JsonProperty("source") ConnectorHistogram source, @JsonProperty("ranges") Collection ranges) - { - this(source, ranges.stream().map(StatisticRange::toRange).collect(toImmutableSet())); - } - - public DisjointRangeDomainHistogram(ConnectorHistogram source, Set> ranges) + public DisjointRangeDomainHistogram(ConnectorHistogram source, Set ranges) { this.source = requireNonNull(source, "source is null"); this.ranges = requireNonNull(ranges, "ranges is null"); - this.rangeSet = Suppliers.memoize(() -> { - RangeSet rangeSet = TreeRangeSet.create(); - rangeSet.addAll(ranges); + this.rangeSet = memoizedSupplier(() -> { + SortedRangeSet rangeSet = SortedRangeSet.copyOf(DOUBLE, new ArrayList<>(ranges)); return rangeSet.subRangeSet(getSourceSpan(this.source)); }); } - private static Range getSourceSpan(ConnectorHistogram source) + private static Range getSourceSpan(ConnectorHistogram source) { - return Range.closed( + return Range.range(DOUBLE, source.inverseCumulativeProbability(0.0).orElse(() -> NEGATIVE_INFINITY), - source.inverseCumulativeProbability(1.0).orElse(() -> POSITIVE_INFINITY)); + true, + source.inverseCumulativeProbability(1.0).orElse(() -> POSITIVE_INFINITY), + true); } @JsonProperty @@ -109,14 +102,14 @@ public ConnectorHistogram getSource() } @JsonProperty - public Set getRanges() + public SortedRangeSet getRanges() { - return rangeSet.get().asRanges().stream().map(StatisticRange::fromRange).collect(toImmutableSet()); + return rangeSet.get(); } public DisjointRangeDomainHistogram(ConnectorHistogram source) { - this(source, ImmutableSet.>of()); + this(source, Collections.emptySet()); } @Override @@ -130,17 +123,22 @@ public Estimate cumulativeProbability(double value, boolean inclusive) if (Double.isNaN(value)) { return Estimate.unknown(); } - Optional> optionalSpan = getSpan(); + Optional optionalSpan = getSpan(); if (!optionalSpan.isPresent()) { return Estimate.of(0.0); } - Range span = optionalSpan.get(); - if (value <= span.lowerEndpoint()) { + Range span = optionalSpan.get(); + if (value <= span.getLowValue().map(Double.class::cast) + .orElse(NEGATIVE_INFINITY)) { return Estimate.of(0.0); } - Range input = Range.range(span.lowerEndpoint(), span.lowerBoundType(), value, inclusive ? BoundType.CLOSED : BoundType.OPEN); + Range input = Range.range(DOUBLE, + span.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY), + span.getLow().getBound() == Marker.Bound.EXACTLY, + value, + inclusive); Estimate fullSetOverlap = calculateRangeSetOverlap(rangeSet.get()); - RangeSet spanned = rangeSet.get().subRangeSet(input); + SortedRangeSet spanned = rangeSet.get().subRangeSet(input); Estimate spannedOverlap = calculateRangeSetOverlap(spanned); return spannedOverlap.flatMap(spannedProbability -> @@ -152,11 +150,11 @@ public Estimate cumulativeProbability(double value, boolean inclusive) })); } - private Estimate calculateRangeSetOverlap(RangeSet ranges) + private Estimate calculateRangeSetOverlap(SortedRangeSet ranges) { // we require knowing bounds on all ranges double cumulativeTotal = 0.0; - for (Range range : ranges.asRanges()) { + for (Range range : ranges.getOrderedRanges()) { Estimate rangeProbability = getRangeProbability(range); if (rangeProbability.isUnknown()) { return Estimate.unknown(); @@ -173,9 +171,9 @@ private Estimate calculateRangeSetOverlap(RangeSet ranges) * @param range the range over the source domain * @return estimate of the total probability the range covers in the source */ - private Estimate getRangeProbability(Range range) + private Estimate getRangeProbability(Range range) { - return calculateFilterFactor(StatisticRange.fromRange(range), source, Estimate.unknown(), false); + return HistogramCalculator.calculateFilterFactor(range, NaN, source, Estimate.unknown(), false); } @Override @@ -189,17 +187,19 @@ public Estimate inverseCumulativeProbability(double percentile) // rangedPercentile = percentile - percentileLow // // percentileLow + (rangedPercentile * rangePercentileLength) - Optional> optionalSpan = getSpan(); + Optional optionalSpan = getSpan(); if (!optionalSpan.isPresent()) { return Estimate.unknown(); } - Range span = optionalSpan.get(); - if (percentile == 0.0 && isFinite(span.lowerEndpoint())) { - return source.inverseCumulativeProbability(0.0).map(sourceMin -> max(span.lowerEndpoint(), sourceMin)); + Range span = optionalSpan.get(); + double lower = span.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY); + double upper = span.getHighValue().map(Double.class::cast).orElse(POSITIVE_INFINITY); + if (percentile == 0.0 && isFinite(lower)) { + return source.inverseCumulativeProbability(0.0).map(sourceMin -> max(lower, sourceMin)); } - if (percentile == 1.0 && isFinite(span.upperEndpoint())) { - return source.inverseCumulativeProbability(1.0).map(sourceMax -> min(span.upperEndpoint(), sourceMax)); + if (percentile == 1.0 && isFinite(upper)) { + return source.inverseCumulativeProbability(1.0).map(sourceMax -> min(upper, sourceMax)); } Estimate totalCumulativeEstimate = calculateRangeSetOverlap(rangeSet.get()); @@ -213,9 +213,9 @@ public Estimate inverseCumulativeProbability(double percentile) } double cumulativeProbabilityNewDomain = 0.0; double lastRangeEstimateSourceDomain = 0.0; - Range currentRange = null; + Range currentRange = null; // find the range where the percentile falls - for (Range range : rangeSet.get().asRanges()) { + for (Range range : rangeSet.get().getOrderedRanges()) { Estimate rangeEstimate = getRangeProbability(range); if (rangeEstimate.isUnknown()) { return Estimate.unknown(); @@ -231,7 +231,8 @@ public Estimate inverseCumulativeProbability(double percentile) // no ranges to iterate over. Did a constraint cut the entire domain of values? return Estimate.unknown(); } - Estimate rangeLeftSourceEstimate = source.cumulativeProbability(currentRange.lowerEndpoint(), currentRange.lowerBoundType() == BoundType.OPEN); + Double currentLow = currentRange.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY); + Estimate rangeLeftSourceEstimate = source.cumulativeProbability(currentLow, !currentRange.isLowInclusive()); if (rangeLeftSourceEstimate.isUnknown()) { return Estimate.unknown(); } @@ -250,12 +251,10 @@ public Estimate inverseCumulativeProbability(double percentile) * @param other the new range to add to the set. * @return a new {@link DisjointRangeDomainHistogram} */ - public DisjointRangeDomainHistogram addDisjunction(StatisticRange other) + public DisjointRangeDomainHistogram addDisjunction(Range other) { - Set> ranges = ImmutableSet.>builder() - .addAll(this.ranges) - .add(other.toRange()) - .build(); + Set ranges = new HashSet<>(this.ranges); + ranges.add(other); return new DisjointRangeDomainHistogram(source, ranges); } @@ -266,9 +265,9 @@ public DisjointRangeDomainHistogram addDisjunction(StatisticRange other) * @param other the range that should enclose the set. * @return a new {@link DisjointRangeDomainHistogram} where */ - public DisjointRangeDomainHistogram addConjunction(StatisticRange other) + public DisjointRangeDomainHistogram addConjunction(Range other) { - return new DisjointRangeDomainHistogram(source, rangeSet.get().subRangeSet(other.toRange()).asRanges()); + return new DisjointRangeDomainHistogram(source, new HashSet<>(rangeSet.get().subRangeSet(other).getOrderedRanges())); } /** @@ -287,17 +286,17 @@ public DisjointRangeDomainHistogram addConjunction(StatisticRange other) * @param range the range representing the conjunction to add * @return a new histogram with the conjunction applied. */ - public static ConnectorHistogram addDisjunction(ConnectorHistogram histogram, StatisticRange range) + public static ConnectorHistogram addDisjunction(ConnectorHistogram histogram, Range range) { if (histogram instanceof DisjointRangeDomainHistogram) { return ((DisjointRangeDomainHistogram) histogram).addDisjunction(range); } - return new DisjointRangeDomainHistogram(histogram, ImmutableSet.of(range.toRange())); + return new DisjointRangeDomainHistogram(histogram, Collections.singleton(range)); } /** - * Similar to {@link #addDisjunction(ConnectorHistogram, StatisticRange)} this method constrains + * Similar to {@link #addDisjunction(ConnectorHistogram, Range)} this method constrains * the entire domain such that all ranges in the set intersect with the given range * argument to this method. *
@@ -308,22 +307,24 @@ public static ConnectorHistogram addDisjunction(ConnectorHistogram histogram, St * @param range the range of values that the entire histogram's domain must fall within * @return a histogram with the new range constraint */ - public static ConnectorHistogram addConjunction(ConnectorHistogram histogram, StatisticRange range) + public static ConnectorHistogram addConjunction(ConnectorHistogram histogram, Range range) { if (histogram instanceof DisjointRangeDomainHistogram) { return ((DisjointRangeDomainHistogram) histogram).addConjunction(range); } - return new DisjointRangeDomainHistogram(histogram, ImmutableSet.of(range.toRange())); + return new DisjointRangeDomainHistogram(histogram, Collections.singleton(range)); } /** * @return the span if it exists, empty otherwise */ - private Optional> getSpan() + private Optional getSpan() { try { - return Optional.of(rangeSet.get().span()); + return Optional.of(rangeSet.get()) + .filter(set -> !set.isNone()) // prevent exception + .map(SortedRangeSet::getSpan); } catch (NoSuchElementException e) { return Optional.empty(); @@ -333,10 +334,10 @@ private Optional> getSpan() @Override public String toString() { - return toStringHelper(this) - .add("source", this.source) - .add("rangeSet", this.rangeSet) - .toString(); + return "DisjointRangeDomainHistogram{" + + "source=" + this.source + ", " + + ", rangeSet" + this.rangeSet.get() + + "}"; } @Override diff --git a/presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/HistogramCalculator.java similarity index 68% rename from presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java rename to presto-spi/src/main/java/com/facebook/presto/spi/statistics/HistogramCalculator.java index 12525b6120ccb..8db65dbc69ffc 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/HistogramCalculator.java @@ -12,12 +12,14 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import com.google.common.math.DoubleMath; +import com.facebook.presto.common.predicate.Range; +import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; +import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; +import static java.lang.Double.NEGATIVE_INFINITY; +import static java.lang.Double.POSITIVE_INFINITY; import static java.lang.Double.isFinite; import static java.lang.Double.isNaN; import static java.lang.Math.min; @@ -43,16 +45,19 @@ private HistogramCalculator() * heuristic would have been used * @return an estimate, x, where 0.0 <= x <= 1.0. */ - public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHistogram histogram, Estimate totalDistinctValues, boolean useHeuristics) + public static Estimate calculateFilterFactor(Range range, double rangeDistinctValues, ConnectorHistogram histogram, Estimate totalDistinctValues, boolean useHeuristics) { - boolean openHigh = range.getOpenHigh(); - boolean openLow = range.getOpenLow(); + boolean openHigh = !range.isHighInclusive(); + boolean openLow = !range.isLowInclusive(); Estimate min = histogram.inverseCumulativeProbability(0.0); Estimate max = histogram.inverseCumulativeProbability(1.0); + double rangeLow = range.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY); + double rangeHigh = range.getHighValue().map(Double.class::cast).orElse(POSITIVE_INFINITY); + double rangeLength = rangeHigh - rangeLow; // range is either above or below histogram - if ((!max.isUnknown() && (openHigh ? max.getValue() <= range.getLow() : max.getValue() < range.getLow())) - || (!min.isUnknown() && (openLow ? min.getValue() >= range.getHigh() : min.getValue() > range.getHigh()))) { + if ((!max.isUnknown() && (openHigh ? max.getValue() <= rangeLow : max.getValue() < rangeLow)) + || (!min.isUnknown() && (openLow ? min.getValue() >= rangeHigh : min.getValue() > rangeHigh))) { return Estimate.of(0.0); } @@ -63,14 +68,14 @@ public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHist return Estimate.unknown(); } - if (range.length() == 0.0) { + if (rangeLength == 0.0) { return totalDistinctValues.map(distinct -> 1.0 / distinct); } - if (isFinite(range.length())) { - return Estimate.of(StatisticRange.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); + if (isFinite(rangeLength)) { + return Estimate.of(INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); } - return Estimate.of(StatisticRange.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); + return Estimate.of(INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); } // we know the bounds are both known, so calculate the percentile for each bound @@ -82,8 +87,8 @@ public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHist // thus for the "lowPercentile" calculation we should pass "false" to be non-inclusive // (same as openness) however, on the high-end we want the inclusivity to be the opposite // of the openness since if it's open, we _don't_ want to include the bound. - Estimate lowPercentile = histogram.cumulativeProbability(range.getLow(), openLow); - Estimate highPercentile = histogram.cumulativeProbability(range.getHigh(), !openHigh); + Estimate lowPercentile = histogram.cumulativeProbability(rangeLow, openLow); + Estimate highPercentile = histogram.cumulativeProbability(rangeHigh, !openHigh); // both bounds are probably infinity, use the infinite-infinite heuristic if (lowPercentile.isUnknown() || highPercentile.isUnknown()) { @@ -91,26 +96,26 @@ public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHist return Estimate.unknown(); } // in the case the histogram has no values - if (totalDistinctValues.equals(Estimate.zero()) || range.getDistinctValuesCount() == 0.0) { + if (totalDistinctValues.equals(Estimate.zero()) || rangeDistinctValues == 0.0) { return Estimate.of(0.0); } // in the case only one is unknown if (((lowPercentile.isUnknown() && !highPercentile.isUnknown()) || (!lowPercentile.isUnknown() && highPercentile.isUnknown())) && - isFinite(range.length())) { - return Estimate.of(StatisticRange.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); + isFinite(rangeLength)) { + return Estimate.of(INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); } - if (range.length() == 0.0) { + if (rangeLength == 0.0) { return totalDistinctValues.map(distinct -> 1.0 / distinct); } - if (!isNaN(range.getDistinctValuesCount())) { - return totalDistinctValues.map(distinct -> min(1.0, range.getDistinctValuesCount() / distinct)); + if (!isNaN(rangeDistinctValues)) { + return totalDistinctValues.map(distinct -> min(1.0, rangeDistinctValues / distinct)); } - return Estimate.of(StatisticRange.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); + return Estimate.of(INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); } // in the case the range is a single value, this can occur if the input @@ -134,15 +139,23 @@ public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHist } return totalDistinctValues.flatMap(totalDistinct -> { - if (DoubleMath.fuzzyEquals(totalDistinct, 0.0, 1E-6)) { + if (fuzzyEquals(totalDistinct, 0.0, 1E-6)) { return Estimate.of(1.0); } - return Estimate.of(min(1.0, range.getDistinctValuesCount() / totalDistinct)); + return Estimate.of(min(1.0, rangeDistinctValues / totalDistinct)); }) - // in the case totalDistinct is NaN or 0 - .or(() -> Estimate.of(StatisticRange.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR)); + // in the case totalDistinct is NaN or 0 + .or(() -> Estimate.of(INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR)); } return lowPercentile.flatMap(lowPercent -> highPercentile.map(highPercent -> highPercent - lowPercent)); } + + private static boolean fuzzyEquals(double a, double b, double tolerance) + { + return Math.copySign(a - b, 1.0) <= tolerance + // copySign(x, 1.0) is a branch-free version of abs(x), but with different NaN semantics + || (a == b) // needed to ensure that infinities equal themselves + || (Double.isNaN(a) && Double.isNaN(b)); + } } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/UniformDistributionHistogram.java similarity index 85% rename from presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java rename to presto-spi/src/main/java/com/facebook/presto/spi/statistics/UniformDistributionHistogram.java index c9cb6bfd19523..a1a95aa974ec3 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/UniformDistributionHistogram.java @@ -12,17 +12,13 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import org.openjdk.jol.info.ClassLayout; -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Verify.verify; +import static com.facebook.presto.common.Utils.checkArgument; import static java.lang.Double.isInfinite; import static java.lang.Double.isNaN; import static java.lang.Math.max; @@ -48,7 +44,7 @@ public UniformDistributionHistogram( @JsonProperty("lowValue") double lowValue, @JsonProperty("highValue") double highValue) { - verify(isNaN(lowValue) || isNaN(highValue) || (lowValue <= highValue), "lowValue must be <= highValue"); + checkArgument(isNaN(lowValue) || isNaN(highValue) || (lowValue <= highValue), "lowValue must be <= highValue"); this.lowValue = lowValue; this.highValue = highValue; } @@ -122,10 +118,10 @@ public long getEstimatedSize() @Override public String toString() { - return toStringHelper(this) - .add("lowValue", lowValue) - .add("highValue", highValue) - .toString(); + return "UniformDistributionHistogram{" + + "lowValue=" + lowValue + + ", highValue=" + highValue + + "}"; } @Override @@ -151,6 +147,6 @@ public int hashCode() private static boolean equalsOrBothNaN(Double first, Double second) { - return first.equals(second) || (Double.isNaN(first) && Double.isNaN(second)); + return first.equals(second) || (isNaN(first) && isNaN(second)); } } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDisjointRangeDomainHistogram.java similarity index 81% rename from presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java rename to presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDisjointRangeDomainHistogram.java index 0c11e729b1fef..6939becf267c0 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDisjointRangeDomainHistogram.java @@ -12,20 +12,20 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Range; import org.apache.commons.math3.distribution.NormalDistribution; import org.apache.commons.math3.distribution.RealDistribution; import org.apache.commons.math3.distribution.UniformRealDistribution; import org.testng.annotations.Test; import java.util.List; -import java.util.stream.Collectors; +import static com.facebook.presto.common.predicate.Range.greaterThanOrEqual; +import static com.facebook.presto.common.predicate.Range.lessThanOrEqual; +import static com.facebook.presto.common.predicate.Range.range; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static org.testng.Assert.assertEquals; public class TestDisjointRangeDomainHistogram @@ -39,9 +39,9 @@ public void testBasicDisjointRanges() { ConnectorHistogram source = new UniformDistributionHistogram(0, 100); ConnectorHistogram constrained = DisjointRangeDomainHistogram - .addDisjunction(source, StatisticRange.fromRange(Range.open(0d, 25d))); + .addDisjunction(source, rangeOpen(0d, 25d)); constrained = DisjointRangeDomainHistogram - .addDisjunction(constrained, StatisticRange.fromRange(Range.open(75d, 100d))); + .addDisjunction(constrained, rangeOpen(75d, 100d)); assertEquals(constrained.inverseCumulativeProbability(0.75).getValue(), 87.5); assertEquals(constrained.inverseCumulativeProbability(0.0).getValue(), 0.0); assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 100); @@ -59,7 +59,7 @@ public void testSingleDisjointRange() // no overlap, left bound ConnectorHistogram constrained = DisjointRangeDomainHistogram - .addDisjunction(source, StatisticRange.fromRange(Range.open(-10d, -5d))); + .addDisjunction(source, rangeOpen(-10d, -5d)); for (int i = -11; i < 12; i++) { assertEquals(constrained.cumulativeProbability(i, true).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(i, false).getValue(), 0.0, 1E-8); @@ -68,7 +68,7 @@ public void testSingleDisjointRange() assertEquals(constrained.inverseCumulativeProbability(1.0), Estimate.unknown()); // partial overlap left bound - constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(-2d, 2d))); + constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(rangeOpen(-2d, 2d))); assertEquals(constrained.cumulativeProbability(-3, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(-1, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(0, false).getValue(), 0.0, 1E-8); @@ -82,7 +82,7 @@ public void testSingleDisjointRange() assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 2d, 1E-8); //full overlap - constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(3d, 4d))); + constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(rangeOpen(3d, 4d))); assertEquals(constrained.cumulativeProbability(-3, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(0, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(1, false).getValue(), 0.0, 1E-8); @@ -96,7 +96,7 @@ public void testSingleDisjointRange() assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 4d, 1E-8); //right side overlap - constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(8d, 12d))); + constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(rangeOpen(8d, 12d))); assertEquals(constrained.cumulativeProbability(-3, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(0, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(5, false).getValue(), 0.0, 1E-8); @@ -114,7 +114,7 @@ public void testSingleDisjointRange() // no overlap, right bound constrained = DisjointRangeDomainHistogram - .addDisjunction(source, StatisticRange.fromRange(Range.open(15d, 20d))); + .addDisjunction(source, rangeOpen(15d, 20d)); for (int i = 15; i < 20; i++) { assertEquals(constrained.cumulativeProbability(i, true).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(i, false).getValue(), 0.0, 1E-8); @@ -132,8 +132,8 @@ public void testMultipleDisjunction() { StandardNormalHistogram source = new StandardNormalHistogram(); RealDistribution dist = source.getDistribution(); - ConnectorHistogram constrained = disjunction(source, Range.closed(-2d, -1d)); - constrained = disjunction(constrained, Range.closed(1d, 2d)); + ConnectorHistogram constrained = disjunction(source, rangeClosed(-2d, -1d)); + constrained = disjunction(constrained, rangeClosed(1d, 2d)); double rangeLeftProb = dist.cumulativeProbability(-1) - dist.cumulativeProbability(-2); double rangeRightProb = dist.cumulativeProbability(2) - dist.cumulativeProbability(1); double sumRangeProb = rangeLeftProb + rangeRightProb; @@ -156,7 +156,7 @@ public void testNormalDistribution() // standard normal StandardNormalHistogram source = new StandardNormalHistogram(); RealDistribution dist = source.getDistribution(); - ConnectorHistogram constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(-1d, 1d))); + ConnectorHistogram constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(rangeOpen(-1d, 1d))); assertEquals(constrained.cumulativeProbability(-1.0, true).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(0.0, true).getValue(), 0.5, 1E-8); assertEquals(constrained.cumulativeProbability(1.0, true).getValue(), 1.0, 1E-8); @@ -179,16 +179,16 @@ public void testNormalDistribution() public void testAddDisjunction() { ConnectorHistogram source = new UniformDistributionHistogram(0, 100); - DisjointRangeDomainHistogram constrained = disjunction(source, Range.open(-1d, 2d)); - assertEquals(constrained.getRanges().size(), 1); - assertEquals(ranges(constrained).get(0), Range.closedOpen(0d, 2d)); - constrained = disjunction(constrained, Range.open(1d, 10d)); + DisjointRangeDomainHistogram constrained = disjunction(source, rangeOpen(-1d, 2d)); + assertEquals(constrained.getRanges().getOrderedRanges().size(), 1); + assertEquals(ranges(constrained).get(0), range(DOUBLE, 0d, true, 2d, false)); + constrained = disjunction(constrained, rangeOpen(1d, 10d)); assertEquals(ranges(constrained).size(), 1); - assertEquals(ranges(constrained).get(0), Range.closedOpen(0d, 10d)); - constrained = disjunction(constrained, Range.closedOpen(50d, 100d)); + assertEquals(ranges(constrained).get(0), range(DOUBLE, 0d, true, 10d, false)); + constrained = disjunction(constrained, range(DOUBLE, 50d, true, 100d, false)); assertEquals(ranges(constrained).size(), 2); - assertEquals(ranges(constrained).get(0), Range.closedOpen(0d, 10d)); - assertEquals(ranges(constrained).get(1), Range.closedOpen(50d, 100d)); + assertEquals(ranges(constrained).get(0), range(DOUBLE, 0d, true, 10d, false)); + assertEquals(ranges(constrained).get(1), range(DOUBLE, 50d, true, 100d, false)); } /** @@ -198,30 +198,40 @@ public void testAddDisjunction() public void testAddConjunction() { ConnectorHistogram source = new UniformDistributionHistogram(0, 100); - DisjointRangeDomainHistogram constrained = disjunction(source, Range.open(10d, 90d)); - assertEquals(constrained.getRanges().size(), 1); - assertEquals(ranges(constrained).get(0), Range.open(10d, 90d)); - constrained = conjunction(constrained, Range.atMost(50d)); + DisjointRangeDomainHistogram constrained = disjunction(source, rangeOpen(10d, 90d)); + assertEquals(constrained.getRanges().getOrderedRanges().size(), 1); + assertEquals(ranges(constrained).get(0), rangeOpen(10d, 90d)); + constrained = conjunction(constrained, lessThanOrEqual(DOUBLE, 50d)); assertEquals(ranges(constrained).size(), 1); - assertEquals(ranges(constrained).get(0), Range.openClosed(10d, 50d)); - constrained = conjunction(constrained, Range.atLeast(25d)); + assertEquals(ranges(constrained).get(0), range(DOUBLE, 10d, false, 50d, true)); + constrained = conjunction(constrained, greaterThanOrEqual(DOUBLE, 25d)); assertEquals(ranges(constrained).size(), 1); - assertEquals(ranges(constrained).get(0), Range.closed(25d, 50d)); + assertEquals(ranges(constrained).get(0), rangeClosed(25d, 50d)); } - private static DisjointRangeDomainHistogram disjunction(ConnectorHistogram source, Range range) + private static DisjointRangeDomainHistogram disjunction(ConnectorHistogram source, com.facebook.presto.common.predicate.Range range) { - return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addDisjunction(source, StatisticRange.fromRange(range)); + return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addDisjunction(source, range); } - private static DisjointRangeDomainHistogram conjunction(ConnectorHistogram source, Range range) + private static DisjointRangeDomainHistogram conjunction(ConnectorHistogram source, com.facebook.presto.common.predicate.Range range) { - return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addConjunction(source, StatisticRange.fromRange(range)); + return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addConjunction(source, range); } - private static List> ranges(DisjointRangeDomainHistogram hist) + private static List ranges(DisjointRangeDomainHistogram hist) { - return hist.getRanges().stream().map(StatisticRange::toRange).collect(Collectors.toList()); + return hist.getRanges().getOrderedRanges(); + } + + private static com.facebook.presto.common.predicate.Range rangeOpen(double low, double high) + { + return range(DOUBLE, low, false, high, false); + } + + private static com.facebook.presto.common.predicate.Range rangeClosed(double low, double high) + { + return range(DOUBLE, low, true, high, true); } private static class StandardNormalHistogram @@ -269,7 +279,7 @@ ConnectorHistogram createHistogram() return new DisjointRangeDomainHistogram( new UniformDistributionHistogram( distribution.getSupportLowerBound(), distribution.getSupportUpperBound())) - .addDisjunction(new StatisticRange(0.0, 100.0, 0.0)); + .addDisjunction(rangeClosed(0.0, 100.0)); } @Override diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogram.java similarity index 97% rename from presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java rename to presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogram.java index 26c68b7e5730e..341870d138bdc 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogram.java @@ -12,9 +12,8 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; import org.apache.commons.math3.distribution.RealDistribution; import org.testng.annotations.Test; diff --git a/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogramCalculator.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogramCalculator.java new file mode 100644 index 0000000000000..0632e14247b1b --- /dev/null +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogramCalculator.java @@ -0,0 +1,101 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.facebook.presto.spi.statistics; + +import com.facebook.presto.common.predicate.Range; +import org.testng.annotations.Test; + +import static com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.spi.statistics.HistogramCalculator.calculateFilterFactor; +import static java.lang.Double.NEGATIVE_INFINITY; +import static java.lang.Double.NaN; +import static java.lang.Double.POSITIVE_INFINITY; +import static org.testng.Assert.assertEquals; + +public class TestHistogramCalculator +{ + @Test + public void testCalculateFilterFactor() + { + Range zeroToTen = range(0, 10); + Range empty = Range.range(DOUBLE, NaN, true, NaN, true); + + // Equal ranges + assertFilterFactor(Estimate.of(1.0), zeroToTen, 10, uniformHist(0, 10), 5); + assertFilterFactor(Estimate.of(1.0), zeroToTen, 10, uniformHist(0, 10), 20); + + // Some overlap + assertFilterFactor(Estimate.of(0.5), range(5, 3000), 5, uniformHist(zeroToTen), 10); + + // Single value overlap + assertFilterFactor(Estimate.of(1.0 / 10), range(3, 3), 1, uniformHist(zeroToTen), 10); + assertFilterFactor(Estimate.of(1.0 / 10), range(10, 100), 357, uniformHist(zeroToTen), 10); + + // No overlap + assertFilterFactor(Estimate.zero(), range(20, 30), 10, uniformHist(zeroToTen), 10); + + // Empty ranges + assertFilterFactor(Estimate.zero(), zeroToTen, 10, uniformHist(empty), 0); + assertFilterFactor(Estimate.zero(), empty, 0, uniformHist(zeroToTen), 10); + + // no test for (empty, empty) since any return value is correct + assertFilterFactor(Estimate.zero(), unboundedRange(), 10, uniformHist(empty), 0); + assertFilterFactor(Estimate.zero(), empty, 0, uniformHist(unboundedRange()), 10); + + // Unbounded (infinite), NDV-based + assertFilterFactor(Estimate.of(0.5), unboundedRange(), 10, uniformHist(unboundedRange()), 20); + assertFilterFactor(Estimate.of(1.0), unboundedRange(), 20, uniformHist(unboundedRange()), 10); + + // NEW TESTS (TPC-H Q2) + // unbounded ranges + assertFilterFactor(Estimate.of(.5), unboundedRange(), 0.5, uniformHist(unboundedRange()), NaN); + // unbounded ranges with limited distinct values + assertFilterFactor(Estimate.of(0.2), unboundedRange(), 1.0, + domainConstrained(unboundedRange(), uniformHist(unboundedRange())), 5.0); + } + + private static Range range(double low, double high) + { + return Range.range(DOUBLE, low, true, high, true); + } + + private static Range unboundedRange() + { + return Range.all(DOUBLE); + } + + private static void assertFilterFactor(Estimate expected, Range range, double distinctValues, ConnectorHistogram histogram, double totalDistinctValues) + { + assertEquals( + calculateFilterFactor(range, distinctValues, histogram, Estimate.estimateFromDouble(totalDistinctValues), true), + expected); + } + + private static ConnectorHistogram uniformHist(Range range) + { + return uniformHist(range.getLow().getObjectValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY), + range.getHigh().getObjectValue().map(Double.class::cast).orElse(POSITIVE_INFINITY)); + } + + private static ConnectorHistogram uniformHist(double low, double high) + { + return new UniformDistributionHistogram(low, high); + } + + private static ConnectorHistogram domainConstrained(Range range, ConnectorHistogram source) + { + return DisjointRangeDomainHistogram.addDisjunction(source, range); + } +} diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestUniformHistogram.java similarity index 93% rename from presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java rename to presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestUniformHistogram.java index 395bc3f6e7518..e1d3dc0b6f162 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestUniformHistogram.java @@ -12,11 +12,8 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import com.google.common.base.VerifyException; import org.apache.commons.math3.distribution.RealDistribution; import org.apache.commons.math3.distribution.UniformRealDistribution; import org.testng.annotations.Test; @@ -48,7 +45,7 @@ RealDistribution getDistribution() @Test public void testInvalidConstruction() { - assertThrows(VerifyException.class, () -> new UniformDistributionHistogram(2.0, 1.0)); + assertThrows(IllegalArgumentException.class, () -> new UniformDistributionHistogram(2.0, 1.0)); } @Test