From 4fceaa0add5b67e5460e58f278326be2e459ae49 Mon Sep 17 00:00:00 2001 From: Jibing-Li <64681310+Jibing-Li@users.noreply.github.com> Date: Fri, 18 Oct 2024 18:21:35 +0800 Subject: [PATCH] [fix](statistics)Skip analyze if the collected info is invalid. (#42028) (#42088) backport: https://github.com/apache/doris/pull/42028 --- .../doris/statistics/BaseAnalysisTask.java | 5 ++ .../apache/doris/statistics/ColStatsData.java | 21 +++++ .../statistics/BaseAnalysisTaskTest.java | 45 ++++++++++ .../doris/statistics/ColStatsDataTest.java | 87 +++++++++++++++++++ 4 files changed, 158 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 7d637d97e0638d..f126a9d8dc6861 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -484,6 +484,11 @@ protected void runQuery(String sql) { try (AutoCloseConnectContext a = StatisticsUtil.buildConnectContext(false)) { stmtExecutor = new StmtExecutor(a.connectContext, sql); ColStatsData colStatsData = new ColStatsData(stmtExecutor.executeInternalQuery().get(0)); + if (!colStatsData.isValid()) { + String message = String.format("ColStatsData is invalid, skip analyzing. %s", colStatsData.toSQL(true)); + LOG.warn(message); + throw new RuntimeException(message); + } // Update index row count after analyze. if (this instanceof OlapAnalysisTask) { AnalysisInfo jobInfo = Env.getCurrentEnv().getAnalysisManager().findJobInfo(job.getJobInfo().jobId); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java index 7cf75462fee0ad..bb8263994583fd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java @@ -186,4 +186,25 @@ public ColumnStatistic toColumnStatistic() { return ColumnStatistic.UNKNOWN; } } + + public boolean isNull(String value) { + // Checking "NULL" as null is a historical bug which treat literal value "NULL" as null. Will fix it soon. + return value == null || value.equalsIgnoreCase("NULL"); + } + + public boolean isValid() { + if (ndv > 10 * count) { + LOG.debug("Ndv {} is much larger than count {}", ndv, count); + return false; + } + if (ndv == 0 && (!isNull(minLit) || !isNull(maxLit))) { + LOG.debug("Ndv is 0 but min or max exists"); + return false; + } + if (count > 0 && ndv == 0 && isNull(minLit) && isNull(maxLit) && (nullCount == 0 || count > nullCount * 10)) { + LOG.debug("count {} not 0, ndv is 0, min and max are all null, null count {} is too small", count, count); + return false; + } + return true; + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java index 86ccfff26e0375..de2cf522366373 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java @@ -20,10 +20,16 @@ import org.apache.doris.analysis.TableSample; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.PrimitiveType; +import org.apache.doris.qe.StmtExecutor; +import com.google.common.collect.Lists; +import mockit.Mock; +import mockit.MockUp; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.util.List; + public class BaseAnalysisTaskTest { @Test @@ -60,4 +66,43 @@ public void testGetFunctions() { System.out.println(ndvFunction); } + @Test + public void testInvalidColStats() { + List values = Lists.newArrayList(); + values.add("id"); + values.add("10000"); + values.add("20000"); + values.add("30000"); + values.add("0"); + values.add("col"); + values.add(null); + values.add("100"); // count + values.add("1100"); // ndv + values.add("300"); // null + values.add("min"); + values.add("max"); + values.add("400"); + values.add("500"); + ResultRow row = new ResultRow(values); + List result = Lists.newArrayList(); + result.add(row); + + new MockUp() { + @Mock + public List executeInternalQuery() { + return result; + } + }; + BaseAnalysisTask task = new OlapAnalysisTask(); + try { + task.runQuery("test"); + } catch (Exception e) { + Assertions.assertEquals(e.getMessage(), + "ColStatsData is invalid, skip analyzing. " + + "('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500')"); + return; + } + Assertions.fail(); + } + } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java index 8743105a6444f7..a7c843701531cb 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java @@ -164,4 +164,91 @@ public void testToColumnStatisticNormal(@Mocked StatisticsUtil mockedClass) { Assertions.assertEquals(400, columnStatistic.dataSize); Assertions.assertEquals("500", columnStatistic.updatedTime); } + + @Test + public void testIsNull() { + ColStatsData stats = new ColStatsData(); + Assertions.assertTrue(stats.isNull(null)); + Assertions.assertTrue(stats.isNull("null")); + Assertions.assertTrue(stats.isNull("NuLl")); + Assertions.assertFalse(stats.isNull("")); + Assertions.assertFalse(stats.isNull(" ")); + Assertions.assertFalse(stats.isNull("123")); + } + + @Test + public void testIsValid() { + List values = Lists.newArrayList(); + values.add("id"); + values.add("10000"); + values.add("20000"); + values.add("30000"); + values.add("0"); + values.add("col"); + values.add(null); + values.add("100"); // count + values.add("1100"); // ndv + values.add("300"); // null + values.add("min"); + values.add("max"); + values.add("400"); + values.add("500"); + ResultRow row = new ResultRow(values); + ColStatsData data = new ColStatsData(row); + Assertions.assertFalse(data.isValid()); + + // Set count = 200 + values.set(7, "200"); + row = new ResultRow(values); + data = new ColStatsData(row); + Assertions.assertTrue(data.isValid()); + + // Set ndv = 0, min/max is not null + values.set(8, "0"); + row = new ResultRow(values); + data = new ColStatsData(row); + Assertions.assertFalse(data.isValid()); + + // Set min to null, min/max is not null + values.set(10, null); + row = new ResultRow(values); + data = new ColStatsData(row); + Assertions.assertFalse(data.isValid()); + + // Set max to null, min/max is not null + values.set(11, null); + row = new ResultRow(values); + data = new ColStatsData(row); + Assertions.assertTrue(data.isValid()); + + // Set min to not null, min/max is not null + values.set(10, "min"); + row = new ResultRow(values); + data = new ColStatsData(row); + Assertions.assertFalse(data.isValid()); + + // Set min and max to null, nullNum = 0 + values.set(9, "0"); + values.set(10, "nuLl"); + values.set(11, null); + row = new ResultRow(values); + data = new ColStatsData(row); + Assertions.assertFalse(data.isValid()); + + // nullNum = 19, count = 200 + values.set(9, "19"); + row = new ResultRow(values); + data = new ColStatsData(row); + Assertions.assertFalse(data.isValid()); + + // nullNum = 21, count = 200, so count < nullNum * 10 + values.set(9, "21"); + row = new ResultRow(values); + data = new ColStatsData(row); + Assertions.assertTrue(data.isValid()); + + // Empty table stats is valid. + data = new ColStatsData(); + Assertions.assertTrue(data.isValid()); + } }