diff --git a/presto-analyzer/src/main/java/com/facebook/presto/sql/analyzer/RelationId.java b/presto-analyzer/src/main/java/com/facebook/presto/sql/analyzer/RelationId.java index bdac0594f242d..b4ff1ad5f3daf 100644 --- a/presto-analyzer/src/main/java/com/facebook/presto/sql/analyzer/RelationId.java +++ b/presto-analyzer/src/main/java/com/facebook/presto/sql/analyzer/RelationId.java @@ -53,6 +53,11 @@ public boolean isAnonymous() return sourceNode == null; } + public Node getSourceNode() + { + return sourceNode; + } + @Override public boolean equals(Object o) { diff --git a/presto-main/src/main/java/com/facebook/presto/sql/analyzer/Analyzer.java b/presto-main/src/main/java/com/facebook/presto/sql/analyzer/Analyzer.java index 742e68453174a..35d1793e3e500 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/analyzer/Analyzer.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/analyzer/Analyzer.java @@ -115,7 +115,7 @@ public Analysis analyzeSemantic(Statement statement, boolean isDescribe) metadataExtractor.populateMetadataHandle(session, rewrittenStatement, analysis.getMetadataHandle()); StatementAnalyzer analyzer = new StatementAnalyzer(analysis, metadata, sqlParser, accessControl, session, warningCollector); analyzer.analyze(rewrittenStatement, Optional.empty()); - analyzeForUtilizedColumns(analysis, analysis.getStatement()); + analyzeForUtilizedColumns(analysis, analysis.getStatement(), warningCollector); analysis.populateTableColumnAndSubfieldReferencesForAccessControl(isCheckAccessControlOnUtilizedColumnsOnly(session), isCheckAccessControlWithSubfields(session)); return analysis; } diff --git a/presto-main/src/main/java/com/facebook/presto/sql/analyzer/UtilizedColumnsAnalyzer.java b/presto-main/src/main/java/com/facebook/presto/sql/analyzer/UtilizedColumnsAnalyzer.java index 895145bff7975..965396ecd390f 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/analyzer/UtilizedColumnsAnalyzer.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/analyzer/UtilizedColumnsAnalyzer.java @@ -15,6 +15,8 @@ import com.facebook.airlift.log.Logger; import com.facebook.presto.common.QualifiedObjectName; +import com.facebook.presto.spi.PrestoWarning; +import com.facebook.presto.spi.WarningCollector; import com.facebook.presto.spi.analyzer.AccessControlInfo; import com.facebook.presto.sql.tree.AliasedRelation; import com.facebook.presto.sql.tree.Cube; @@ -37,6 +39,7 @@ import com.facebook.presto.sql.tree.Lateral; import com.facebook.presto.sql.tree.Node; import com.facebook.presto.sql.tree.NodeRef; +import com.facebook.presto.sql.tree.QualifiedName; import com.facebook.presto.sql.tree.Query; import com.facebook.presto.sql.tree.QuerySpecification; import com.facebook.presto.sql.tree.Relation; @@ -48,7 +51,10 @@ import com.facebook.presto.sql.tree.Union; import com.facebook.presto.sql.tree.Unnest; import com.facebook.presto.sql.tree.Values; +import com.facebook.presto.sql.tree.With; +import com.facebook.presto.sql.tree.WithQuery; import com.google.common.collect.HashMultimap; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import java.util.HashMap; @@ -57,28 +63,29 @@ import java.util.Map.Entry; import java.util.Set; +import static com.facebook.presto.spi.StandardWarningCode.UTILIZED_COLUMN_ANALYSIS_FAILED; +import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.Sets.intersection; -import static java.lang.String.format; /** * Finds all utilized columns in the query. Utilized columns are those that would have an "impact" on the query's results. - * + *

* For example, in the query: - * SELECT nationkey FROM (SELECT * FROM nation WHERE name = 'USA') + * SELECT nationkey FROM (SELECT * FROM nation WHERE name = 'USA') * Even though all the columns in table nation are referenced by the query (in the SELECT * part), only the columns * "name" and "nationkey" have an "impact" on the query's results. - * + *

* The high-level algorithm works as follows: * 1. Find all fields referenced in all clauses of the outermost SELECT query, and add them to an explore list. * 2. For each field reference F in the explore list, find its referenced relation R. * 3. If R is a SELECT query: - * a. Find the SELECT item expression that F references. Add all fields referenced by that expression to the explore list. - * b. Add all fields referenced by every other clause of the SELECT query to the explore list. + * a. Find the SELECT item expression that F references. Add all fields referenced by that expression to the explore list. + * b. Add all fields referenced by every other clause of the SELECT query to the explore list. * 4. Otherwise, - * a. Add F's referenced field to a referenced fields list. - * b. For each child of R, find the corresponding child of F, and add it to the explore list. + * a. Add F's referenced field to a referenced fields list. + * b. For each child of R, find the corresponding child of F, and add it to the explore list. * 5. Repeat from step 2 for all fields in the explore list, until all have been resolved to a base table relation. - * + *

* The referenced fields list at the end of this algorithm will contain all the columns referenced by the query, that impact the output. * Step 3a is where fields that do not impact the output are pruned. */ @@ -88,14 +95,16 @@ public class UtilizedColumnsAnalyzer private final Analysis analysis; - public static void analyzeForUtilizedColumns(Analysis analysis, Node node) + public static void analyzeForUtilizedColumns(Analysis analysis, Node node, WarningCollector warningCollector) { UtilizedColumnsAnalyzer analyzer = new UtilizedColumnsAnalyzer(analysis); try { analyzer.analyze(node); } catch (Exception e) { - LOG.debug(e, format("Error in analyzing utilized columns, falling back to access control on all columns: %s", analysis.getStatement())); + warningCollector.add(new PrestoWarning( + UTILIZED_COLUMN_ANALYSIS_FAILED, + "Error in analyzing utilized columns for access control, falling back to checking access on all columns: " + e.getMessage())); analysis.getTableColumnReferences().forEach(analysis::addUtilizedTableColumnReferences); } } @@ -271,6 +280,23 @@ protected Void visitQuerySpecification(QuerySpecification querySpec, Context con return null; } + @Override + protected Void visitWith(With node, Context context) + { + ImmutableList.copyOf(node.getQueries()).reverse().forEach(query -> process(query, context)); + + return null; + } + + @Override + protected Void visitWithQuery(WithQuery withQuery, Context context) + { + context.copyFieldIdsToExploreForWithQuery(withQuery); + process(withQuery.getQuery(), context); + + return null; + } + @Override protected Void visitSampledRelation(SampledRelation sampledRelation, Context context) { @@ -493,5 +519,22 @@ private void addFieldIdToExplore(FieldId fieldId) { fieldsToExplore.put(fieldId.getRelationId(), fieldId); } + + // Associate the relation from the with clause with the fieldIdsToExplore that we collected for it + // when processing the main part of the query + public void copyFieldIdsToExploreForWithQuery(WithQuery withQuery) + { + QualifiedName name = QualifiedName.of(withQuery.getName().getValue()); + List relationIds = fieldsToExplore.keySet().stream() + .filter(key -> key.getSourceNode() instanceof Table && ((Table) key.getSourceNode()).getName().equals(name)) + .collect(toImmutableList()); + // if a cte is used more than once, it will be listed multiple times in the fieldIds to explore + // if multiple ctes have the same name, this will also be captured here. These will fail later if the columns used don't exist in the tables + for (RelationId relationId : relationIds) { + fieldsToExplore.putAll( + RelationId.of(withQuery.getQuery().getQueryBody()), + fieldsToExplore.get(relationId)); + } + } } } diff --git a/presto-main/src/test/java/com/facebook/presto/sql/analyzer/TestUtilizedColumnsAnalyzer.java b/presto-main/src/test/java/com/facebook/presto/sql/analyzer/TestUtilizedColumnsAnalyzer.java index a18b45b2d09ed..97526fb43c6e0 100644 --- a/presto-main/src/test/java/com/facebook/presto/sql/analyzer/TestUtilizedColumnsAnalyzer.java +++ b/presto-main/src/test/java/com/facebook/presto/sql/analyzer/TestUtilizedColumnsAnalyzer.java @@ -519,6 +519,82 @@ public void testUDF() ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t1"), ImmutableSet.of("a"))); } + @Test + public void testCteWithExpressionInSelect() + { + assertUtilizedTableColumns( + "with cte as (select x as c1, y as c2, z + 1 as c3 from t13) select c1, c3 from (select * from cte)", + ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "z"))); + } + + @Test + public void testMultipleCtes() + { + assertUtilizedTableColumns( + "with cte1 as (select x as c1, y as c2, z + 1 as c3 from t13), cte2 as (select c1 + 1 as a, c3 as b from cte1) select a, b from (select * from cte2)", + ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "z"))); + assertUtilizedTableColumns( + "with cte1 as (select x as c1, z + 1 as c2, y from t13), cte2 as (select c1 + 1 c3, c2 +1 as c4 from cte1) select c3 from cte2", + ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x"))); + } + + @Test + public void testMultipleCtesUsingSameTable() + { + assertUtilizedTableColumns( + "with cte1 as (select x + 1 as c1, y as c2, z + 1 as c3 from t13), cte2 as (with cte1 AS (select y +1 as c1 from t13) select * from cte1) SELECT cte1.c1, cte2.c1 from cte1 join cte2 on cte1.c1=cte2.c1", + ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "y"))); + } + + @Test + public void testMultipleCtesSameNameSameTable() + { + // TODO(kevintang2022): Fix visitWithQuery so that it looks up relation properly. + // Currently, it just looks the relations using the name (string) of the CTE + assertUtilizedTableColumns( + "with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte1 as (select x + 1 as c2, y + 1 as c3, z as c4 from t13) select * from cte1) select cte1.c1, cte2.c3 from cte1 join cte2 on true", + ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "y", "z"))); + } + + @Test + public void testMultipleCtesSameNameDifferentTable() + { + // This happens because the two CTE1s having the same name, and the CTE1 of t13 has its first column utilized, so the CTE1 of t1 will also have its first column added. + assertUtilizedTableColumns( + "with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte1 as (select a + 1 as c2, b + 1 as c3, c as c4 from t1) select * from cte1) select cte1.c1 from cte1 join cte2 on true", + ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x"), + QualifiedObjectName.valueOf("tpch.s1.t1"), ImmutableSet.of("a"))); + } + + @Test + public void testMultipleCtesDifferentNameSameTable() + { + assertUtilizedTableColumns( + "with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte3 as (select x + 1 as c2, y + 1 as c3, z as c4 from t13) select * from cte3) select cte1.c1, cte2.c3 from cte1 join cte2 on true", + ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "y"))); + } + + @Test + public void testMultipleCtesDifferentNameDifferentTable() + { + // This is the same behavior for join queries like "select t1.a from t1 join t13 on true" regardless of wrapping in CTE + assertUtilizedTableColumns( + "with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte3 as (select a + 1 as c2, b + 1 as c3, c as c4 from t1) select * from cte3) select cte1.c1 from cte1 join cte2 on true", + ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x"), + QualifiedObjectName.valueOf("tpch.s1.t1"), ImmutableSet.of())); + } + + @Test + public void testMultipleCtesSameNameDifferentTableMultipleColumnsUtilized() + { + // This will fallback to checking all utilized columns on t1 because CTE1 of t13 only has one item in the select expression, but CTE1 of t1 has 3, so there's a mismatch. + // It's trying to add the third column of CTE1 of t1 but it does not exist. + assertUtilizedTableColumns( + "with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte1 as (select a + 1 as c2, b + 1 as c3, c as c4 from t1) select * from cte1) select cte1.c1, cte2.c4 from cte1 join cte2 on true", + ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x"), + QualifiedObjectName.valueOf("tpch.s1.t1"), ImmutableSet.of("a", "b", "c"))); + } + private void assertUtilizedTableColumns(@Language("SQL") String query, Map> expected) { transaction(transactionManager, accessControl) diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/StandardWarningCode.java b/presto-spi/src/main/java/com/facebook/presto/spi/StandardWarningCode.java index 1213d11080f10..7768c980953e9 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/StandardWarningCode.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/StandardWarningCode.java @@ -26,7 +26,8 @@ public enum StandardWarningCode DEFAULT_SAMPLE_FUNCTION(0x0000_0008), SAMPLED_FIELDS(0x0000_0009), MULTIPLE_TABLE_METADATA(0x0000_0010), - SORT_COLUMN_TRANSFORM_NOT_SUPPORTED_WARNING(0x0000_0011) + SORT_COLUMN_TRANSFORM_NOT_SUPPORTED_WARNING(0x0000_0011), + UTILIZED_COLUMN_ANALYSIS_FAILED(0x0000_0012), /**/; private final WarningCode warningCode;