Skip to content

Commit 95845fb

Browse files
committed
Fix check_access_control_on_utilized_columns_only for CTE
Previously we weren't associating the with query with the columns used from it in the main part of the query, so we wouldn't consider any columns as used. When the usage within the cte or outside of it was just an identifier, we would have the utilized column collected anyway because the column collected in the main query kept the source table information. However, if there was an expression in between, we would lose that information, and we wouldn't check access control for the column that was used in an expression in the cte. Example: For the following query ``` with cte as (select x as c1, z + 1 as c2 from t13) select c1, c2 from (select * from cte) ``` Previously we would only check access permissions on t13.x, but not t13.z. With this change we will check column access on both 13.x and t13.z. Fix empty relations case Add error message to utilized columns warning. handle ctes used multiple times fixup >1 reference support Add tests for multiple cte with same name Co-authored-by: Kevin Tang <tangk@meta.com> Add comments clarification and test for multiple same name for multiple columns
1 parent 144105d commit 95845fb

File tree

5 files changed

+138
-13
lines changed

5 files changed

+138
-13
lines changed

presto-analyzer/src/main/java/com/facebook/presto/sql/analyzer/RelationId.java

+5
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@ public boolean isAnonymous()
5353
return sourceNode == null;
5454
}
5555

56+
public Node getSourceNode()
57+
{
58+
return sourceNode;
59+
}
60+
5661
@Override
5762
public boolean equals(Object o)
5863
{

presto-main/src/main/java/com/facebook/presto/sql/analyzer/Analyzer.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ public Analysis analyzeSemantic(Statement statement, boolean isDescribe)
115115
metadataExtractor.populateMetadataHandle(session, rewrittenStatement, analysis.getMetadataHandle());
116116
StatementAnalyzer analyzer = new StatementAnalyzer(analysis, metadata, sqlParser, accessControl, session, warningCollector);
117117
analyzer.analyze(rewrittenStatement, Optional.empty());
118-
analyzeForUtilizedColumns(analysis, analysis.getStatement());
118+
analyzeForUtilizedColumns(analysis, analysis.getStatement(), warningCollector);
119119
analysis.populateTableColumnAndSubfieldReferencesForAccessControl(isCheckAccessControlOnUtilizedColumnsOnly(session), isCheckAccessControlWithSubfields(session));
120120
return analysis;
121121
}

presto-main/src/main/java/com/facebook/presto/sql/analyzer/UtilizedColumnsAnalyzer.java

+54-11
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
import com.facebook.airlift.log.Logger;
1717
import com.facebook.presto.common.QualifiedObjectName;
18+
import com.facebook.presto.spi.PrestoWarning;
19+
import com.facebook.presto.spi.WarningCollector;
1820
import com.facebook.presto.spi.analyzer.AccessControlInfo;
1921
import com.facebook.presto.sql.tree.AliasedRelation;
2022
import com.facebook.presto.sql.tree.Cube;
@@ -37,6 +39,7 @@
3739
import com.facebook.presto.sql.tree.Lateral;
3840
import com.facebook.presto.sql.tree.Node;
3941
import com.facebook.presto.sql.tree.NodeRef;
42+
import com.facebook.presto.sql.tree.QualifiedName;
4043
import com.facebook.presto.sql.tree.Query;
4144
import com.facebook.presto.sql.tree.QuerySpecification;
4245
import com.facebook.presto.sql.tree.Relation;
@@ -48,7 +51,10 @@
4851
import com.facebook.presto.sql.tree.Union;
4952
import com.facebook.presto.sql.tree.Unnest;
5053
import com.facebook.presto.sql.tree.Values;
54+
import com.facebook.presto.sql.tree.With;
55+
import com.facebook.presto.sql.tree.WithQuery;
5156
import com.google.common.collect.HashMultimap;
57+
import com.google.common.collect.ImmutableList;
5258
import com.google.common.collect.ImmutableSet;
5359

5460
import java.util.HashMap;
@@ -57,28 +63,29 @@
5763
import java.util.Map.Entry;
5864
import java.util.Set;
5965

66+
import static com.facebook.presto.spi.StandardWarningCode.UTILIZED_COLUMN_ANALYSIS_FAILED;
67+
import static com.google.common.collect.ImmutableList.toImmutableList;
6068
import static com.google.common.collect.Sets.intersection;
61-
import static java.lang.String.format;
6269

6370
/**
6471
* Finds all utilized columns in the query. Utilized columns are those that would have an "impact" on the query's results.
65-
*
72+
* <p>
6673
* For example, in the query:
67-
* SELECT nationkey FROM (SELECT * FROM nation WHERE name = 'USA')
74+
* SELECT nationkey FROM (SELECT * FROM nation WHERE name = 'USA')
6875
* Even though all the columns in table nation are referenced by the query (in the SELECT * part), only the columns
6976
* "name" and "nationkey" have an "impact" on the query's results.
70-
*
77+
* <p>
7178
* The high-level algorithm works as follows:
7279
* 1. Find all fields referenced in all clauses of the outermost SELECT query, and add them to an explore list.
7380
* 2. For each field reference F in the explore list, find its referenced relation R.
7481
* 3. If R is a SELECT query:
75-
* a. Find the SELECT item expression that F references. Add all fields referenced by that expression to the explore list.
76-
* b. Add all fields referenced by every other clause of the SELECT query to the explore list.
82+
* a. Find the SELECT item expression that F references. Add all fields referenced by that expression to the explore list.
83+
* b. Add all fields referenced by every other clause of the SELECT query to the explore list.
7784
* 4. Otherwise,
78-
* a. Add F's referenced field to a referenced fields list.
79-
* b. For each child of R, find the corresponding child of F, and add it to the explore list.
85+
* a. Add F's referenced field to a referenced fields list.
86+
* b. For each child of R, find the corresponding child of F, and add it to the explore list.
8087
* 5. Repeat from step 2 for all fields in the explore list, until all have been resolved to a base table relation.
81-
*
88+
* <p>
8289
* The referenced fields list at the end of this algorithm will contain all the columns referenced by the query, that impact the output.
8390
* Step 3a is where fields that do not impact the output are pruned.
8491
*/
@@ -88,14 +95,16 @@ public class UtilizedColumnsAnalyzer
8895

8996
private final Analysis analysis;
9097

91-
public static void analyzeForUtilizedColumns(Analysis analysis, Node node)
98+
public static void analyzeForUtilizedColumns(Analysis analysis, Node node, WarningCollector warningCollector)
9299
{
93100
UtilizedColumnsAnalyzer analyzer = new UtilizedColumnsAnalyzer(analysis);
94101
try {
95102
analyzer.analyze(node);
96103
}
97104
catch (Exception e) {
98-
LOG.debug(e, format("Error in analyzing utilized columns, falling back to access control on all columns: %s", analysis.getStatement()));
105+
warningCollector.add(new PrestoWarning(
106+
UTILIZED_COLUMN_ANALYSIS_FAILED,
107+
"Error in analyzing utilized columns for access control, falling back to checking access on all columns: " + e.getMessage()));
99108
analysis.getTableColumnReferences().forEach(analysis::addUtilizedTableColumnReferences);
100109
}
101110
}
@@ -271,6 +280,23 @@ protected Void visitQuerySpecification(QuerySpecification querySpec, Context con
271280
return null;
272281
}
273282

283+
@Override
284+
protected Void visitWith(With node, Context context)
285+
{
286+
ImmutableList.copyOf(node.getQueries()).reverse().forEach(query -> process(query, context));
287+
288+
return null;
289+
}
290+
291+
@Override
292+
protected Void visitWithQuery(WithQuery withQuery, Context context)
293+
{
294+
context.copyFieldIdsToExploreForWithQuery(withQuery);
295+
process(withQuery.getQuery(), context);
296+
297+
return null;
298+
}
299+
274300
@Override
275301
protected Void visitSampledRelation(SampledRelation sampledRelation, Context context)
276302
{
@@ -493,5 +519,22 @@ private void addFieldIdToExplore(FieldId fieldId)
493519
{
494520
fieldsToExplore.put(fieldId.getRelationId(), fieldId);
495521
}
522+
523+
// Associate the relation from the with clause with the fieldIdsToExplore that we collected for it
524+
// when processing the main part of the query
525+
public void copyFieldIdsToExploreForWithQuery(WithQuery withQuery)
526+
{
527+
QualifiedName name = QualifiedName.of(withQuery.getName().getValue());
528+
List<RelationId> relationIds = fieldsToExplore.keySet().stream()
529+
.filter(key -> key.getSourceNode() instanceof Table && ((Table) key.getSourceNode()).getName().equals(name))
530+
.collect(toImmutableList());
531+
// if a cte is used more than once, it will be listed multiple times in the fieldIds to explore
532+
// if multiple ctes have the same name, this will also be captured here. These will fail later if the columns used don't exist in the tables
533+
for (RelationId relationId : relationIds) {
534+
fieldsToExplore.putAll(
535+
RelationId.of(withQuery.getQuery().getQueryBody()),
536+
fieldsToExplore.get(relationId));
537+
}
538+
}
496539
}
497540
}

presto-main/src/test/java/com/facebook/presto/sql/analyzer/TestUtilizedColumnsAnalyzer.java

+76
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,82 @@ public void testUDF()
519519
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t1"), ImmutableSet.of("a")));
520520
}
521521

522+
@Test
523+
public void testCteWithExpressionInSelect()
524+
{
525+
assertUtilizedTableColumns(
526+
"with cte as (select x as c1, y as c2, z + 1 as c3 from t13) select c1, c3 from (select * from cte)",
527+
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "z")));
528+
}
529+
530+
@Test
531+
public void testMultipleCtes()
532+
{
533+
assertUtilizedTableColumns(
534+
"with cte1 as (select x as c1, y as c2, z + 1 as c3 from t13), cte2 as (select c1 + 1 as a, c3 as b from cte1) select a, b from (select * from cte2)",
535+
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "z")));
536+
assertUtilizedTableColumns(
537+
"with cte1 as (select x as c1, z + 1 as c2, y from t13), cte2 as (select c1 + 1 c3, c2 +1 as c4 from cte1) select c3 from cte2",
538+
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x")));
539+
}
540+
541+
@Test
542+
public void testMultipleCtesUsingSameTable()
543+
{
544+
assertUtilizedTableColumns(
545+
"with cte1 as (select x + 1 as c1, y as c2, z + 1 as c3 from t13), cte2 as (with cte1 AS (select y +1 as c1 from t13) select * from cte1) SELECT cte1.c1, cte2.c1 from cte1 join cte2 on cte1.c1=cte2.c1",
546+
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "y")));
547+
}
548+
549+
@Test
550+
public void testMultipleCtesSameNameSameTable()
551+
{
552+
// TODO(kevintang2022): Fix visitWithQuery so that it looks up relation properly.
553+
// Currently, it just looks the relations using the name (string) of the CTE
554+
assertUtilizedTableColumns(
555+
"with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte1 as (select x + 1 as c2, y + 1 as c3, z as c4 from t13) select * from cte1) select cte1.c1, cte2.c3 from cte1 join cte2 on true",
556+
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "y", "z")));
557+
}
558+
559+
@Test
560+
public void testMultipleCtesSameNameDifferentTable()
561+
{
562+
// This happens because the two CTE1s having the same name, and the CTE1 of t13 has its first column utilized, so the CTE1 of t1 will also have its first column added.
563+
assertUtilizedTableColumns(
564+
"with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte1 as (select a + 1 as c2, b + 1 as c3, c as c4 from t1) select * from cte1) select cte1.c1 from cte1 join cte2 on true",
565+
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x"),
566+
QualifiedObjectName.valueOf("tpch.s1.t1"), ImmutableSet.of("a")));
567+
}
568+
569+
@Test
570+
public void testMultipleCtesDifferentNameSameTable()
571+
{
572+
assertUtilizedTableColumns(
573+
"with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte3 as (select x + 1 as c2, y + 1 as c3, z as c4 from t13) select * from cte3) select cte1.c1, cte2.c3 from cte1 join cte2 on true",
574+
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "y")));
575+
}
576+
577+
@Test
578+
public void testMultipleCtesDifferentNameDifferentTable()
579+
{
580+
// This is the same behavior for join queries like "select t1.a from t1 join t13 on true" regardless of wrapping in CTE
581+
assertUtilizedTableColumns(
582+
"with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte3 as (select a + 1 as c2, b + 1 as c3, c as c4 from t1) select * from cte3) select cte1.c1 from cte1 join cte2 on true",
583+
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x"),
584+
QualifiedObjectName.valueOf("tpch.s1.t1"), ImmutableSet.of()));
585+
}
586+
587+
@Test
588+
public void testMultipleCtesSameNameDifferentTableMultipleColumnsUtilized()
589+
{
590+
// This will fallback to checking all utilized columns on t1 because CTE1 of t13 only has one item in the select expression, but CTE1 of t1 has 3, so there's a mismatch.
591+
// It's trying to add the third column of CTE1 of t1 but it does not exist.
592+
assertUtilizedTableColumns(
593+
"with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte1 as (select a + 1 as c2, b + 1 as c3, c as c4 from t1) select * from cte1) select cte1.c1, cte2.c4 from cte1 join cte2 on true",
594+
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x"),
595+
QualifiedObjectName.valueOf("tpch.s1.t1"), ImmutableSet.of("a", "b", "c")));
596+
}
597+
522598
private void assertUtilizedTableColumns(@Language("SQL") String query, Map<QualifiedObjectName, Set<String>> expected)
523599
{
524600
transaction(transactionManager, accessControl)

presto-spi/src/main/java/com/facebook/presto/spi/StandardWarningCode.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ public enum StandardWarningCode
2626
DEFAULT_SAMPLE_FUNCTION(0x0000_0008),
2727
SAMPLED_FIELDS(0x0000_0009),
2828
MULTIPLE_TABLE_METADATA(0x0000_0010),
29-
SORT_COLUMN_TRANSFORM_NOT_SUPPORTED_WARNING(0x0000_0011)
29+
SORT_COLUMN_TRANSFORM_NOT_SUPPORTED_WARNING(0x0000_0011),
30+
UTILIZED_COLUMN_ANALYSIS_FAILED(0x0000_0012),
3031
/**/;
3132
private final WarningCode warningCode;
3233

0 commit comments

Comments
 (0)