Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix check_access_control_on_utilized_columns_only for CTE #24647

Merged
merged 1 commit into from
Mar 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ public boolean isAnonymous()
return sourceNode == null;
}

public Node getSourceNode()
{
return sourceNode;
}

@Override
public boolean equals(Object o)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ public Analysis analyzeSemantic(Statement statement, boolean isDescribe)
metadataExtractor.populateMetadataHandle(session, rewrittenStatement, analysis.getMetadataHandle());
StatementAnalyzer analyzer = new StatementAnalyzer(analysis, metadata, sqlParser, accessControl, session, warningCollector);
analyzer.analyze(rewrittenStatement, Optional.empty());
analyzeForUtilizedColumns(analysis, analysis.getStatement());
analyzeForUtilizedColumns(analysis, analysis.getStatement(), warningCollector);
analysis.populateTableColumnAndSubfieldReferencesForAccessControl(isCheckAccessControlOnUtilizedColumnsOnly(session), isCheckAccessControlWithSubfields(session));
return analysis;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

import com.facebook.airlift.log.Logger;
import com.facebook.presto.common.QualifiedObjectName;
import com.facebook.presto.spi.PrestoWarning;
import com.facebook.presto.spi.WarningCollector;
import com.facebook.presto.spi.analyzer.AccessControlInfo;
import com.facebook.presto.sql.tree.AliasedRelation;
import com.facebook.presto.sql.tree.Cube;
Expand All @@ -37,6 +39,7 @@
import com.facebook.presto.sql.tree.Lateral;
import com.facebook.presto.sql.tree.Node;
import com.facebook.presto.sql.tree.NodeRef;
import com.facebook.presto.sql.tree.QualifiedName;
import com.facebook.presto.sql.tree.Query;
import com.facebook.presto.sql.tree.QuerySpecification;
import com.facebook.presto.sql.tree.Relation;
Expand All @@ -48,7 +51,10 @@
import com.facebook.presto.sql.tree.Union;
import com.facebook.presto.sql.tree.Unnest;
import com.facebook.presto.sql.tree.Values;
import com.facebook.presto.sql.tree.With;
import com.facebook.presto.sql.tree.WithQuery;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;

import java.util.HashMap;
Expand All @@ -57,28 +63,29 @@
import java.util.Map.Entry;
import java.util.Set;

import static com.facebook.presto.spi.StandardWarningCode.UTILIZED_COLUMN_ANALYSIS_FAILED;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.Sets.intersection;
import static java.lang.String.format;

/**
* Finds all utilized columns in the query. Utilized columns are those that would have an "impact" on the query's results.
*
* <p>
* For example, in the query:
* SELECT nationkey FROM (SELECT * FROM nation WHERE name = 'USA')
* SELECT nationkey FROM (SELECT * FROM nation WHERE name = 'USA')
* Even though all the columns in table nation are referenced by the query (in the SELECT * part), only the columns
* "name" and "nationkey" have an "impact" on the query's results.
*
* <p>
* The high-level algorithm works as follows:
* 1. Find all fields referenced in all clauses of the outermost SELECT query, and add them to an explore list.
* 2. For each field reference F in the explore list, find its referenced relation R.
* 3. If R is a SELECT query:
* a. Find the SELECT item expression that F references. Add all fields referenced by that expression to the explore list.
* b. Add all fields referenced by every other clause of the SELECT query to the explore list.
* a. Find the SELECT item expression that F references. Add all fields referenced by that expression to the explore list.
* b. Add all fields referenced by every other clause of the SELECT query to the explore list.
* 4. Otherwise,
* a. Add F's referenced field to a referenced fields list.
* b. For each child of R, find the corresponding child of F, and add it to the explore list.
* a. Add F's referenced field to a referenced fields list.
* b. For each child of R, find the corresponding child of F, and add it to the explore list.
* 5. Repeat from step 2 for all fields in the explore list, until all have been resolved to a base table relation.
*
* <p>
* The referenced fields list at the end of this algorithm will contain all the columns referenced by the query, that impact the output.
* Step 3a is where fields that do not impact the output are pruned.
*/
Expand All @@ -88,14 +95,16 @@ public class UtilizedColumnsAnalyzer

private final Analysis analysis;

public static void analyzeForUtilizedColumns(Analysis analysis, Node node)
public static void analyzeForUtilizedColumns(Analysis analysis, Node node, WarningCollector warningCollector)
{
UtilizedColumnsAnalyzer analyzer = new UtilizedColumnsAnalyzer(analysis);
try {
analyzer.analyze(node);
}
catch (Exception e) {
LOG.debug(e, format("Error in analyzing utilized columns, falling back to access control on all columns: %s", analysis.getStatement()));
warningCollector.add(new PrestoWarning(
UTILIZED_COLUMN_ANALYSIS_FAILED,
"Error in analyzing utilized columns for access control, falling back to checking access on all columns: " + e.getMessage()));
analysis.getTableColumnReferences().forEach(analysis::addUtilizedTableColumnReferences);
}
}
Expand Down Expand Up @@ -271,6 +280,23 @@ protected Void visitQuerySpecification(QuerySpecification querySpec, Context con
return null;
}

@Override
protected Void visitWith(With node, Context context)
{
ImmutableList.copyOf(node.getQueries()).reverse().forEach(query -> process(query, context));

return null;
}

@Override
protected Void visitWithQuery(WithQuery withQuery, Context context)
{
context.copyFieldIdsToExploreForWithQuery(withQuery);
process(withQuery.getQuery(), context);

return null;
}

@Override
protected Void visitSampledRelation(SampledRelation sampledRelation, Context context)
{
Expand Down Expand Up @@ -493,5 +519,22 @@ private void addFieldIdToExplore(FieldId fieldId)
{
fieldsToExplore.put(fieldId.getRelationId(), fieldId);
}

// Associate the relation from the with clause with the fieldIdsToExplore that we collected for it
// when processing the main part of the query
public void copyFieldIdsToExploreForWithQuery(WithQuery withQuery)
{
QualifiedName name = QualifiedName.of(withQuery.getName().getValue());
List<RelationId> relationIds = fieldsToExplore.keySet().stream()
.filter(key -> key.getSourceNode() instanceof Table && ((Table) key.getSourceNode()).getName().equals(name))
.collect(toImmutableList());
// if a cte is used more than once, it will be listed multiple times in the fieldIds to explore
// if multiple ctes have the same name, this will also be captured here. These will fail later if the columns used don't exist in the tables
for (RelationId relationId : relationIds) {
fieldsToExplore.putAll(
RelationId.of(withQuery.getQuery().getQueryBody()),
fieldsToExplore.get(relationId));
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,82 @@ public void testUDF()
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t1"), ImmutableSet.of("a")));
}

@Test
public void testCteWithExpressionInSelect()
{
assertUtilizedTableColumns(
"with cte as (select x as c1, y as c2, z + 1 as c3 from t13) select c1, c3 from (select * from cte)",
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "z")));
}

@Test
public void testMultipleCtes()
{
assertUtilizedTableColumns(
"with cte1 as (select x as c1, y as c2, z + 1 as c3 from t13), cte2 as (select c1 + 1 as a, c3 as b from cte1) select a, b from (select * from cte2)",
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "z")));
assertUtilizedTableColumns(
"with cte1 as (select x as c1, z + 1 as c2, y from t13), cte2 as (select c1 + 1 c3, c2 +1 as c4 from cte1) select c3 from cte2",
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x")));
}

@Test
public void testMultipleCtesUsingSameTable()
{
assertUtilizedTableColumns(
"with cte1 as (select x + 1 as c1, y as c2, z + 1 as c3 from t13), cte2 as (with cte1 AS (select y +1 as c1 from t13) select * from cte1) SELECT cte1.c1, cte2.c1 from cte1 join cte2 on cte1.c1=cte2.c1",
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "y")));
}

@Test
public void testMultipleCtesSameNameSameTable()
{
// TODO(kevintang2022): Fix visitWithQuery so that it looks up relation properly.
// Currently, it just looks the relations using the name (string) of the CTE
Comment on lines +552 to +553
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some todo for followup. The name conflict is caused by just using name for the look up. We might need to lookup with more info or add a unique identifier to each column

assertUtilizedTableColumns(
"with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte1 as (select x + 1 as c2, y + 1 as c3, z as c4 from t13) select * from cte1) select cte1.c1, cte2.c3 from cte1 join cte2 on true",
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "y", "z")));
}

@Test
public void testMultipleCtesSameNameDifferentTable()
{
// This happens because the two CTE1s having the same name, and the CTE1 of t13 has its first column utilized, so the CTE1 of t1 will also have its first column added.
assertUtilizedTableColumns(
"with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte1 as (select a + 1 as c2, b + 1 as c3, c as c4 from t1) select * from cte1) select cte1.c1 from cte1 join cte2 on true",
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x"),
QualifiedObjectName.valueOf("tpch.s1.t1"), ImmutableSet.of("a")));
}

@Test
public void testMultipleCtesDifferentNameSameTable()
{
assertUtilizedTableColumns(
"with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte3 as (select x + 1 as c2, y + 1 as c3, z as c4 from t13) select * from cte3) select cte1.c1, cte2.c3 from cte1 join cte2 on true",
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x", "y")));
}

@Test
public void testMultipleCtesDifferentNameDifferentTable()
{
// This is the same behavior for join queries like "select t1.a from t1 join t13 on true" regardless of wrapping in CTE
assertUtilizedTableColumns(
"with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte3 as (select a + 1 as c2, b + 1 as c3, c as c4 from t1) select * from cte3) select cte1.c1 from cte1 join cte2 on true",
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x"),
QualifiedObjectName.valueOf("tpch.s1.t1"), ImmutableSet.of()));
}

@Test
public void testMultipleCtesSameNameDifferentTableMultipleColumnsUtilized()
{
// This will fallback to checking all utilized columns on t1 because CTE1 of t13 only has one item in the select expression, but CTE1 of t1 has 3, so there's a mismatch.
// It's trying to add the third column of CTE1 of t1 but it does not exist.
assertUtilizedTableColumns(
"with cte1 as (select x + 1 as c1 from t13), cte2 as (with cte1 as (select a + 1 as c2, b + 1 as c3, c as c4 from t1) select * from cte1) select cte1.c1, cte2.c4 from cte1 join cte2 on true",
ImmutableMap.of(QualifiedObjectName.valueOf("tpch.s1.t13"), ImmutableSet.of("x"),
QualifiedObjectName.valueOf("tpch.s1.t1"), ImmutableSet.of("a", "b", "c")));
}

private void assertUtilizedTableColumns(@Language("SQL") String query, Map<QualifiedObjectName, Set<String>> expected)
{
transaction(transactionManager, accessControl)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ public enum StandardWarningCode
DEFAULT_SAMPLE_FUNCTION(0x0000_0008),
SAMPLED_FIELDS(0x0000_0009),
MULTIPLE_TABLE_METADATA(0x0000_0010),
SORT_COLUMN_TRANSFORM_NOT_SUPPORTED_WARNING(0x0000_0011)
SORT_COLUMN_TRANSFORM_NOT_SUPPORTED_WARNING(0x0000_0011),
UTILIZED_COLUMN_ANALYSIS_FAILED(0x0000_0012),
/**/;
private final WarningCode warningCode;

Expand Down
Loading