Skip to content

Commit

Permalink
feat(glue): enable partition filtering on tables (#21081)
Browse files Browse the repository at this point in the history
Fixes #20825

Adds an `enablePartitionFiltering` property to `TableProps` in AWS Glue, which sets the `partition_filtering.enabled` table parameter.

----

### All Submissions:

* [x] Have you followed the guidelines in our [Contributing guide?](https://github.com/aws/aws-cdk/blob/main/CONTRIBUTING.md)

### Adding new Unconventional Dependencies:

* [ ] This PR adds new unconventional dependencies following the process described [here](https://github.com/aws/aws-cdk/blob/main/CONTRIBUTING.md/#adding-new-unconventional-dependencies)

### New Features

* [x] Have you added the new feature to an [integration test](https://github.com/aws/aws-cdk/blob/main/INTEGRATION_TESTS.md)?
	* [x] Did you use `yarn integ` to deploy the infrastructure and generate the snapshot (i.e. `yarn integ` without `--dry-run`)?

*By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license*
  • Loading branch information
daschaa authored Jul 12, 2022
1 parent dd9f5c5 commit bf35048
Show file tree
Hide file tree
Showing 10 changed files with 333 additions and 11 deletions.
25 changes: 25 additions & 0 deletions packages/@aws-cdk/aws-glue/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,31 @@ myTable.addPartitionIndex({
});
```

### Partition Filtering

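If you have a table with a large number of partitions that grows over time, consider using AWS Glue partition indexing and filtering. Setting `enablePartitionFiltering` to `true` adds the `partition_filtering.enabled` parameter to the Glue table.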

```ts
declare const myDatabase: glue.Database;
new glue.Table(this, 'MyTable', {
database: myDatabase,
tableName: 'my_table',
columns: [{
name: 'col1',
type: glue.Schema.STRING,
}],
// Partition columns of the table.
partitionKeys: [{
name: 'year',
type: glue.Schema.SMALL_INT,
}, {
name: 'month',
type: glue.Schema.SMALL_INT,
}],
dataFormat: glue.DataFormat.JSON,
// Rendered as the `partition_filtering.enabled` table parameter.
enablePartitionFiltering: true,
});
```

## [Encryption](https://docs.aws.amazon.com/athena/latest/ug/encryption.html)

You can enable encryption on a Table's data:
Expand Down
14 changes: 12 additions & 2 deletions packages/@aws-cdk/aws-glue/lib/table.ts
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,15 @@ export interface TableProps {
* @default false
*/
readonly storedAsSubDirectories?: boolean;

/**
* Enables partition filtering.
*
* The value is passed through as the `partition_filtering.enabled` table
* parameter; when this property is omitted, no value is supplied for it.
*
* @see https://docs.aws.amazon.com/athena/latest/ug/glue-best-practices.html#glue-best-practices-partition-index
*
* @default - The parameter is not defined
*/
readonly enablePartitionFiltering?: boolean;
}

/**
Expand Down Expand Up @@ -302,8 +311,9 @@ export class Table extends Resource implements ITable {
partitionKeys: renderColumns(props.partitionKeys),

parameters: {
classification: props.dataFormat.classificationString?.value,
has_encrypted_data: this.encryption !== TableEncryption.UNENCRYPTED,
'classification': props.dataFormat.classificationString?.value,
'has_encrypted_data': this.encryption !== TableEncryption.UNENCRYPTED,
'partition_filtering.enabled': props.enablePartitionFiltering,
},
storageDescriptor: {
location: `s3://${this.bucket.bucketName}/${this.s3Prefix}`,
Expand Down
8 changes: 8 additions & 0 deletions packages/@aws-cdk/aws-glue/test/integ.table.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,14 @@ const encryptedTable = new glue.Table(stack, 'MyEncryptedTable', {
encryptionKey: new kms.Key(stack, 'MyKey'),
});

// Table that exercises `enablePartitionFiltering`; the snapshot is expected to
// contain `"partition_filtering.enabled": true` in the table parameters.
// NOTE(review): no `partitionKeys` are declared here, so this only verifies
// that the parameter is rendered — consider adding partition keys so the test
// reflects a realistic partitioned table.
new glue.Table(stack, 'MyPartitionFilteredTable', {
database,
tableName: 'partition_filtered_table',
columns,
dataFormat: glue.DataFormat.JSON,
enablePartitionFiltering: true,
});

const user = new iam.User(stack, 'MyUser');
csvTable.grantReadWrite(user);
encryptedTable.grantReadWrite(user);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
{
"version": "17.0.0",
"version": "20.0.0",
"files": {
"92638b7a8efe38efd7c845883423f3767018a9e5bd3d67d8d638332f054d0d0f": {
"419b39f03d496de4fb02e795181e9a2ab218fb90bf7a5c9354cf93baa6fea2cf": {
"source": {
"path": "aws-cdk-glue.template.json",
"packaging": "file"
},
"destinations": {
"current_account-current_region": {
"bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}",
"objectKey": "92638b7a8efe38efd7c845883423f3767018a9e5bd3d67d8d638332f054d0d0f.json",
"objectKey": "419b39f03d496de4fb02e795181e9a2ab218fb90bf7a5c9354cf93baa6fea2cf.json",
"assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}"
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,76 @@
}
}
},
"MyPartitionFilteredTableBucket6ACAA137": {
"Type": "AWS::S3::Bucket",
"UpdateReplacePolicy": "Retain",
"DeletionPolicy": "Retain"
},
"MyPartitionFilteredTable324BA27A": {
"Type": "AWS::Glue::Table",
"Properties": {
"CatalogId": {
"Ref": "AWS::AccountId"
},
"DatabaseName": {
"Ref": "MyDatabase1E2517DB"
},
"TableInput": {
"Description": "partition_filtered_table generated by CDK",
"Name": "partition_filtered_table",
"Parameters": {
"classification": "json",
"has_encrypted_data": false,
"partition_filtering.enabled": true
},
"StorageDescriptor": {
"Columns": [
{
"Name": "col1",
"Type": "string"
},
{
"Comment": "col2 comment",
"Name": "col2",
"Type": "string"
},
{
"Name": "col3",
"Type": "array<string>"
},
{
"Name": "col4",
"Type": "map<string,string>"
},
{
"Name": "col5",
"Type": "struct<col1:string>"
}
],
"Compressed": false,
"InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
"Location": {
"Fn::Join": [
"",
[
"s3://",
{
"Ref": "MyPartitionFilteredTableBucket6ACAA137"
},
"/"
]
]
},
"OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"SerdeInfo": {
"SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe"
},
"StoredAsSubDirectories": false
},
"TableType": "EXTERNAL_TABLE"
}
}
},
"MyUserDC45028B": {
"Type": "AWS::IAM::User"
},
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"version":"17.0.0"}
{"version":"20.0.0"}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"version": "18.0.0",
"version": "20.0.0",
"testCases": {
"aws-glue/test/integ.table": {
"integ.table": {
"stacks": [
"aws-cdk-glue"
],
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "17.0.0",
"version": "20.0.0",
"artifacts": {
"Tree": {
"type": "cdk:tree",
Expand Down Expand Up @@ -69,6 +69,18 @@
"data": "MyEncryptedTable981A88C6"
}
],
"/aws-cdk-glue/MyPartitionFilteredTable/Bucket/Resource": [
{
"type": "aws:cdk:logicalId",
"data": "MyPartitionFilteredTableBucket6ACAA137"
}
],
"/aws-cdk-glue/MyPartitionFilteredTable/Table": [
{
"type": "aws:cdk:logicalId",
"data": "MyPartitionFilteredTable324BA27A"
}
],
"/aws-cdk-glue/MyUser/Resource": [
{
"type": "aws:cdk:logicalId",
Expand Down
107 changes: 106 additions & 1 deletion packages/@aws-cdk/aws-glue/test/table.integ.snapshot/tree.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"path": "Tree",
"constructInfo": {
"fqn": "constructs.Construct",
"version": "10.0.9"
"version": "10.1.33"
}
},
"aws-cdk-glue": {
Expand Down Expand Up @@ -596,6 +596,111 @@
"version": "0.0.0"
}
},
"MyPartitionFilteredTable": {
"id": "MyPartitionFilteredTable",
"path": "aws-cdk-glue/MyPartitionFilteredTable",
"children": {
"Bucket": {
"id": "Bucket",
"path": "aws-cdk-glue/MyPartitionFilteredTable/Bucket",
"children": {
"Resource": {
"id": "Resource",
"path": "aws-cdk-glue/MyPartitionFilteredTable/Bucket/Resource",
"attributes": {
"aws:cdk:cloudformation:type": "AWS::S3::Bucket",
"aws:cdk:cloudformation:props": {}
},
"constructInfo": {
"fqn": "@aws-cdk/aws-s3.CfnBucket",
"version": "0.0.0"
}
}
},
"constructInfo": {
"fqn": "@aws-cdk/aws-s3.Bucket",
"version": "0.0.0"
}
},
"Table": {
"id": "Table",
"path": "aws-cdk-glue/MyPartitionFilteredTable/Table",
"attributes": {
"aws:cdk:cloudformation:type": "AWS::Glue::Table",
"aws:cdk:cloudformation:props": {
"catalogId": {
"Ref": "AWS::AccountId"
},
"databaseName": {
"Ref": "MyDatabase1E2517DB"
},
"tableInput": {
"name": "partition_filtered_table",
"description": "partition_filtered_table generated by CDK",
"parameters": {
"classification": "json",
"has_encrypted_data": false,
"partition_filtering.enabled": true
},
"storageDescriptor": {
"location": {
"Fn::Join": [
"",
[
"s3://",
{
"Ref": "MyPartitionFilteredTableBucket6ACAA137"
},
"/"
]
]
},
"compressed": false,
"storedAsSubDirectories": false,
"columns": [
{
"name": "col1",
"type": "string"
},
{
"name": "col2",
"type": "string",
"comment": "col2 comment"
},
{
"name": "col3",
"type": "array<string>"
},
{
"name": "col4",
"type": "map<string,string>"
},
{
"name": "col5",
"type": "struct<col1:string>"
}
],
"inputFormat": "org.apache.hadoop.mapred.TextInputFormat",
"outputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"serdeInfo": {
"serializationLibrary": "org.openx.data.jsonserde.JsonSerDe"
}
},
"tableType": "EXTERNAL_TABLE"
}
}
},
"constructInfo": {
"fqn": "@aws-cdk/aws-glue.CfnTable",
"version": "0.0.0"
}
}
},
"constructInfo": {
"fqn": "@aws-cdk/aws-glue.Table",
"version": "0.0.0"
}
},
"MyUser": {
"id": "MyUser",
"path": "aws-cdk-glue/MyUser",
Expand Down
Loading

0 comments on commit bf35048

Please sign in to comment.