[Scaling] #6428

Draft · wants to merge 4 commits into develop
7 changes: 7 additions & 0 deletions cli/src/pcluster/models/s3_bucket.py
@@ -42,6 +42,9 @@ class S3FileType(Enum):
"""Define S3 file types."""

ASSETS = "assets"
HEAD_NODE_DNA_ASSETS = f"{ASSETS}/HeadNode"
COMPUTE_DNA_ASSETS = f"{ASSETS}/ComputeNode"
LOGIN_NODE_DNA_ASSETS = f"{ASSETS}/LoginNode"
CONFIGS = "configs"
TEMPLATES = "templates"
CUSTOM_RESOURCES = "custom_resources"
@@ -325,6 +328,10 @@ def upload_cfn_asset(self, asset_file_content, asset_name: str, format=S3FileFor
file_type=S3FileType.ASSETS, content=asset_file_content, file_name=asset_name, format=format
)

def upload_dna_cfn_asset(self, asset_file_content, asset_name: str, file_type: S3FileType = S3FileType.ASSETS, format=S3FileFormat.YAML):
"""Upload DNA CloudFormation assets to the S3 bucket under the given file type prefix."""
return self.upload_file(file_type=file_type, content=asset_file_content, file_name=asset_name, format=format)

def upload_resources(self, resource_dir, custom_artifacts_name):
"""
Upload custom resources to S3 bucket.
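For reviewers, a minimal usage sketch of the new helper, assuming an `S3Bucket` instance named `bucket` and a plain-dict DNA payload. The `upload_compute_dna` wrapper and its arguments are illustrative; only the enum members and `upload_dna_cfn_asset` come from this diff, and the resulting key is meant to mirror the one fetched by `compute_node/user_data.sh` further down:

```python
from pcluster.models.s3_bucket import S3FileFormat, S3FileType

def upload_compute_dna(bucket, launch_template_id: str, config_version: str, dna: dict):
    """Illustrative wrapper: publish a per-queue compute DNA file under assets/ComputeNode/."""
    return bucket.upload_dna_cfn_asset(
        asset_file_content=dna,
        asset_name=f"compute-dna-{launch_template_id}-{config_version}.json",
        file_type=S3FileType.COMPUTE_DNA_ASSETS,  # enum value resolves to "assets/ComputeNode"
        format=S3FileFormat.JSON,
    )
```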
23 changes: 19 additions & 4 deletions cli/src/pcluster/resources/compute_node/user_data.sh
@@ -51,6 +51,15 @@ datasource_list: [ Ec2, None ]
output:
all: "| tee -a /var/log/cloud-init-output.log | logger -t user-data -s 2>/dev/ttyS0"
write_files:
- path: /tmp/stack-arn.json
permissions: '0644'
owner: root:root
content: |
{
"cluster":{
"stack_arn": "${AWS::StackId}"
}
}
- path: /tmp/bootstrap.sh
permissions: '0744'
owner: root:root
@@ -99,14 +108,18 @@ write_files:

[ -f /etc/profile.d/proxy.sh ] && . /etc/profile.d/proxy.sh

$CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -s ${AWS::StackName} -v -c deployFiles -r ${LaunchTemplateResourceId} --region ${AWS::Region} --url ${CloudFormationUrl} --role ${CfnInitRole} || error_exit 'Failed to bootstrap the compute node. Please check /var/log/cfn-init.log in the compute node or in CloudWatch logs. Please refer to https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting-v3.html#troubleshooting-v3-get-logs for more details on ParallelCluster logs.'

[ -f /etc/profile.d/aws-cli-default-config.sh ] && . /etc/profile.d/aws-cli-default-config.sh

custom_cookbook=${CustomChefCookbook}
export _region=${AWS::Region}

s3_url=${AWS::URLSuffix}
echo "Running S3 commands"
export AWS_RETRY_MODE=standard
S3API_RESULT=$(aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/common-dna-${ClusterConfigVersion}.json" --region ${AWS::Region} /tmp/common-dna.json 2>&1) || error_exit "${!S3API_RESULT}"
S3API_RESULT=$(aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/ComputeNode/compute-dna-${LaunchTemplateResourceId}-${ClusterConfigVersion}.json" --region ${AWS::Region} /tmp/compute-dna.json 2>&1) || error_exit "${!S3API_RESULT}"
S3API_RESULT=$(aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/extra-${ClusterConfigVersion}.json" --region ${AWS::Region} /tmp/extra.json 2>&1) || error_exit "${!S3API_RESULT}"
echo "Completed S3 commands"

if [ "${!custom_cookbook}" != "NONE" ]; then
if [[ "${!custom_cookbook}" =~ ^s3://([^/]*)(.*) ]]; then
# Socket timeout = 15s; the actual timeout is 8*(cli-connect-timeout), so (15s*8)*3retries=6min.
@@ -139,9 +152,11 @@ write_files:
vendor_cookbook
fi
cd /tmp
mkdir -p /etc/chef/ohai/hints
touch /etc/chef/ohai/hints/ec2.json

start=$(date +%s)

jq -s ".[0] * .[1] * .[2] * .[3]" /tmp/common-dna.json /tmp/compute-dna.json /tmp/stack-arn.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/common-dna.json /tmp/compute-dna.json /etc/chef/dna.json )
{
CINC_CMD="cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist"
FR_CMD="/opt/parallelcluster/scripts/fetch_and_run"
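The bootstrap script above assembles `/etc/chef/dna.json` with `jq -s ".[0] * .[1] * .[2] * .[3]"`. As a hedged illustration of what that produces, here is a small Python sketch of jq's recursive object merge (later documents win on conflicting scalar keys, nested objects are merged); the file names follow the script, but the helper itself is not part of the PR:

```python
import json
from functools import reduce

def deep_merge(left: dict, right: dict) -> dict:
    """Mirror jq's `*` operator: merge nested dicts recursively, right operand wins otherwise."""
    merged = dict(left)
    for key, value in right.items():
        if isinstance(merged.get(key), dict) and isinstance(value, dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

def build_compute_dna(paths=("/tmp/common-dna.json", "/tmp/compute-dna.json",
                             "/tmp/stack-arn.json", "/tmp/extra.json")):
    docs = []
    for path in paths:
        with open(path) as f:
            docs.append(json.load(f))
    # Later files override earlier ones, so node-specific and extra attributes
    # take precedence over the common DNA inside the single top-level "cluster" object.
    return reduce(deep_merge, docs, {})
```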
28 changes: 26 additions & 2 deletions cli/src/pcluster/templates/cdk_builder_utils.py
@@ -648,7 +648,13 @@ def _build_policy(self) -> List[iam.PolicyStatement]:
iam.PolicyStatement(
sid="ResourcesS3Bucket",
effect=iam.Effect.ALLOW,
actions=["s3:GetObject", "s3:GetObjectVersion", "s3:GetBucketLocation", "s3:ListBucket"],
actions=[
"s3:GetObject",
"s3:PutObject",
"s3:GetObjectVersion",
"s3:GetBucketLocation",
"s3:ListBucket",
],
resources=[
self._format_arn(service="s3", resource=self._cluster_bucket.name, region="", account=""),
self._format_arn(
@@ -820,7 +826,9 @@ def _build_policy(self) -> List[iam.PolicyStatement]:
(
"secretsmanager:GetSecretValue"
if password_secret_arn.service == "secretsmanager"
else "ssm:GetParameter" if password_secret_arn.service == "ssm" else None
else "ssm:GetParameter"
if password_secret_arn.service == "ssm"
else None
)
],
effect=iam.Effect.ALLOW,
@@ -997,7 +1005,9 @@ def __init__(
node: Union[HeadNode, BaseQueue, LoginNodesPool],
shared_storage_infos: dict,
name: str,
cluster_bucket: S3Bucket,
):
self._cluster_bucket = cluster_bucket
super().__init__(scope, id, config, node, shared_storage_infos, name)

def _build_policy(self) -> List[iam.PolicyStatement]:
@@ -1023,6 +1033,20 @@ def _build_policy(self) -> List[iam.PolicyStatement]:
)
],
),
iam.PolicyStatement(
sid="S3GetLaunchTemplate",
actions=["s3:GetObject", "s3:ListBucket"],
effect=iam.Effect.ALLOW,
resources=[
self._format_arn(service="s3", resource=self._cluster_bucket.name, region="", account=""),
self._format_arn(
service="s3",
resource=f"{self._cluster_bucket.name}/{self._cluster_bucket.artifact_directory}/*",
region="",
account="",
),
],
),
iam.PolicyStatement(
sid="CloudFormation",
actions=[
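A hedged rendering of what the new `S3GetLaunchTemplate` statement is expected to resolve to, assuming `_format_arn` emits standard S3 ARNs with empty region/account (as the existing `ResourcesS3Bucket` statement does); the bucket name and artifact directory below are placeholders, not values from this PR:

```python
# Hypothetical resolved policy statement for a cluster bucket named
# "parallelcluster-example-bucket" with artifact directory "parallelcluster/clusters/demo".
s3_get_launch_template = {
    "Sid": "S3GetLaunchTemplate",
    "Effect": "Allow",
    "Action": ["s3:GetObject", "s3:ListBucket"],
    "Resource": [
        # Bucket-level ARN: needed for s3:ListBucket.
        "arn:aws:s3:::parallelcluster-example-bucket",
        # Object-level ARN: needed for s3:GetObject on the uploaded DNA assets.
        "arn:aws:s3:::parallelcluster-example-bucket/parallelcluster/clusters/demo/*",
    ],
}
```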
102 changes: 62 additions & 40 deletions cli/src/pcluster/templates/cluster_stack.py
@@ -76,7 +76,7 @@
PCLUSTER_S3_ARTIFACTS_DICT,
SLURM_PORTS_RANGE,
)
from pcluster.models.s3_bucket import S3Bucket
from pcluster.models.s3_bucket import S3Bucket, S3FileFormat, S3FileType
from pcluster.templates.awsbatch_builder import AwsBatchConstruct
from pcluster.templates.cdk_builder_utils import (
CdkLaunchTemplateBuilder,
@@ -813,12 +813,12 @@ def _add_storage_security_group(self, storage_cfn_id, storage):
rules.append(egress_rule)

if sg_type == "Storage":
ingress_rule.cfn_options.deletion_policy = ingress_rule.cfn_options.update_replace_policy = (
storage_deletion_policy
)
egress_rule.cfn_options.deletion_policy = egress_rule.cfn_options.update_replace_policy = (
storage_deletion_policy
)
ingress_rule.cfn_options.deletion_policy = (
ingress_rule.cfn_options.update_replace_policy
) = storage_deletion_policy
egress_rule.cfn_options.deletion_policy = (
egress_rule.cfn_options.update_replace_policy
) = storage_deletion_policy

return storage_security_group, rules

@@ -1010,9 +1010,9 @@ def _add_managed_fsx(self, fsx_id, id, mount_name, shared_fsx):
)
for rule in security_group_rules:
fsx_resource.add_depends_on(rule)
fsx_resource.cfn_options.deletion_policy = fsx_resource.cfn_options.update_replace_policy = (
convert_deletion_policy(shared_fsx.deletion_policy)
)
fsx_resource.cfn_options.deletion_policy = (
fsx_resource.cfn_options.update_replace_policy
) = convert_deletion_policy(shared_fsx.deletion_policy)
fsx_id = fsx_resource.ref
self._add_dra(fsx_id, shared_fsx)
# Get MountName for new filesystem. DNSName cannot be retrieved from CFN and will be generated in cookbook
@@ -1256,12 +1256,10 @@ def _add_head_node(self):
head_node_launch_template.add_metadata("Comment", "AWS ParallelCluster Head Node")
# CloudFormation::Init metadata

dna_json = json.dumps(
common_dna_json = json.dumps(
{
"cluster": {
"stack_name": self._stack_name,
"stack_arn": self.stack.stack_id,
"raid_vol_ids": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.RAID),
"raid_shared_dir": to_comma_separated_string(
self.shared_storage_mount_dirs[SharedStorageType.RAID]
),
@@ -1306,49 +1304,28 @@ def _add_head_node(self):
self.shared_storage_attributes[SharedStorageType.FSX]["FileSystemTypes"]
),
"fsx_shared_dirs": to_comma_separated_string(self.shared_storage_mount_dirs[SharedStorageType.FSX]),
"volume": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.EBS),
"scheduler": self.config.scheduling.scheduler,
"ephemeral_dir": (
head_node.local_storage.ephemeral_volume.mount_dir
if head_node.local_storage.ephemeral_volume
else DEFAULT_EPHEMERAL_DIR
),
"ebs_shared_dirs": to_comma_separated_string(self.shared_storage_mount_dirs[SharedStorageType.EBS]),
"proxy": head_node.networking.proxy.http_proxy_address if head_node.networking.proxy else "NONE",
"node_type": "HeadNode",
"cluster_user": OS_MAPPING[self.config.image.os]["user"],
"ddb_table": self.dynamodb_table_status.ref if not self._condition_is_batch() else "NONE",
"log_group_name": (
self.log_group.log_group_name if self.config.monitoring.logs.cloud_watch.enabled else "NONE"
),
"dcv_enabled": "head_node" if self.config.is_dcv_enabled else "false",
"dcv_port": head_node.dcv.port if head_node.dcv else "NONE",
"enable_intel_hpc_platform": "true" if self.config.is_intel_hpc_platform_enabled else "false",
"cw_logging_enabled": "true" if self.config.is_cw_logging_enabled else "false",
"log_rotation_enabled": "true" if self.config.is_log_rotation_enabled else "false",
"cluster_s3_bucket": self.bucket.name,
"cluster_config_s3_key": "{0}/configs/{1}".format(
self.bucket.artifact_directory, PCLUSTER_S3_ARTIFACTS_DICT.get("config_name")
),
"cluster_config_version": self.config.config_version,
"instance_types_data_version": self.config.instance_types_data_version,
"change_set_s3_key": f"{self.bucket.artifact_directory}/configs/"
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('change_set_name')}",
"instance_types_data_s3_key": f"{self.bucket.artifact_directory}/configs/"
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('instance_types_data_name')}",
"custom_node_package": self.config.custom_node_package or "",
"custom_awsbatchcli_package": self.config.custom_aws_batch_cli_package or "",
"head_node_imds_secured": str(self.config.head_node.imds.secured).lower(),
"compute_node_bootstrap_timeout": get_attr(
self.config, "dev_settings.timeouts.compute_node_bootstrap_timeout", NODE_BOOTSTRAP_TIMEOUT
),
"head_node_private_ip": "HEAD_NODE_PRIVATE_IP",
"disable_sudo_access_for_default_user": (
"true"
if self.config.deployment_settings
and self.config.deployment_settings.disable_sudo_access_default_user
else "false"
),
"launch_template_id": launch_template_id,
**(
get_slurm_specific_dna_json_for_head_node(self.config, self.scheduler_resources)
if self._condition_is_slurm()
@@ -1359,7 +1336,44 @@ def _add_head_node(self):
},
indent=4,
)

head_node_specific_dna_json = json.dumps(
{
"cluster": {
"stack_arn": self.stack.stack_id,
"raid_vol_ids": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.RAID),
"volume": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.EBS),
"ephemeral_dir": (
head_node.local_storage.ephemeral_volume.mount_dir
if head_node.local_storage.ephemeral_volume
else DEFAULT_EPHEMERAL_DIR
),
"proxy": head_node.networking.proxy.http_proxy_address if head_node.networking.proxy else "NONE",
"node_type": "HeadNode",
"ddb_table": self.dynamodb_table_status.ref if not self._condition_is_batch() else "NONE",
"dcv_enabled": "head_node" if self.config.is_dcv_enabled else "false",
"dcv_port": head_node.dcv.port if head_node.dcv else "NONE",
"common_dna_s3_key": f"{self.bucket.artifact_directory}/assets/"
f"common-dna-{self.config.config_version}.json",
"instance_types_data_version": self.config.instance_types_data_version,
"change_set_s3_key": f"{self.bucket.artifact_directory}/configs/"
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('change_set_name')}",
"instance_types_data_s3_key": f"{self.bucket.artifact_directory}/configs/"
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('instance_types_data_name')}",
"head_node_imds_secured": str(self.config.head_node.imds.secured).lower(),
"compute_node_bootstrap_timeout": get_attr(
self.config, "dev_settings.timeouts.compute_node_bootstrap_timeout", NODE_BOOTSTRAP_TIMEOUT
),
"launch_template_id": launch_template_id,
},
},
indent=4,
)
self.bucket.upload_dna_cfn_asset(
file_type=S3FileType.ASSETS,
asset_file_content=json.loads(self.config.extra_chef_attributes),
asset_name=f"extra-{self.config.config_version}.json",
format=S3FileFormat.JSON,
)
cfn_init = {
"configSets": {
"deployFiles": ["deployConfigFiles"],
@@ -1377,8 +1391,15 @@ def _add_head_node(self):
# A nosec comment is appended to the following line in order to disable the B108 check.
# The file is needed by the product
# [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory.
"/tmp/dna.json": { # nosec B108
"content": dna_json,
"/tmp/head-node-dna.json": { # nosec B108
"content": head_node_specific_dna_json,
"mode": "000644",
"owner": "root",
"group": "root",
"encoding": "plain",
},
"/tmp/common-dna.json": { # nosec B108
"content": common_dna_json,
"mode": "000644",
"owner": "root",
"group": "root",
@@ -1408,8 +1429,9 @@ def _add_head_node(self):
"touch": {"command": "touch /etc/chef/ohai/hints/ec2.json"},
"jq": {
"command": (
'jq -s ".[0] * .[1]" /tmp/dna.json /tmp/extra.json > /etc/chef/dna.json '
'|| ( echo "jq not installed"; cp /tmp/dna.json /etc/chef/dna.json )'
'jq -s ".[0] * .[1] * .[2]" /tmp/common-dna.json /tmp/head-node-dna.json /tmp/extra.json '
'> /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/common-dna.json '
"/tmp/head-node-dna.json /etc/chef/dna.json )"
)
},
},
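The head node receives `/tmp/common-dna.json` through cfn-init, while compute nodes download `assets/common-dna-<ClusterConfigVersion>.json` from the cluster bucket. That upload is not visible in this excerpt, so the following is only a sketch of how it could look with the new helper; the `upload_common_dna` wrapper and its parameters are assumptions:

```python
import json

from pcluster.models.s3_bucket import S3FileFormat, S3FileType

def upload_common_dna(bucket, common_dna_json: str, config_version: str):
    """Illustrative: publish the shared DNA so compute/login node bootstrap can fetch it."""
    return bucket.upload_dna_cfn_asset(
        file_type=S3FileType.ASSETS,  # key prefix "assets"
        asset_file_content=json.loads(common_dna_json),
        asset_name=f"common-dna-{config_version}.json",
        format=S3FileFormat.JSON,
    )
```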