[Scaling] #6428

Draft · wants to merge 4 commits into develop
7 changes: 7 additions & 0 deletions cli/src/pcluster/models/s3_bucket.py
@@ -42,6 +42,9 @@ class S3FileType(Enum):
"""Define S3 file types."""

ASSETS = "assets"
HEAD_NODE_DNA_ASSETS = f"{ASSETS}/HeadNode"
COMPUTE_DNA_ASSETS = f"{ASSETS}/ComputeNode"
LOGIN_NODE_DNA_ASSETS = f"{ASSETS}/LoginNode"
CONFIGS = "configs"
TEMPLATES = "templates"
CUSTOM_RESOURCES = "custom_resources"
@@ -325,6 +328,10 @@ def upload_cfn_asset(self, asset_file_content, asset_name: str, format=S3FileFor
file_type=S3FileType.ASSETS, content=asset_file_content, file_name=asset_name, format=format
)

def upload_dna_cfn_asset(self, asset_file_content, asset_name: str, file_type: S3FileType = S3FileType.ASSETS, format=S3FileFormat.YAML):
"""Upload DNA CloudFormation assets to the S3 bucket under the given file type prefix."""
return self.upload_file(file_type=file_type, content=asset_file_content, file_name=asset_name, format=format)

def upload_resources(self, resource_dir, custom_artifacts_name):
"""
Upload custom resources to S3 bucket.
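For reviewers, a minimal usage sketch of the new helper, assuming an `S3Bucket` instance named `bucket` and a plain-dict DNA payload. The `upload_compute_dna` wrapper and its arguments are illustrative; only the enum members and `upload_dna_cfn_asset` come from this diff, and the resulting key is meant to mirror the one fetched by `compute_node/user_data.sh` further down:

```python
from pcluster.models.s3_bucket import S3FileFormat, S3FileType

def upload_compute_dna(bucket, launch_template_id: str, config_version: str, dna: dict):
    """Illustrative wrapper: publish a per-queue compute DNA file under assets/ComputeNode/."""
    return bucket.upload_dna_cfn_asset(
        asset_file_content=dna,
        asset_name=f"compute-dna-{launch_template_id}-{config_version}.json",
        file_type=S3FileType.COMPUTE_DNA_ASSETS,  # enum value resolves to "assets/ComputeNode"
        format=S3FileFormat.JSON,
    )
```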
23 changes: 19 additions & 4 deletions cli/src/pcluster/resources/compute_node/user_data.sh
@@ -51,6 +51,15 @@ datasource_list: [ Ec2, None ]
output:
all: "| tee -a /var/log/cloud-init-output.log | logger -t user-data -s 2>/dev/ttyS0"
write_files:
- path: /tmp/stack-arn.json
permissions: '0644'
owner: root:root
content: |
{
"cluster":{
"stack_arn": "${AWS::StackId}"
}
}
- path: /tmp/bootstrap.sh
permissions: '0744'
owner: root:root
@@ -99,14 +108,18 @@ write_files:

[ -f /etc/profile.d/proxy.sh ] && . /etc/profile.d/proxy.sh

$CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -s ${AWS::StackName} -v -c deployFiles -r ${LaunchTemplateResourceId} --region ${AWS::Region} --url ${CloudFormationUrl} --role ${CfnInitRole} || error_exit 'Failed to bootstrap the compute node. Please check /var/log/cfn-init.log in the compute node or in CloudWatch logs. Please refer to https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting-v3.html#troubleshooting-v3-get-logs for more details on ParallelCluster logs.'

[ -f /etc/profile.d/aws-cli-default-config.sh ] && . /etc/profile.d/aws-cli-default-config.sh

custom_cookbook=${CustomChefCookbook}
export _region=${AWS::Region}

s3_url=${AWS::URLSuffix}
echo "Running S3 commands"
export AWS_RETRY_MODE=standard
S3API_RESULT=$(aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/common-dna-${ClusterConfigVersion}.json" --region ${AWS::Region} /tmp/common-dna.json 2>&1) || error_exit "${!S3API_RESULT}"
S3API_RESULT=$(aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/ComputeNode/compute-dna-${LaunchTemplateResourceId}-${ClusterConfigVersion}.json" --region ${AWS::Region} /tmp/compute-dna.json 2>&1) || error_exit "${!S3API_RESULT}"
S3API_RESULT=$(aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/extra-${ClusterConfigVersion}.json" --region ${AWS::Region} /tmp/extra.json 2>&1) || error_exit "${!S3API_RESULT}"
echo "Completed S3 commands"

if [ "${!custom_cookbook}" != "NONE" ]; then
if [[ "${!custom_cookbook}" =~ ^s3://([^/]*)(.*) ]]; then
# Socket timeout = 15s; the actual timeout is 8*(cli-connect-timeout), so (15s*8)*3retries=6min.
@@ -139,9 +152,11 @@ write_files:
vendor_cookbook
fi
cd /tmp
mkdir -p /etc/chef/ohai/hints
touch /etc/chef/ohai/hints/ec2.json

start=$(date +%s)

jq -s ".[0] * .[1] * .[2] * .[3]" /tmp/common-dna.json /tmp/compute-dna.json /tmp/stack-arn.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/common-dna.json /tmp/compute-dna.json /etc/chef/dna.json )
{
CINC_CMD="cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist"
FR_CMD="/opt/parallelcluster/scripts/fetch_and_run"
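The bootstrap script above assembles `/etc/chef/dna.json` with `jq -s ".[0] * .[1] * .[2] * .[3]"`. As a hedged illustration of what that produces, here is a small Python sketch of jq's recursive object merge (later documents win on conflicting scalar keys, nested objects are merged); the file names follow the script, but the helper itself is not part of the PR:

```python
import json
from functools import reduce

def deep_merge(left: dict, right: dict) -> dict:
    """Mirror jq's `*` operator: merge nested dicts recursively, right operand wins otherwise."""
    merged = dict(left)
    for key, value in right.items():
        if isinstance(merged.get(key), dict) and isinstance(value, dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

def build_compute_dna(paths=("/tmp/common-dna.json", "/tmp/compute-dna.json",
                             "/tmp/stack-arn.json", "/tmp/extra.json")):
    docs = []
    for path in paths:
        with open(path) as f:
            docs.append(json.load(f))
    # Later files override earlier ones, so node-specific and extra attributes
    # take precedence over the common DNA inside the single top-level "cluster" object.
    return reduce(deep_merge, docs, {})
```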
28 changes: 26 additions & 2 deletions cli/src/pcluster/templates/cdk_builder_utils.py
@@ -648,7 +648,13 @@ def _build_policy(self) -> List[iam.PolicyStatement]:
iam.PolicyStatement(
sid="ResourcesS3Bucket",
effect=iam.Effect.ALLOW,
actions=["s3:GetObject", "s3:GetObjectVersion", "s3:GetBucketLocation", "s3:ListBucket"],
actions=[
"s3:GetObject",
"s3:PutObject",
"s3:GetObjectVersion",
"s3:GetBucketLocation",
"s3:ListBucket",
],
resources=[
self._format_arn(service="s3", resource=self._cluster_bucket.name, region="", account=""),
self._format_arn(
@@ -820,7 +826,9 @@ def _build_policy(self) -> List[iam.PolicyStatement]:
(
"secretsmanager:GetSecretValue"
if password_secret_arn.service == "secretsmanager"
else "ssm:GetParameter" if password_secret_arn.service == "ssm" else None
else "ssm:GetParameter"
if password_secret_arn.service == "ssm"
else None
)
],
effect=iam.Effect.ALLOW,
@@ -997,7 +1005,9 @@ def __init__(
node: Union[HeadNode, BaseQueue, LoginNodesPool],
shared_storage_infos: dict,
name: str,
cluster_bucket: S3Bucket,
):
self._cluster_bucket = cluster_bucket
super().__init__(scope, id, config, node, shared_storage_infos, name)

def _build_policy(self) -> List[iam.PolicyStatement]:
@@ -1023,6 +1033,20 @@ def _build_policy(self) -> List[iam.PolicyStatement]:
)
],
),
iam.PolicyStatement(
sid="S3GetLaunchTemplate",
actions=["s3:GetObject", "s3:ListBucket"],
effect=iam.Effect.ALLOW,
resources=[
self._format_arn(service="s3", resource=self._cluster_bucket.name, region="", account=""),
self._format_arn(
service="s3",
resource=f"{self._cluster_bucket.name}/{self._cluster_bucket.artifact_directory}/*",
region="",
account="",
),
],
),
iam.PolicyStatement(
sid="CloudFormation",
actions=[
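A hedged rendering of what the new `S3GetLaunchTemplate` statement is expected to resolve to, assuming `_format_arn` emits standard S3 ARNs with empty region/account (as the existing `ResourcesS3Bucket` statement does); the bucket name and artifact directory below are placeholders, not values from this PR:

```python
# Hypothetical resolved policy statement for a cluster bucket named
# "parallelcluster-example-bucket" with artifact directory "parallelcluster/clusters/demo".
s3_get_launch_template = {
    "Sid": "S3GetLaunchTemplate",
    "Effect": "Allow",
    "Action": ["s3:GetObject", "s3:ListBucket"],
    "Resource": [
        # Bucket-level ARN: needed for s3:ListBucket.
        "arn:aws:s3:::parallelcluster-example-bucket",
        # Object-level ARN: needed for s3:GetObject on the uploaded DNA assets.
        "arn:aws:s3:::parallelcluster-example-bucket/parallelcluster/clusters/demo/*",
    ],
}
```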
102 changes: 62 additions & 40 deletions cli/src/pcluster/templates/cluster_stack.py
@@ -76,7 +76,7 @@
PCLUSTER_S3_ARTIFACTS_DICT,
SLURM_PORTS_RANGE,
)
from pcluster.models.s3_bucket import S3Bucket
from pcluster.models.s3_bucket import S3Bucket, S3FileFormat, S3FileType
from pcluster.templates.awsbatch_builder import AwsBatchConstruct
from pcluster.templates.cdk_builder_utils import (
CdkLaunchTemplateBuilder,
@@ -813,12 +813,12 @@ def _add_storage_security_group(self, storage_cfn_id, storage):
rules.append(egress_rule)

if sg_type == "Storage":
ingress_rule.cfn_options.deletion_policy = ingress_rule.cfn_options.update_replace_policy = (
storage_deletion_policy
)
egress_rule.cfn_options.deletion_policy = egress_rule.cfn_options.update_replace_policy = (
storage_deletion_policy
)
ingress_rule.cfn_options.deletion_policy = (
ingress_rule.cfn_options.update_replace_policy
) = storage_deletion_policy
egress_rule.cfn_options.deletion_policy = (
egress_rule.cfn_options.update_replace_policy
) = storage_deletion_policy

return storage_security_group, rules

@@ -1010,9 +1010,9 @@ def _add_managed_fsx(self, fsx_id, id, mount_name, shared_fsx):
)
for rule in security_group_rules:
fsx_resource.add_depends_on(rule)
fsx_resource.cfn_options.deletion_policy = fsx_resource.cfn_options.update_replace_policy = (
convert_deletion_policy(shared_fsx.deletion_policy)
)
fsx_resource.cfn_options.deletion_policy = (
fsx_resource.cfn_options.update_replace_policy
) = convert_deletion_policy(shared_fsx.deletion_policy)
fsx_id = fsx_resource.ref
self._add_dra(fsx_id, shared_fsx)
# Get MountName for new filesystem. DNSName cannot be retrieved from CFN and will be generated in cookbook
@@ -1256,12 +1256,10 @@ def _add_head_node(self):
head_node_launch_template.add_metadata("Comment", "AWS ParallelCluster Head Node")
# CloudFormation::Init metadata

dna_json = json.dumps(
common_dna_json = json.dumps(
{
"cluster": {
"stack_name": self._stack_name,
"stack_arn": self.stack.stack_id,
"raid_vol_ids": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.RAID),
"raid_shared_dir": to_comma_separated_string(
self.shared_storage_mount_dirs[SharedStorageType.RAID]
),
@@ -1306,49 +1304,28 @@ def _add_head_node(self):
self.shared_storage_attributes[SharedStorageType.FSX]["FileSystemTypes"]
),
"fsx_shared_dirs": to_comma_separated_string(self.shared_storage_mount_dirs[SharedStorageType.FSX]),
"volume": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.EBS),
"scheduler": self.config.scheduling.scheduler,
"ephemeral_dir": (
head_node.local_storage.ephemeral_volume.mount_dir
if head_node.local_storage.ephemeral_volume
else DEFAULT_EPHEMERAL_DIR
),
"ebs_shared_dirs": to_comma_separated_string(self.shared_storage_mount_dirs[SharedStorageType.EBS]),
"proxy": head_node.networking.proxy.http_proxy_address if head_node.networking.proxy else "NONE",
"node_type": "HeadNode",
"cluster_user": OS_MAPPING[self.config.image.os]["user"],
"ddb_table": self.dynamodb_table_status.ref if not self._condition_is_batch() else "NONE",
"log_group_name": (
self.log_group.log_group_name if self.config.monitoring.logs.cloud_watch.enabled else "NONE"
),
"dcv_enabled": "head_node" if self.config.is_dcv_enabled else "false",
"dcv_port": head_node.dcv.port if head_node.dcv else "NONE",
"enable_intel_hpc_platform": "true" if self.config.is_intel_hpc_platform_enabled else "false",
"cw_logging_enabled": "true" if self.config.is_cw_logging_enabled else "false",
"log_rotation_enabled": "true" if self.config.is_log_rotation_enabled else "false",
"cluster_s3_bucket": self.bucket.name,
"cluster_config_s3_key": "{0}/configs/{1}".format(
self.bucket.artifact_directory, PCLUSTER_S3_ARTIFACTS_DICT.get("config_name")
),
"cluster_config_version": self.config.config_version,
"instance_types_data_version": self.config.instance_types_data_version,
"change_set_s3_key": f"{self.bucket.artifact_directory}/configs/"
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('change_set_name')}",
"instance_types_data_s3_key": f"{self.bucket.artifact_directory}/configs/"
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('instance_types_data_name')}",
"custom_node_package": self.config.custom_node_package or "",
"custom_awsbatchcli_package": self.config.custom_aws_batch_cli_package or "",
"head_node_imds_secured": str(self.config.head_node.imds.secured).lower(),
"compute_node_bootstrap_timeout": get_attr(
self.config, "dev_settings.timeouts.compute_node_bootstrap_timeout", NODE_BOOTSTRAP_TIMEOUT
),
"head_node_private_ip": "HEAD_NODE_PRIVATE_IP",
"disable_sudo_access_for_default_user": (
"true"
if self.config.deployment_settings
and self.config.deployment_settings.disable_sudo_access_default_user
else "false"
),
"launch_template_id": launch_template_id,
**(
get_slurm_specific_dna_json_for_head_node(self.config, self.scheduler_resources)
if self._condition_is_slurm()
@@ -1359,7 +1336,44 @@ def _add_head_node(self):
},
indent=4,
)

head_node_specific_dna_json = json.dumps(
{
"cluster": {
"stack_arn": self.stack.stack_id,
"raid_vol_ids": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.RAID),
"volume": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.EBS),
"ephemeral_dir": (
head_node.local_storage.ephemeral_volume.mount_dir
if head_node.local_storage.ephemeral_volume
else DEFAULT_EPHEMERAL_DIR
),
"proxy": head_node.networking.proxy.http_proxy_address if head_node.networking.proxy else "NONE",
"node_type": "HeadNode",
"ddb_table": self.dynamodb_table_status.ref if not self._condition_is_batch() else "NONE",
"dcv_enabled": "head_node" if self.config.is_dcv_enabled else "false",
"dcv_port": head_node.dcv.port if head_node.dcv else "NONE",
"common_dna_s3_key": f"{self.bucket.artifact_directory}/assets/"
f"common-dna-{self.config.config_version}.json",
"instance_types_data_version": self.config.instance_types_data_version,
"change_set_s3_key": f"{self.bucket.artifact_directory}/configs/"
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('change_set_name')}",
"instance_types_data_s3_key": f"{self.bucket.artifact_directory}/configs/"
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('instance_types_data_name')}",
"head_node_imds_secured": str(self.config.head_node.imds.secured).lower(),
"compute_node_bootstrap_timeout": get_attr(
self.config, "dev_settings.timeouts.compute_node_bootstrap_timeout", NODE_BOOTSTRAP_TIMEOUT
),
"launch_template_id": launch_template_id,
},
},
indent=4,
)
self.bucket.upload_dna_cfn_asset(
file_type=S3FileType.ASSETS,
asset_file_content=json.loads(self.config.extra_chef_attributes),
asset_name=f"extra-{self.config.config_version}.json",
format=S3FileFormat.JSON,
)
cfn_init = {
"configSets": {
"deployFiles": ["deployConfigFiles"],
@@ -1377,8 +1391,15 @@ def _add_head_node(self):
# A nosec comment is appended to the following line in order to disable the B108 check.
# The file is needed by the product
# [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory.
"/tmp/dna.json": { # nosec B108
"content": dna_json,
"/tmp/head-node-dna.json": { # nosec B108
"content": head_node_specific_dna_json,
"mode": "000644",
"owner": "root",
"group": "root",
"encoding": "plain",
},
"/tmp/common-dna.json": { # nosec B108
"content": common_dna_json,
"mode": "000644",
"owner": "root",
"group": "root",
@@ -1408,8 +1429,9 @@ def _add_head_node(self):
"touch": {"command": "touch /etc/chef/ohai/hints/ec2.json"},
"jq": {
"command": (
'jq -s ".[0] * .[1]" /tmp/dna.json /tmp/extra.json > /etc/chef/dna.json '
'|| ( echo "jq not installed"; cp /tmp/dna.json /etc/chef/dna.json )'
'jq -s ".[0] * .[1] * .[2]" /tmp/common-dna.json /tmp/head-node-dna.json /tmp/extra.json '
'> /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/common-dna.json '
"/tmp/head-node-dna.json /etc/chef/dna.json )"
)
},
},
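The head node receives `/tmp/common-dna.json` through cfn-init, while compute nodes download `assets/common-dna-<ClusterConfigVersion>.json` from the cluster bucket. That upload is not visible in this excerpt, so the following is only a sketch of how it could look with the new helper; the `upload_common_dna` wrapper and its parameters are assumptions:

```python
import json

from pcluster.models.s3_bucket import S3FileFormat, S3FileType

def upload_common_dna(bucket, common_dna_json: str, config_version: str):
    """Illustrative: publish the shared DNA so compute/login node bootstrap can fetch it."""
    return bucket.upload_dna_cfn_asset(
        file_type=S3FileType.ASSETS,  # key prefix "assets"
        asset_file_content=json.loads(common_dna_json),
        asset_name=f"common-dna-{config_version}.json",
        format=S3FileFormat.JSON,
    )
```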