From a9afb4cb81b5eca61c8bce1150cfd7d9b4eebcc1 Mon Sep 17 00:00:00 2001 From: dspeck Date: Tue, 10 Oct 2023 15:50:25 +0000 Subject: [PATCH 1/2] Add set gpu driver install. Remove unused comments. --- ...cpBatchAsyncBackendJobExecutionActor.scala | 5 -- .../api/GcpBatchRequestFactoryImpl.scala | 57 ++++++++++--------- .../models/GcpBatchRuntimeAttributes.scala | 9 ++- .../google/batch/runnable/UserRunnable.scala | 3 - 4 files changed, 37 insertions(+), 37 deletions(-) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActor.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActor.scala index b2b10d92afb..ae228ad503b 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActor.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActor.scala @@ -829,12 +829,7 @@ class GcpBatchAsyncBackendJobExecutionActor(override val standardParams: Standar _ <- evaluateRuntimeAttributes _ <- uploadScriptFile() customLabels <- Future.fromTry(GcpLabel.fromWorkflowOptions(workflowDescriptor.workflowOptions)) - _ = customLabels.foreach(x => println(s"ZZZ Custom Labels - $x")) batchParameters <- generateInputOutputParameters - _ = batchParameters.fileInputParameters.foreach(x => println(s"ZZZ File InputParameters - $x")) - _ = batchParameters.jobInputParameters.foreach(x => println(s"ZZZ InputParameters - $x")) - _ = batchParameters.fileOutputParameters.foreach(x => println(s"ZZZ File OutputParameters - $x")) - _ = batchParameters.jobOutputParameters.foreach(x => println(s"ZZZ OutputParameters - $x")) createParameters = createBatchParameters(batchParameters, customLabels) drsLocalizationManifestCloudPath = jobPaths.callExecutionRoot / GcpBatchJobPaths.DrsLocalizationManifestName _ <- uploadDrsLocalizationManifest(createParameters, drsLocalizationManifestCloudPath) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala index 435992b8cd8..bc08b6d850e 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala @@ -1,17 +1,14 @@ package cromwell.backend.google.batch.api -import com.google.cloud.batch.v1.AllocationPolicy.Accelerator -import com.google.cloud.batch.v1.{DeleteJobRequest, GetJobRequest, JobName} -import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration -import cromwell.backend.google.batch.models.GcpBatchRequest -import cromwell.backend.google.batch.runnable._ -import cromwell.backend.google.batch.util.BatchUtilityConversions -import com.google.cloud.batch.v1.AllocationPolicy.{AttachedDisk, InstancePolicy, InstancePolicyOrTemplate, LocationPolicy, NetworkInterface, NetworkPolicy, ProvisioningModel} +import com.google.cloud.batch.v1.AllocationPolicy._ import com.google.cloud.batch.v1.LogsPolicy.Destination -import com.google.cloud.batch.v1.{AllocationPolicy, ComputeResource, CreateJobRequest, Job, LogsPolicy, Runnable, ServiceAccount, TaskGroup, TaskSpec, Volume} +import com.google.cloud.batch.v1.{AllocationPolicy, ComputeResource, CreateJobRequest, DeleteJobRequest, GetJobRequest, Job, JobName, LogsPolicy, Runnable, ServiceAccount, TaskGroup, TaskSpec, Volume} import com.google.protobuf.Duration import cromwell.backend.google.batch.io.GcpBatchAttachedDisk -import cromwell.backend.google.batch.models.VpcAndSubnetworkProjectLabelValues +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration +import cromwell.backend.google.batch.models.{GcpBatchRequest, VpcAndSubnetworkProjectLabelValues} +import cromwell.backend.google.batch.runnable._ +import cromwell.backend.google.batch.util.BatchUtilityConversions import scala.jdk.CollectionConverters._ @@ -61,10 +58,11 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe .build } - private def createInstancePolicy(cpuPlatform: String, spotModel: ProvisioningModel, accelerators: Option[Accelerator.Builder], attachedDisks: List[AttachedDisk]) = { + private def createInstancePolicy(cpuPlatform: String, spotModel: ProvisioningModel, accelerators: Option[Accelerator.Builder], attachedDisks: List[AttachedDisk]): InstancePolicy.Builder = { //set GPU count to 0 if not included in workflow - val gpuAccelerators = accelerators.getOrElse(Accelerator.newBuilder.setCount(0).setType("")) + val gpuAccelerators = accelerators.getOrElse(Accelerator.newBuilder.setCount(0).setType("")) // TODO: Driver version + val instancePolicy = InstancePolicy .newBuilder .setProvisioningModel(spotModel) @@ -72,6 +70,10 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe .setMinCpuPlatform(cpuPlatform) .buildPartial() + + //val acceleratorCount = accelerators.mkString + println(s"gpu count ${accelerators.mkString}") + //add GPUs if GPU count is greater than 1 if (gpuAccelerators.getCount >= 1) { val instancePolicyGpu = instancePolicy.toBuilder @@ -83,7 +85,6 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe } - private def createNetworkPolicy(networkInterface: NetworkInterface): NetworkPolicy = { NetworkPolicy .newBuilder @@ -113,8 +114,9 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe } - private def createAllocationPolicy(data: GcpBatchRequest, locationPolicy: LocationPolicy, instancePolicy: InstancePolicy, networkPolicy: NetworkPolicy, serviceAccount: ServiceAccount) = { - AllocationPolicy + private def createAllocationPolicy(data: GcpBatchRequest, locationPolicy: LocationPolicy, instancePolicy: InstancePolicy, networkPolicy: NetworkPolicy, serviceAccount: ServiceAccount, accelerators: Option[Accelerator.Builder]) = { + + val allocationPolicy = AllocationPolicy .newBuilder .setLocation(locationPolicy) .setNetwork(networkPolicy) @@ -122,13 +124,19 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe .putLabels("goog-batch-worker", "true") .putAllLabels((data.createParameters.googleLabels.map(label => label.key -> label.value).toMap.asJava)) .setServiceAccount(serviceAccount) - .addInstances(InstancePolicyOrTemplate - .newBuilder - .setPolicy(instancePolicy) - .build) - .build + .buildPartial() + + val gpuAccelerators = accelerators.getOrElse(Accelerator.newBuilder.setCount(0).setType("")) + + //add GPUs if GPU count is greater than or equal to 1 + if (gpuAccelerators.getCount >= 1) { + allocationPolicy.toBuilder.addInstances(InstancePolicyOrTemplate.newBuilder.setPolicy(instancePolicy).setInstallGpuDrivers(true).build) + } else { + allocationPolicy.toBuilder.addInstances(InstancePolicyOrTemplate.newBuilder.setPolicy(instancePolicy).build) + } } + override def submitRequest(data: GcpBatchRequest): CreateJobRequest = { val batchAttributes = data.gcpBatchParameters.batchAttributes @@ -160,10 +168,6 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe // Batch defaults to 1 task val taskCount: Long = 1 - println(f"command script container path ${data.createParameters.commandScriptContainerPath}") - println(f"cloud workflow root ${data.createParameters.cloudWorkflowRoot}") - println(f"all parameters:\n ${data.createParameters.allParameters.mkString("\n")}") - // parse preemption value and set value for Spot. Spot is replacement for preemptible val spotModel = toProvisioningModel(runtimeAttributes.preemptible) @@ -205,11 +209,11 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe val taskGroup: TaskGroup = createTaskGroup(taskCount, taskSpec) val instancePolicy = createInstancePolicy(cpuPlatform, spotModel, accelerators, allDisks) val locationPolicy = LocationPolicy.newBuilder.addAllowedLocations(zones).build - val allocationPolicy = createAllocationPolicy(data, locationPolicy, instancePolicy.build, networkPolicy, gcpSa) + val allocationPolicy = createAllocationPolicy(data, locationPolicy, instancePolicy.build, networkPolicy, gcpSa, accelerators) val job = Job .newBuilder .addTaskGroups(taskGroup) - .setAllocationPolicy(allocationPolicy) + .setAllocationPolicy(allocationPolicy.build()) .putLabels("submitter", "cromwell") // label to signify job submitted by cromwell for larger tracking purposes within GCP batch .putLabels("goog-batch-worker", "true") .putAllLabels((data.createParameters.googleLabels.map(label => label.key -> label.value).toMap.asJava)) @@ -218,9 +222,6 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe .setDestination(Destination.CLOUD_LOGGING) .build) - println(f"job shell ${data.createParameters.jobShell}") - println(f"script container path ${data.createParameters.commandScriptContainerPath}") - println(f"labels ${data.createParameters.googleLabels}") CreateJobRequest .newBuilder diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala index 9fe3327f35d..c9dc62b0cc7 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala @@ -19,6 +19,8 @@ import wom.values.{WomArray, WomBoolean, WomInteger, WomString, WomValue} object GpuResource { + val DefaultNvidiaDriverVersion = "418.87.00" + final case class GpuType(name: String) { override def toString: String = name } @@ -99,6 +101,9 @@ object GcpBatchRuntimeAttributes { private def cpuPlatformValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[String] = cpuPlatformValidationInstance private def gpuTypeValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[GpuType] = GpuTypeValidation.optional + val GpuDriverVersionKey = "nvidiaDriverVersion" + private def gpuDriverValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[String] = new StringRuntimeAttributesValidation(GpuDriverVersionKey).optional + private def gpuCountValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[Int Refined Positive] = GpuValidation.optional private def gpuMinValidation(runtimeConfig: Option[Config]):OptionalRuntimeAttributesValidation[Int Refined Positive] = GpuValidation.optionalMin @@ -159,6 +164,7 @@ object GcpBatchRuntimeAttributes { StandardValidatedRuntimeAttributesBuilder.default(runtimeConfig).withValidation( gpuCountValidation(runtimeConfig), gpuTypeValidation(runtimeConfig), + gpuDriverValidation(runtimeConfig), cpuValidation(runtimeConfig), cpuPlatformValidation(runtimeConfig), cpuMinValidation(runtimeConfig), @@ -189,8 +195,9 @@ object GcpBatchRuntimeAttributes { .extractOption(gpuTypeValidation(runtimeAttrsConfig).key, validatedRuntimeAttributes) lazy val gpuCount: Option[Int Refined Positive] = RuntimeAttributesValidation .extractOption(gpuCountValidation(runtimeAttrsConfig).key, validatedRuntimeAttributes) + lazy val gpuDriver: Option[String] = RuntimeAttributesValidation.extractOption(gpuDriverValidation(runtimeAttrsConfig).key, validatedRuntimeAttributes) - val gpuResource: Option[GpuResource] = if (gpuType.isDefined || gpuCount.isDefined) { + val gpuResource: Option[GpuResource] = if (gpuType.isDefined || gpuCount.isDefined || gpuDriver.isDefined) { Option(GpuResource(gpuType.getOrElse(GpuType.DefaultGpuType), gpuCount .getOrElse(GpuType.DefaultGpuCount))) } else { diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/UserRunnable.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/UserRunnable.scala index b7f5a026476..d2d499b3127 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/UserRunnable.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/UserRunnable.scala @@ -8,9 +8,6 @@ trait UserRunnable { def userRunnables(createParameters: CreateBatchJobParameters, volumes: List[Volume]): List[Runnable] = { - println(f"job shell ${createParameters.jobShell}") - println(f"script container path ${createParameters.commandScriptContainerPath}") - val userRunnable = RunnableBuilder.userRunnable( docker = createParameters.dockerImage, scriptContainerPath = createParameters.commandScriptContainerPath.pathAsString, From 0745bcaded7fe011ad883143bd48652ed0f690c9 Mon Sep 17 00:00:00 2001 From: dspeck Date: Tue, 10 Oct 2023 18:45:42 +0000 Subject: [PATCH 2/2] remove debugging println --- .../backend/google/batch/api/GcpBatchRequestFactoryImpl.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala index bc08b6d850e..9c482d852d1 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala @@ -70,10 +70,6 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe .setMinCpuPlatform(cpuPlatform) .buildPartial() - - //val acceleratorCount = accelerators.mkString - println(s"gpu count ${accelerators.mkString}") - //add GPUs if GPU count is greater than 1 if (gpuAccelerators.getCount >= 1) { val instancePolicyGpu = instancePolicy.toBuilder