Commit
Merge pull request #1450 from losipiuk/lo/cherry-picks-ca-13
CA cherry picks to 1.13 and update version
k8s-ci-robot authored Nov 27, 2018
2 parents 7d42c40 + ab47574 commit 6ffe3e7
Showing 9 changed files with 257 additions and 17 deletions.
22 changes: 13 additions & 9 deletions cluster-autoscaler/cloudprovider/aws/aws_manager.go
@@ -19,6 +19,7 @@ limitations under the License.
package aws

import (
"errors"
"fmt"
"io"
"math/rand"
@@ -183,27 +184,30 @@ func (m *AwsManager) GetAsgNodes(ref AwsRef) ([]AwsInstanceRef, error) {

func (m *AwsManager) getAsgTemplate(asg *asg) (*asgTemplate, error) {
if len(asg.AvailabilityZones) < 1 {
return nil, fmt.Errorf("Unable to get first AvailabilityZone for %s", asg.Name)
return nil, fmt.Errorf("Unable to get first AvailabilityZone for ASG %q", asg.Name)
}

az := asg.AvailabilityZones[0]
region := az[0 : len(az)-1]

if len(asg.AvailabilityZones) > 1 {
klog.Warningf("Found multiple availability zones, using %s\n", az)
klog.Warningf("Found multiple availability zones for ASG %q; using %s\n", asg.Name, az)
}

instanceTypeName, err := m.buildInstanceType(asg)
if err != nil {
return nil, err
}

- return &asgTemplate{
- InstanceType: InstanceTypes[instanceTypeName],
- Region: region,
- Zone: az,
- Tags: asg.Tags,
- }, nil
+ if t, ok := InstanceTypes[instanceTypeName]; ok {
+ return &asgTemplate{
+ InstanceType: t,
+ Region: region,
+ Zone: az,
+ Tags: asg.Tags,
+ }, nil
+ }
+ return nil, fmt.Errorf("ASG %q uses the unknown EC2 instance type %q", asg.Name, instanceTypeName)
}

func (m *AwsManager) buildInstanceType(asg *asg) (string, error) {
@@ -213,7 +217,7 @@ func (m *AwsManager) buildInstanceType(asg *asg) (string, error) {
return m.ec2Service.getInstanceTypeByLT(asg.LaunchTemplateName, asg.LaunchTemplateVersion)
}

return "", fmt.Errorf("Unable to get instance type from launch config or launch template")
return "", errors.New("Unable to get instance type from launch config or launch template")
}

func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*apiv1.Node, error) {
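The getAsgTemplate change above replaces an unchecked map index with a comma-ok lookup, so an ASG whose launch configuration or launch template names an instance type missing from the static InstanceTypes table now yields an explicit error instead of a template carrying a nil InstanceType. A minimal standalone sketch of that pattern (the table contents, helper name, and printed output are illustrative, not the real code):

```go
package main

import "fmt"

// instanceType mirrors the shape of the entries in the static EC2 instance type table.
type instanceType struct {
	InstanceType string
	VCPU         int64
	MemoryMb     int64
}

// lookupInstanceType uses the comma-ok form of a map lookup: an unknown key
// produces an error rather than silently returning a nil pointer.
func lookupInstanceType(table map[string]*instanceType, name string) (*instanceType, error) {
	if t, ok := table[name]; ok {
		return t, nil
	}
	return nil, fmt.Errorf("unknown EC2 instance type %q", name)
}

func main() {
	table := map[string]*instanceType{
		"t3.micro": {InstanceType: "t3.micro", VCPU: 2, MemoryMb: 1024},
	}
	if t, err := lookupInstanceType(table, "t3.micro"); err == nil {
		fmt.Println(t.InstanceType, t.VCPU, t.MemoryMb) // t3.micro 2 1024
	}
	if _, err := lookupInstanceType(table, "nonexistent.xlarge"); err != nil {
		fmt.Println(err) // unknown EC2 instance type "nonexistent.xlarge"
	}
}
```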
75 changes: 75 additions & 0 deletions cluster-autoscaler/cloudprovider/aws/aws_manager_test.go
@@ -228,6 +228,81 @@ func TestBuildInstanceType(t *testing.T) {
assert.Equal(t, instanceType, builtInstanceType)
}

func TestGetASGTemplate(t *testing.T) {
const (
knownInstanceType = "t3.micro"
region = "us-east-1"
az = region + "a"
ltName = "launcher"
ltVersion = "1"
)

tags := []*autoscaling.TagDescription{
{
Key: aws.String("k8s.io/cluster-autoscaler/node-template/taint/dedicated"),
Value: aws.String("foo:NoSchedule"),
},
}

tests := []struct {
description string
instanceType string
availabilityZones []string
error bool
}{
{"insufficient availability zones",
knownInstanceType, []string{}, true},
{"single availability zone",
knownInstanceType, []string{az}, false},
{"multiple availability zones",
knownInstanceType, []string{az, "us-west-1b"}, false},
{"unknown instance type",
"nonexistent.xlarge", []string{az}, true},
}

for _, test := range tests {
t.Run(test.description, func(t *testing.T) {
s := &EC2Mock{}
s.On("DescribeLaunchTemplateVersions", &ec2.DescribeLaunchTemplateVersionsInput{
LaunchTemplateName: aws.String(ltName),
Versions: []*string{aws.String(ltVersion)},
}).Return(&ec2.DescribeLaunchTemplateVersionsOutput{
LaunchTemplateVersions: []*ec2.LaunchTemplateVersion{
{
LaunchTemplateData: &ec2.ResponseLaunchTemplateData{
InstanceType: aws.String(test.instanceType),
},
},
},
})

m, err := createAWSManagerInternal(nil, cloudprovider.NodeGroupDiscoveryOptions{}, nil, &ec2Wrapper{s})
assert.NoError(t, err)

asg := &asg{
AwsRef: AwsRef{Name: "sample"},
AvailabilityZones: test.availabilityZones,
LaunchTemplateName: ltName,
LaunchTemplateVersion: ltVersion,
Tags: tags,
}

template, err := m.getAsgTemplate(asg)
if test.error {
assert.Error(t, err)
} else {
assert.NoError(t, err)
if assert.NotNil(t, template) {
assert.Equal(t, test.instanceType, template.InstanceType.InstanceType)
assert.Equal(t, region, template.Region)
assert.Equal(t, test.availabilityZones[0], template.Zone)
assert.Equal(t, tags, template.Tags)
}
}
})
}
}

/* Disabled due to flakiness. See https://github.com/kubernetes/autoscaler/issues/608
func TestFetchAutoAsgs(t *testing.T) {
min, max := 1, 10
102 changes: 102 additions & 0 deletions cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go
@@ -255,6 +255,12 @@ var InstanceTypes = map[string]*instanceType{
MemoryMb: 124928,
GPU: 0,
},
"f1.4xlarge": {
InstanceType: "f1.4xlarge",
VCPU: 16,
MemoryMb: 249856,
GPU: 0,
},
"g2": {
InstanceType: "g2",
VCPU: 32,
@@ -297,6 +303,12 @@ var InstanceTypes = map[string]*instanceType{
MemoryMb: 249856,
GPU: 2,
},
"g3s.xlarge": {
InstanceType: "g3s.xlarge",
VCPU: 4,
MemoryMb: 31232,
GPU: 0,
},
"h1": {
InstanceType: "h1",
VCPU: 64,
@@ -567,6 +579,42 @@ var InstanceTypes = map[string]*instanceType{
MemoryMb: 16384,
GPU: 0,
},
"m5a.12xlarge": {
InstanceType: "m5a.12xlarge",
VCPU: 48,
MemoryMb: 196608,
GPU: 0,
},
"m5a.24xlarge": {
InstanceType: "m5a.24xlarge",
VCPU: 96,
MemoryMb: 393216,
GPU: 0,
},
"m5a.2xlarge": {
InstanceType: "m5a.2xlarge",
VCPU: 8,
MemoryMb: 32768,
GPU: 0,
},
"m5a.4xlarge": {
InstanceType: "m5a.4xlarge",
VCPU: 16,
MemoryMb: 65536,
GPU: 0,
},
"m5a.large": {
InstanceType: "m5a.large",
VCPU: 2,
MemoryMb: 8192,
GPU: 0,
},
"m5a.xlarge": {
InstanceType: "m5a.xlarge",
VCPU: 4,
MemoryMb: 16384,
GPU: 0,
},
"m5d": {
InstanceType: "m5d",
VCPU: 96,
@@ -777,6 +825,42 @@ var InstanceTypes = map[string]*instanceType{
MemoryMb: 32768,
GPU: 0,
},
"r5a.12xlarge": {
InstanceType: "r5a.12xlarge",
VCPU: 48,
MemoryMb: 393216,
GPU: 0,
},
"r5a.24xlarge": {
InstanceType: "r5a.24xlarge",
VCPU: 96,
MemoryMb: 786432,
GPU: 0,
},
"r5a.2xlarge": {
InstanceType: "r5a.2xlarge",
VCPU: 8,
MemoryMb: 65536,
GPU: 0,
},
"r5a.4xlarge": {
InstanceType: "r5a.4xlarge",
VCPU: 16,
MemoryMb: 131072,
GPU: 0,
},
"r5a.large": {
InstanceType: "r5a.large",
VCPU: 2,
MemoryMb: 16384,
GPU: 0,
},
"r5a.xlarge": {
InstanceType: "r5a.xlarge",
VCPU: 4,
MemoryMb: 32768,
GPU: 0,
},
"r5d": {
InstanceType: "r5d",
VCPU: 96,
@@ -909,6 +993,24 @@ var InstanceTypes = map[string]*instanceType{
MemoryMb: 16384,
GPU: 0,
},
"u-12tb1": {
InstanceType: "u-12tb1",
VCPU: 448,
MemoryMb: 12582912,
GPU: 0,
},
"u-6tb1": {
InstanceType: "u-6tb1",
VCPU: 448,
MemoryMb: 6291456,
GPU: 0,
},
"u-9tb1": {
InstanceType: "u-9tb1",
VCPU: 448,
MemoryMb: 9437184,
GPU: 0,
},
"x1": {
InstanceType: "x1",
VCPU: 128,
4 changes: 4 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -60,6 +60,10 @@ type AutoscalingOptions struct {
EstimatorName string
// ExpanderName sets the type of node group expander to be used in scale up
ExpanderName string
// IgnoreDaemonSetsUtilization is whether CA will ignore DaemonSet pods when calculating resource utilization for scaling down
IgnoreDaemonSetsUtilization bool
// IgnoreMirrorPodsUtilization is whether CA will ignore Mirror pods when calculating resource utilization for scaling down
IgnoreMirrorPodsUtilization bool
// MaxGracefulTerminationSec is maximum number of seconds scale down waits for pods to terminate before
// removing the node from cloud provider.
MaxGracefulTerminationSec int
2 changes: 1 addition & 1 deletion cluster-autoscaler/core/scale_down.go
@@ -399,7 +399,7 @@ func (sd *ScaleDown) UpdateUnneededNodes(
klog.Errorf("Node info for %s not found", node.Name)
continue
}
- utilInfo, err := simulator.CalculateUtilization(node, nodeInfo)
+ utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization)

if err != nil {
klog.Warningf("Failed to calculate utilization for %s: %v", node.Name, err)
9 changes: 9 additions & 0 deletions cluster-autoscaler/main.go
@@ -140,6 +140,11 @@ var (
expanderFlag = flag.String("expander", expander.RandomExpanderName,
"Type of node group expander to be used in scale up. Available values: ["+strings.Join(expander.AvailableExpanders, ",")+"]")

ignoreDaemonSetsUtilization = flag.Bool("ignore-daemonsets-utilization", false,
"Should CA ignore DaemonSet pods when calculating resource utilization for scaling down")
ignoreMirrorPodsUtilization = flag.Bool("ignore-mirror-pods-utilization", false,
"Should CA ignore Mirror pods when calculating resource utilization for scaling down")

writeStatusConfigMapFlag = flag.Bool("write-status-configmap", true, "Should CA write status information to a configmap")
maxInactivityTimeFlag = flag.Duration("max-inactivity", 10*time.Minute, "Maximum time from last recorded autoscaler activity before automatic restart")
maxFailingTimeFlag = flag.Duration("max-failing-time", 15*time.Minute, "Maximum time from last recorded successful autoscaler run before automatic restart")
@@ -179,6 +184,8 @@ func createAutoscalingOptions() config.AutoscalingOptions {
OkTotalUnreadyCount: *okTotalUnreadyCount,
EstimatorName: *estimatorFlag,
ExpanderName: *expanderFlag,
IgnoreDaemonSetsUtilization: *ignoreDaemonSetsUtilization,
IgnoreMirrorPodsUtilization: *ignoreMirrorPodsUtilization,
MaxEmptyBulkDelete: *maxEmptyBulkDeleteFlag,
MaxGracefulTerminationSec: *maxGracefulTerminationFlag,
MaxNodeProvisionTime: *maxNodeProvisionTime,
@@ -314,6 +321,8 @@ func run(healthCheck *metrics.HealthCheck) {
}

func main() {
klog.InitFlags(nil)

leaderElection := defaultLeaderElectionConfiguration()
leaderElection.LeaderElect = true

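The two new flags above default to false and are copied into AutoscalingOptions in createAutoscalingOptions. A minimal standalone sketch of that wiring using only the standard library flag package (the flag names and help text match the diff; everything else is illustrative):

```go
package main

import (
	"flag"
	"fmt"
)

func main() {
	ignoreDaemonSets := flag.Bool("ignore-daemonsets-utilization", false,
		"Should CA ignore DaemonSet pods when calculating resource utilization for scaling down")
	ignoreMirrorPods := flag.Bool("ignore-mirror-pods-utilization", false,
		"Should CA ignore Mirror pods when calculating resource utilization for scaling down")
	flag.Parse()

	// In the real binary these values end up on config.AutoscalingOptions as
	// IgnoreDaemonSetsUtilization and IgnoreMirrorPodsUtilization.
	fmt.Println(*ignoreDaemonSets, *ignoreMirrorPods)
}
```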
26 changes: 22 additions & 4 deletions cluster-autoscaler/simulator/cluster.go
@@ -23,6 +23,7 @@ import (
"math/rand"
"time"

"k8s.io/autoscaler/cluster-autoscaler/utils/drain"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/glogx"
scheduler_util "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
@@ -151,19 +152,19 @@ func FindEmptyNodesToRemove(candidates []*apiv1.Node, pods []*apiv1.Pod) []*apiv
// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) utilization.
// Per resource utilization is the sum of requests for it divided by allocatable. It also returns the individual
// cpu and memory utilization.
- func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulercache.NodeInfo) (utilInfo UtilizationInfo, err error) {
- cpu, err := calculateUtilizationOfResource(node, nodeInfo, apiv1.ResourceCPU)
+ func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulercache.NodeInfo, skipDaemonSetPods, skipMirrorPods bool) (utilInfo UtilizationInfo, err error) {
+ cpu, err := calculateUtilizationOfResource(node, nodeInfo, apiv1.ResourceCPU, skipDaemonSetPods, skipMirrorPods)
if err != nil {
return UtilizationInfo{}, err
}
- mem, err := calculateUtilizationOfResource(node, nodeInfo, apiv1.ResourceMemory)
+ mem, err := calculateUtilizationOfResource(node, nodeInfo, apiv1.ResourceMemory, skipDaemonSetPods, skipMirrorPods)
if err != nil {
return UtilizationInfo{}, err
}
return UtilizationInfo{CpuUtil: cpu, MemUtil: mem, Utilization: math.Max(cpu, mem)}, nil
}

- func calculateUtilizationOfResource(node *apiv1.Node, nodeInfo *schedulercache.NodeInfo, resourceName apiv1.ResourceName) (float64, error) {
+ func calculateUtilizationOfResource(node *apiv1.Node, nodeInfo *schedulercache.NodeInfo, resourceName apiv1.ResourceName, skipDaemonSetPods, skipMirrorPods bool) (float64, error) {
nodeAllocatable, found := node.Status.Allocatable[resourceName]
if !found {
return 0, fmt.Errorf("Failed to get %v from %s", resourceName, node.Name)
@@ -173,6 +174,14 @@ func calculateUtilizationOfResource(node *apiv1.Node, nodeInfo *schedulercache.N
}
podsRequest := resource.MustParse("0")
for _, pod := range nodeInfo.Pods() {
// factor daemonset pods out of the utilization calculations
if skipDaemonSetPods && isDaemonSet(pod) {
continue
}
// factor mirror pods out of the utilization calculations
if skipMirrorPods && drain.IsMirrorPod(pod) {
continue
}
for _, container := range pod.Spec.Containers {
if resourceValue, found := container.Resources.Requests[resourceName]; found {
podsRequest.Add(resourceValue)
@@ -283,3 +292,12 @@ func shuffleNodes(nodes []*apiv1.Node) []*apiv1.Node {
}
return result
}

func isDaemonSet(pod *apiv1.Pod) bool {
for _, ownerReference := range pod.ObjectMeta.OwnerReferences {
if ownerReference.Kind == "DaemonSet" {
return true
}
}
return false
}
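As the doc comment on CalculateUtilization says, node utilization is the maximum of CPU and memory utilization, each computed as the sum of pod requests divided by the node's allocatable amount; the new parameters let DaemonSet and mirror pod requests be left out of that sum. A self-contained sketch of just that arithmetic (plain Go without the Kubernetes API types; all numbers are made up for illustration):

```go
package main

import "fmt"

// podRequest is a simplified stand-in for a pod's resource requests.
type podRequest struct {
	cpuMilli  int64 // requested CPU in millicores
	memMiB    int64 // requested memory in MiB
	daemonSet bool  // owned by a DaemonSet
	mirrorPod bool  // static/mirror pod
}

// utilization mirrors the shape of CalculateUtilization: per-resource utilization is
// the sum of requests divided by allocatable, and the node value is the maximum of the two.
func utilization(pods []podRequest, allocCPUMilli, allocMemMiB int64, skipDaemonSets, skipMirrorPods bool) float64 {
	var cpu, mem int64
	for _, p := range pods {
		if skipDaemonSets && p.daemonSet {
			continue
		}
		if skipMirrorPods && p.mirrorPod {
			continue
		}
		cpu += p.cpuMilli
		mem += p.memMiB
	}
	cpuUtil := float64(cpu) / float64(allocCPUMilli)
	memUtil := float64(mem) / float64(allocMemMiB)
	if cpuUtil > memUtil {
		return cpuUtil
	}
	return memUtil
}

func main() {
	pods := []podRequest{
		{cpuMilli: 500, memMiB: 512},                  // ordinary workload pod
		{cpuMilli: 100, memMiB: 128, daemonSet: true}, // e.g. a log shipper
		{cpuMilli: 250, memMiB: 256, mirrorPod: true}, // e.g. a static control-plane pod
	}
	// A node with 2 CPUs and 4 GiB allocatable.
	fmt.Println(utilization(pods, 2000, 4096, false, false)) // 0.425: every pod counts
	fmt.Println(utilization(pods, 2000, 4096, true, true))   // 0.25: DaemonSet and mirror pods excluded
}
```

With both skip flags enabled, a node mostly occupied by DaemonSet and static pods reports lower utilization and becomes a scale-down candidate sooner, which is the behaviour the new options are meant to enable.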