Skip to content

Commit

Permalink
Merge pull request #278 from Poor12/fix-gpu-zero
Browse files Browse the repository at this point in the history
  • Loading branch information
gary-lgy authored Nov 20, 2023
2 parents e653c57 + b6f96b4 commit 6bfa139
Show file tree
Hide file tree
Showing 2 changed files with 170 additions and 17 deletions.
8 changes: 4 additions & 4 deletions pkg/controllers/scheduler/framework/plugins/rsp/rsp.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@ const (
allocatableResource string = "allocatable"
)

var ErrNoCPUResource = errors.New("no cpu resource")

type ClusterCapacityWeight struct{}

var _ framework.ReplicasPlugin = &ClusterCapacityWeight{}
Expand Down Expand Up @@ -209,7 +207,7 @@ func CalcWeightLimit(
for member, resources := range allocatables {
resourceQuantity, ok := resources[resourceName]
if !ok {
err = ErrNoCPUResource
err = fmt.Errorf("no %s resource", resourceName)
return
}
weightLimit[member] = int64(math.Round(float64(resourceQuantity.Value()) / sum * sumWeight * supplyLimitRatio))
Expand Down Expand Up @@ -244,7 +242,7 @@ func AvailableToPercentage(
for member, resources := range clusterAvailables {
resourceQuantity, ok := resources[resourceName]
if !ok {
err = ErrNoCPUResource
err = fmt.Errorf("no %s resource", resourceName)
return
}

Expand Down Expand Up @@ -295,6 +293,7 @@ func QueryAvailable(clusters []*fedcorev1a1.FederatedCluster) map[string]corev1.
available := make(corev1.ResourceList)
available[corev1.ResourceCPU] = resource.MustParse("0")
available[corev1.ResourceMemory] = resource.MustParse("0")
available[framework.ResourceGPU] = resource.MustParse("0")
// sum up by resource
for resourceName := range cluster.Status.Resources.Available {
if val, ok := available[resourceName]; ok {
Expand All @@ -316,6 +315,7 @@ func QueryAllocatable(clusters []*fedcorev1a1.FederatedCluster) map[string]corev
allocatable := make(corev1.ResourceList)
allocatable[corev1.ResourceCPU] = resource.MustParse("0")
allocatable[corev1.ResourceMemory] = resource.MustParse("0")
allocatable[framework.ResourceGPU] = resource.MustParse("0")
// sum up by resource
for resourceName := range cluster.Status.Resources.Allocatable {
if val, ok := allocatable[resourceName]; ok {
Expand Down
179 changes: 166 additions & 13 deletions pkg/controllers/scheduler/framework/plugins/rsp/rsp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,19 +72,40 @@ func makeClusterWithCPU(name string, allocatable, available int) *fedcorev1a1.Fe
return cluster
}

// makeClusterWithGPU builds a FederatedCluster named name whose status
// reports the given GPU allocatable/available quantities. A negative
// value for either argument leaves Status.Resources zero-valued,
// simulating a cluster with missing node-level resource info.
func makeClusterWithGPU(name string, allocatable, available int) *fedcorev1a1.FederatedCluster {
	c := &fedcorev1a1.FederatedCluster{
		ObjectMeta: metav1.ObjectMeta{Name: name},
	}
	if allocatable < 0 || available < 0 {
		// Missing node-level info: report no resources at all.
		return c
	}
	c.Status.Resources = fedcorev1a1.Resources{
		Allocatable: corev1.ResourceList{
			framework.ResourceGPU: resource.MustParse(strconv.Itoa(allocatable)),
		},
		Available: corev1.ResourceList{
			framework.ResourceGPU: resource.MustParse(strconv.Itoa(available)),
		},
	}
	return c
}

func TestCalcWeightLimit(t *testing.T) {
type args struct {
clusters []*fedcorev1a1.FederatedCluster
supplyLimitRatio float64
}
tests := []struct {
name string
resourceName corev1.ResourceName
args args
wantWeightLimit map[string]int64
wantErr assert.ErrorAssertionFunc
}{
{
name: "two clusters have the same resource",
name: "two clusters have the same resource",
resourceName: corev1.ResourceCPU,
args: args{
clusters: []*fedcorev1a1.FederatedCluster{
makeClusterWithCPU("cluster1", 100, 0),
Expand All @@ -99,7 +120,8 @@ func TestCalcWeightLimit(t *testing.T) {
wantErr: assert.NoError,
},
{
name: "3 clusters have different resource amount",
name: "3 clusters have different resource amount",
resourceName: corev1.ResourceCPU,
args: args{
clusters: []*fedcorev1a1.FederatedCluster{
makeClusterWithCPU("cluster1", 3000, 0),
Expand All @@ -116,7 +138,8 @@ func TestCalcWeightLimit(t *testing.T) {
wantErr: assert.NoError,
},
{
name: "1 cluster node level info missing",
name: "1 cluster node level info missing",
resourceName: corev1.ResourceCPU,
args: args{
clusters: []*fedcorev1a1.FederatedCluster{
makeClusterWithCPU("cluster1", 3000, -1),
Expand All @@ -133,7 +156,8 @@ func TestCalcWeightLimit(t *testing.T) {
wantErr: assert.NoError,
},
{
name: "all clusters node level info missing",
name: "all clusters node level info missing",
resourceName: corev1.ResourceCPU,
args: args{
clusters: []*fedcorev1a1.FederatedCluster{
makeClusterWithCPU("cluster1", 3000, -1),
Expand All @@ -149,10 +173,64 @@ func TestCalcWeightLimit(t *testing.T) {
},
wantErr: assert.NoError,
},
{
name: "all cluster nodes have no gpu",
resourceName: framework.ResourceGPU,
args: args{
clusters: []*fedcorev1a1.FederatedCluster{
makeClusterWithCPU("cluster1", 3000, 3000),
makeClusterWithCPU("cluster2", 7000, 7000),
makeClusterWithCPU("cluster3", 3000, 3000),
},
supplyLimitRatio: 1.0,
},
wantWeightLimit: map[string]int64{
"cluster1": int64(333),
"cluster2": int64(333),
"cluster3": int64(333),
},
wantErr: assert.NoError,
},
{
name: "two cluster nodes have no gpu and one has",
resourceName: framework.ResourceGPU,
args: args{
clusters: []*fedcorev1a1.FederatedCluster{
makeClusterWithCPU("cluster1", 3000, 3000),
makeClusterWithCPU("cluster2", 7000, 7000),
makeClusterWithGPU("cluster3", 3000, 3000),
},
supplyLimitRatio: 1.0,
},
wantWeightLimit: map[string]int64{
"cluster1": int64(0),
"cluster2": int64(0),
"cluster3": int64(1000),
},
wantErr: assert.NoError,
},
{
name: "two cluster nodes have gpu and one does not",
resourceName: framework.ResourceGPU,
args: args{
clusters: []*fedcorev1a1.FederatedCluster{
makeClusterWithCPU("cluster1", 3000, 3000),
makeClusterWithGPU("cluster2", 7000, 7000),
makeClusterWithGPU("cluster3", 3000, 3000),
},
supplyLimitRatio: 1.0,
},
wantWeightLimit: map[string]int64{
"cluster1": int64(0),
"cluster2": int64(700),
"cluster3": int64(300),
},
wantErr: assert.NoError,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
gotWeightLimit, err := CalcWeightLimit(tt.args.clusters, corev1.ResourceCPU, tt.args.supplyLimitRatio)
gotWeightLimit, err := CalcWeightLimit(tt.args.clusters, tt.resourceName, tt.args.supplyLimitRatio)
if !tt.wantErr(t, err, fmt.Sprintf("CalcWeightLimit(%v)", tt.args.clusters)) {
return
}
Expand All @@ -166,24 +244,27 @@ func TestAvailableToPercentage(t *testing.T) {
clusterAvailables map[string]corev1.ResourceList
weightLimit map[string]int64
}
makeArgs := func(clusters ...*fedcorev1a1.FederatedCluster) args {
makeArgs := func(resourceName corev1.ResourceName, clusters ...*fedcorev1a1.FederatedCluster) args {
return args{
clusterAvailables: QueryAvailable(clusters),
weightLimit: func() map[string]int64 {
weightLimit, _ := CalcWeightLimit(clusters, corev1.ResourceCPU, 1.0)
weightLimit, _ := CalcWeightLimit(clusters, resourceName, 1.0)
return weightLimit
}(),
}
}
tests := []struct {
name string
resourceName corev1.ResourceName
args args
wantClusterWeights map[string]int64
wantErr assert.ErrorAssertionFunc
}{
{
name: "test#1",
name: "test#1",
resourceName: corev1.ResourceCPU,
args: makeArgs(
corev1.ResourceCPU,
makeClusterWithCPU("cluster1", 100, 50),
makeClusterWithCPU("cluster2", 100, 50),
),
Expand All @@ -194,8 +275,10 @@ func TestAvailableToPercentage(t *testing.T) {
wantErr: assert.NoError,
},
{
name: "test#2",
name: "test#2",
resourceName: corev1.ResourceCPU,
args: makeArgs(
corev1.ResourceCPU,
makeClusterWithCPU("cluster1", 100, 40),
makeClusterWithCPU("cluster2", 100, 10),
),
Expand All @@ -207,8 +290,10 @@ func TestAvailableToPercentage(t *testing.T) {
wantErr: assert.NoError,
},
{
name: "empty node level info",
name: "empty node level info",
resourceName: corev1.ResourceCPU,
args: makeArgs(
corev1.ResourceCPU,
makeClusterWithCPU("cluster1", -1, -1),
),
wantClusterWeights: map[string]int64{
Expand All @@ -217,8 +302,10 @@ func TestAvailableToPercentage(t *testing.T) {
wantErr: assert.NoError,
},
{
name: "1 cluster node level info missing",
name: "1 cluster node level info missing",
resourceName: corev1.ResourceCPU,
args: makeArgs(
corev1.ResourceCPU,
makeClusterWithCPU("cluster1", -1, -1),
makeClusterWithCPU("cluster2", 400, 100),
makeClusterWithCPU("cluster3", 200, 100),
Expand All @@ -231,8 +318,26 @@ func TestAvailableToPercentage(t *testing.T) {
wantErr: assert.NoError,
},
{
name: "all clusters node level info missing",
name: "all clusters node level info missing",
resourceName: corev1.ResourceCPU,
args: makeArgs(
corev1.ResourceCPU,
makeClusterWithCPU("cluster1", -1, -1),
makeClusterWithCPU("cluster2", -1, 100),
makeClusterWithCPU("cluster3", -1, 100),
),
wantClusterWeights: map[string]int64{
"cluster1": int64(333),
"cluster2": int64(333),
"cluster3": int64(333),
},
wantErr: assert.NoError,
},
{
name: "all cluster nodes have no gpu",
resourceName: framework.ResourceGPU,
args: makeArgs(
framework.ResourceGPU,
makeClusterWithCPU("cluster1", -1, -1),
makeClusterWithCPU("cluster2", -1, 100),
makeClusterWithCPU("cluster3", -1, 100),
Expand All @@ -244,10 +349,58 @@ func TestAvailableToPercentage(t *testing.T) {
},
wantErr: assert.NoError,
},
{
name: "two cluster nodes have no gpu and one has",
resourceName: framework.ResourceGPU,
args: makeArgs(
framework.ResourceGPU,
makeClusterWithCPU("cluster1", 3000, 3000),
makeClusterWithCPU("cluster2", 7000, 7000),
makeClusterWithGPU("cluster3", 3000, 3000),
),
wantClusterWeights: map[string]int64{
"cluster1": int64(0),
"cluster2": int64(0),
"cluster3": int64(1000),
},
wantErr: assert.NoError,
},
{
name: "two cluster nodes have gpu and one does not",
resourceName: framework.ResourceGPU,
args: makeArgs(
framework.ResourceGPU,
makeClusterWithCPU("cluster1", 3000, 3000),
makeClusterWithGPU("cluster2", 7000, 7000),
makeClusterWithGPU("cluster3", 3000, 3000),
),
wantClusterWeights: map[string]int64{
"cluster1": int64(0),
"cluster2": int64(700),
"cluster3": int64(300),
},
wantErr: assert.NoError,
},
{
name: "no nvidia.com/gpu resource",
resourceName: framework.ResourceGPU,
args: args{
clusterAvailables: map[string]corev1.ResourceList{
"cluster1": {
corev1.ResourceCPU: *resource.NewQuantity(1000, resource.DecimalSI),
},
"cluster2": {
framework.ResourceGPU: *resource.NewQuantity(1000, resource.DecimalSI),
},
},
},
wantClusterWeights: map[string]int64{},
wantErr: assert.Error,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
gotClusterWeights, err := AvailableToPercentage(tt.args.clusterAvailables, corev1.ResourceCPU, tt.args.weightLimit)
gotClusterWeights, err := AvailableToPercentage(tt.args.clusterAvailables, tt.resourceName, tt.args.weightLimit)
if !tt.wantErr(
t,
err,
Expand Down

0 comments on commit 6bfa139

Please sign in to comment.