fix: disable tensor parallel for falcon7b (#755)
**Reason for Change**:

vLLM requires the model's attention head count to be exactly divisible by
the number of GPUs (the tensor parallel size). falcon-7b-instruct has 71
attention heads, which is a prime number, so tensor parallel inference is
disabled for it.
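
For illustration only, a minimal sketch of the divisibility constraint described
above (the function name and values are hypothetical, not vLLM's actual code):

```go
package main

import "fmt"

// canUseTensorParallel models the constraint: the number of attention heads
// must be divisible by the tensor parallel size (number of GPUs).
func canUseTensorParallel(numAttentionHeads, tensorParallelSize int) bool {
	return numAttentionHeads%tensorParallelSize == 0
}

func main() {
	fmt.Println(canUseTensorParallel(71, 2)) // false: 71 is prime, 2 GPUs cannot split the heads evenly
	fmt.Println(canUseTensorParallel(71, 1)) // true: a single GPU always satisfies the constraint
}
```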

**Requirements**

- [x] added unit tests and e2e tests (if applicable).

---------

Signed-off-by: jerryzhuang <zhuangqhc@gmail.com>
Co-authored-by: Fei Guo <guofei@microsoft.com>
zhuangqh and Fei-Guo authored Dec 5, 2024
1 parent fea2924 commit 3882218
Showing 4 changed files with 79 additions and 43 deletions.
16 changes: 8 additions & 8 deletions pkg/model/interface.go
@@ -48,6 +48,8 @@ type PresetParam struct {
type RuntimeParam struct {
Transformers HuggingfaceTransformersParam
VLLM VLLMParam
// Disable the tensor parallelism
DisableTensorParallelism bool
}

type HuggingfaceTransformersParam struct {
@@ -87,7 +89,7 @@ func (rp *RuntimeParam) DeepCopy() RuntimeParam {
if rp == nil {
return RuntimeParam{}
}
out := RuntimeParam{}
out := *rp
out.Transformers = rp.Transformers.DeepCopy()
out.VLLM = rp.VLLM.DeepCopy()
return out
@@ -97,9 +99,7 @@ func (h *HuggingfaceTransformersParam) DeepCopy() HuggingfaceTransformersParam {
if h == nil {
return HuggingfaceTransformersParam{}
}
out := HuggingfaceTransformersParam{}
out.BaseCommand = h.BaseCommand
out.InferenceMainFile = h.InferenceMainFile
out := *h
out.TorchRunParams = make(map[string]string, len(h.TorchRunParams))
for k, v := range h.TorchRunParams {
out.TorchRunParams[k] = v
@@ -119,9 +119,7 @@ func (v *VLLMParam) DeepCopy() VLLMParam {
if v == nil {
return VLLMParam{}
}
out := VLLMParam{}
out.BaseCommand = v.BaseCommand
out.ModelName = v.ModelName
out := *v
out.DistributionParams = make(map[string]string, len(v.DistributionParams))
for k, v := range v.DistributionParams {
out.DistributionParams[k] = v
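
The DeepCopy rewrites above switch from field-by-field assignment to a value copy
(`out := *rp`) followed by explicit re-allocation of the map fields. A standalone
sketch of that pattern (illustrative types, not the kaito ones):

```go
// Param is a stand-in for the kaito parameter structs.
type Param struct {
	BaseCommand string
	RunParams   map[string]string
}

// DeepCopy copies the struct value, then re-makes the map so the copy does not
// alias the original's entries.
func (p *Param) DeepCopy() Param {
	if p == nil {
		return Param{}
	}
	out := *p // value copy duplicates scalar fields such as BaseCommand
	out.RunParams = make(map[string]string, len(p.RunParams))
	for k, v := range p.RunParams {
		out.RunParams[k] = v
	}
	return out
}
```

The value copy also keeps the method robust as new scalar fields (such as
DisableTensorParallelism) are added, since they no longer need to be copied by hand.
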
@@ -145,7 +143,9 @@ func (p *PresetParam) GetInferenceCommand(runtime RuntimeName, skuNumGPUs string
if p.VLLM.ModelName != "" {
p.VLLM.ModelRunParams["served-model-name"] = p.VLLM.ModelName
}
p.VLLM.ModelRunParams["tensor-parallel-size"] = skuNumGPUs
if !p.DisableTensorParallelism {
p.VLLM.ModelRunParams["tensor-parallel-size"] = skuNumGPUs
}
modelCommand := utils.BuildCmdStr(p.VLLM.BaseCommand, p.VLLM.ModelRunParams)
return utils.ShellCmd(modelCommand)
default:
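
The net effect of the new flag on command construction, as a standalone sketch
(an illustrative helper, not the actual GetInferenceCommand code):

```go
package main

import "fmt"

// buildVLLMRunParams mirrors the conditional added above: the tensor-parallel-size
// flag is only set when tensor parallelism has not been disabled for the model.
func buildVLLMRunParams(skuNumGPUs string, disableTensorParallelism bool) map[string]string {
	params := map[string]string{"served-model-name": "falcon-7b-instruct"}
	if !disableTensorParallelism {
		params["tensor-parallel-size"] = skuNumGPUs
	}
	return params
}

func main() {
	fmt.Println(buildVLLMRunParams("2", false)) // includes tensor-parallel-size=2
	fmt.Println(buildVLLMRunParams("2", true))  // omits the flag, so vLLM falls back to a single GPU
}
```
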
53 changes: 31 additions & 22 deletions pkg/utils/test/testModel.go
@@ -10,14 +10,15 @@ import (
"github.com/kaito-project/kaito/pkg/utils/plugin"
)

type testModel struct{}
type baseTestModel struct{}

func (*testModel) GetInferenceParameters() *model.PresetParam {
func (*baseTestModel) GetInferenceParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
RuntimeParam: model.RuntimeParam{
VLLM: model.VLLMParam{
BaseCommand: "python3 /workspace/vllm/inference_api.py",
ModelName: "mymodel",
},
Transformers: model.HuggingfaceTransformersParam{
BaseCommand: "accelerate launch",
@@ -27,25 +28,40 @@ func (*testModel) GetInferenceParameters() *model.PresetParam {
ReadinessTimeout: time.Duration(30) * time.Minute,
}
}
func (*testModel) GetTuningParameters() *model.PresetParam {
func (*baseTestModel) GetTuningParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
ReadinessTimeout: time.Duration(30) * time.Minute,
}
}
func (*baseTestModel) SupportDistributedInference() bool {
return true
}
func (*baseTestModel) SupportTuning() bool {
return true
}

type testModel struct {
baseTestModel
}

func (*testModel) SupportDistributedInference() bool {
return false
}
func (*testModel) SupportTuning() bool {
return true

type testDistributedModel struct {
baseTestModel
}

type testDistributedModel struct{}
type testNoTensorParallelModel struct {
baseTestModel
}

func (*testDistributedModel) GetInferenceParameters() *model.PresetParam {
func (*testNoTensorParallelModel) GetInferenceParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
RuntimeParam: model.RuntimeParam{
DisableTensorParallelism: true,
VLLM: model.VLLMParam{
BaseCommand: "python3 /workspace/vllm/inference_api.py",
},
@@ -57,30 +73,23 @@ func (*testDistributedModel) GetInferenceParameters() *model.PresetParam {
ReadinessTimeout: time.Duration(30) * time.Minute,
}
}
func (*testDistributedModel) GetTuningParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
ReadinessTimeout: time.Duration(30) * time.Minute,
}
}
func (*testDistributedModel) SupportDistributedInference() bool {
return true
}
func (*testDistributedModel) SupportTuning() bool {
return true
func (*testNoTensorParallelModel) SupportDistributedInference() bool {
return false
}

func RegisterTestModel() {
var test testModel
plugin.KaitoModelRegister.Register(&plugin.Registration{
Name: "test-model",
Instance: &test,
Instance: &testModel{},
})

var testDistributed testDistributedModel
plugin.KaitoModelRegister.Register(&plugin.Registration{
Name: "test-distributed-model",
Instance: &testDistributed,
Instance: &testDistributedModel{},
})

plugin.KaitoModelRegister.Register(&plugin.Registration{
Name: "test-no-tensor-parallel-model",
Instance: &testNoTensorParallelModel{},
})
}
43 changes: 30 additions & 13 deletions pkg/workspace/inference/preset-inferences_test.go
@@ -46,7 +46,21 @@ func TestCreatePresetInference(t *testing.T) {
workload: "Deployment",
// No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams
// So expected cmd consists of shell command and inference file
expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2",
expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2 --served-model-name=mymodel",
hasAdapters: false,
},

"test-model-no-parallel/vllm": {
workspace: test.MockWorkspaceWithPresetVLLM,
nodeCount: 1,
modelName: "test-no-tensor-parallel-model",
callMocks: func(c *test.MockClient) {
c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil)
},
workload: "Deployment",
// No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams
// So expected cmd consists of shell command and inference file
expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py",
hasAdapters: false,
},

@@ -58,7 +72,7 @@
c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil)
},
workload: "Deployment",
expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2",
expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2 --served-model-name=mymodel",
hasAdapters: true,
expectedVolume: "adapter-volume",
},
@@ -148,7 +162,7 @@
createdWorkload = "StatefulSet"
}
if tc.workload != createdWorkload {
t.Errorf("%s: returned worklaod type is wrong", k)
t.Errorf("%s: returned workload type is wrong", k)
}

var workloadCmd string
@@ -162,7 +176,7 @@ func TestCreatePresetInference(t *testing.T) {
params := toParameterMap(strings.Split(workloadCmd, "--")[1:])

expectedMaincmd := strings.Split(tc.expectedCmd, "--")[0]
expectedParams := toParameterMap(strings.Split(workloadCmd, "--")[1:])
expectedParams := toParameterMap(strings.Split(tc.expectedCmd, "--")[1:])

if mainCmd != expectedMaincmd {
t.Errorf("%s main cmdline is not expected, got %s, expect %s ", k, workloadCmd, tc.expectedCmd)
@@ -204,16 +218,19 @@

func toParameterMap(in []string) map[string]string {
ret := make(map[string]string)
for _, each := range in {
r := strings.Split(each, "=")
k := r[0]
var v string
if len(r) == 1 {
v = ""
} else {
v = r[1]
for _, eachToken := range in {
for _, each := range strings.Split(eachToken, " ") {
each = strings.TrimSpace(each)
r := strings.Split(each, "=")
k := r[0]
var v string
if len(r) == 1 {
v = ""
} else {
v = r[1]
}
ret[k] = v
}
ret[k] = v
}
return ret
}
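
For illustration, assuming the reworked toParameterMap above, the test's expected
command can be tokenized like this (the empty key produced by a trailing space
appears on both sides of the comparison, so it does not affect the test):

```go
cmd := "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2 --served-model-name=mymodel"
params := toParameterMap(strings.Split(cmd, "--")[1:])
// params maps "tensor-parallel-size" -> "2" and "served-model-name" -> "mymodel"
// (plus an empty "" key left over from the space before the second flag).
```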
10 changes: 10 additions & 0 deletions presets/workspace/models/falcon/model.go
@@ -81,6 +81,11 @@ func (*falcon7b) GetInferenceParameters() *model.PresetParam {
ModelName: "falcon-7b",
ModelRunParams: falconRunParamsVLLM,
},
// vLLM requires the model's attention head count to be exactly divisible
// by the number of GPUs (the tensor parallel size).
// falcon-7b has 71 attention heads, which is a prime number,
// so tensor parallel inference is disabled for this preset.
DisableTensorParallelism: true,
},
ReadinessTimeout: time.Duration(30) * time.Minute,
Tag: PresetFalconTagMap["Falcon7B"],
@@ -138,6 +143,11 @@ func (*falcon7bInst) GetInferenceParameters() *model.PresetParam {
ModelName: "falcon-7b-instruct",
ModelRunParams: falconRunParamsVLLM,
},
// vLLM requires the model's attention head count to be exactly divisible
// by the number of GPUs (the tensor parallel size).
// falcon-7b-instruct has 71 attention heads, which is a prime number,
// so tensor parallel inference is disabled for this preset.
DisableTensorParallelism: true,
},
ReadinessTimeout: time.Duration(30) * time.Minute,
Tag: PresetFalconTagMap["Falcon7BInstruct"],
