fix: disable tensor parallel for falcon7b (#755)
**Reason for Change**:

vLLM requires the model's attention head count to be exactly divisible by
the number of GPUs (the tensor parallel size). falcon-7b-instruct has 71
attention heads, which is a prime number, so tensor parallel inference is
disabled for it.
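
For illustration only, a minimal sketch of the divisibility constraint described
above (the function name and values are hypothetical, not vLLM's actual code):

```go
package main

import "fmt"

// canUseTensorParallel models the constraint: the number of attention heads
// must be divisible by the tensor parallel size (number of GPUs).
func canUseTensorParallel(numAttentionHeads, tensorParallelSize int) bool {
	return numAttentionHeads%tensorParallelSize == 0
}

func main() {
	fmt.Println(canUseTensorParallel(71, 2)) // false: 71 is prime, 2 GPUs cannot split the heads evenly
	fmt.Println(canUseTensorParallel(71, 1)) // true: a single GPU always satisfies the constraint
}
```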

**Requirements**

- [x] added unit tests and e2e tests (if applicable).

---------

Signed-off-by: jerryzhuang <zhuangqhc@gmail.com>
Co-authored-by: Fei Guo <guofei@microsoft.com>
zhuangqh and Fei-Guo authored Dec 5, 2024
1 parent fea2924 commit 3882218
Showing 4 changed files with 79 additions and 43 deletions.
16 changes: 8 additions & 8 deletions pkg/model/interface.go
@@ -48,6 +48,8 @@ type PresetParam struct {
type RuntimeParam struct {
Transformers HuggingfaceTransformersParam
VLLM VLLMParam
// Disable the tensor parallelism
DisableTensorParallelism bool
}

type HuggingfaceTransformersParam struct {
@@ -87,7 +89,7 @@ func (rp *RuntimeParam) DeepCopy() RuntimeParam {
if rp == nil {
return RuntimeParam{}
}
out := RuntimeParam{}
out := *rp
out.Transformers = rp.Transformers.DeepCopy()
out.VLLM = rp.VLLM.DeepCopy()
return out
@@ -97,9 +99,7 @@ func (h *HuggingfaceTransformersParam) DeepCopy() HuggingfaceTransformersParam {
if h == nil {
return HuggingfaceTransformersParam{}
}
out := HuggingfaceTransformersParam{}
out.BaseCommand = h.BaseCommand
out.InferenceMainFile = h.InferenceMainFile
out := *h
out.TorchRunParams = make(map[string]string, len(h.TorchRunParams))
for k, v := range h.TorchRunParams {
out.TorchRunParams[k] = v
@@ -119,9 +119,7 @@ func (v *VLLMParam) DeepCopy() VLLMParam {
if v == nil {
return VLLMParam{}
}
out := VLLMParam{}
out.BaseCommand = v.BaseCommand
out.ModelName = v.ModelName
out := *v
out.DistributionParams = make(map[string]string, len(v.DistributionParams))
for k, v := range v.DistributionParams {
out.DistributionParams[k] = v
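
The DeepCopy rewrites above switch from field-by-field assignment to a value copy
(`out := *rp`) followed by explicit re-allocation of the map fields. A standalone
sketch of that pattern (illustrative types, not the kaito ones):

```go
// Param is a stand-in for the kaito parameter structs.
type Param struct {
	BaseCommand string
	RunParams   map[string]string
}

// DeepCopy copies the struct value, then re-makes the map so the copy does not
// alias the original's entries.
func (p *Param) DeepCopy() Param {
	if p == nil {
		return Param{}
	}
	out := *p // value copy duplicates scalar fields such as BaseCommand
	out.RunParams = make(map[string]string, len(p.RunParams))
	for k, v := range p.RunParams {
		out.RunParams[k] = v
	}
	return out
}
```

The value copy also keeps the method robust as new scalar fields (such as
DisableTensorParallelism) are added, since they no longer need to be copied by hand.
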
@@ -145,7 +143,9 @@ func (p *PresetParam) GetInferenceCommand(runtime RuntimeName, skuNumGPUs string
if p.VLLM.ModelName != "" {
p.VLLM.ModelRunParams["served-model-name"] = p.VLLM.ModelName
}
p.VLLM.ModelRunParams["tensor-parallel-size"] = skuNumGPUs
if !p.DisableTensorParallelism {
p.VLLM.ModelRunParams["tensor-parallel-size"] = skuNumGPUs
}
modelCommand := utils.BuildCmdStr(p.VLLM.BaseCommand, p.VLLM.ModelRunParams)
return utils.ShellCmd(modelCommand)
default:
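
The net effect of the new flag on command construction, as a standalone sketch
(an illustrative helper, not the actual GetInferenceCommand code):

```go
package main

import "fmt"

// buildVLLMRunParams mirrors the conditional added above: the tensor-parallel-size
// flag is only set when tensor parallelism has not been disabled for the model.
func buildVLLMRunParams(skuNumGPUs string, disableTensorParallelism bool) map[string]string {
	params := map[string]string{"served-model-name": "falcon-7b-instruct"}
	if !disableTensorParallelism {
		params["tensor-parallel-size"] = skuNumGPUs
	}
	return params
}

func main() {
	fmt.Println(buildVLLMRunParams("2", false)) // includes tensor-parallel-size=2
	fmt.Println(buildVLLMRunParams("2", true))  // omits the flag, so vLLM falls back to a single GPU
}
```
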
53 changes: 31 additions & 22 deletions pkg/utils/test/testModel.go
@@ -10,14 +10,15 @@ import (
"github.com/kaito-project/kaito/pkg/utils/plugin"
)

type testModel struct{}
type baseTestModel struct{}

func (*testModel) GetInferenceParameters() *model.PresetParam {
func (*baseTestModel) GetInferenceParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
RuntimeParam: model.RuntimeParam{
VLLM: model.VLLMParam{
BaseCommand: "python3 /workspace/vllm/inference_api.py",
ModelName: "mymodel",
},
Transformers: model.HuggingfaceTransformersParam{
BaseCommand: "accelerate launch",
@@ -27,25 +28,40 @@ func (*testModel) GetInferenceParameters() *model.PresetParam {
ReadinessTimeout: time.Duration(30) * time.Minute,
}
}
func (*testModel) GetTuningParameters() *model.PresetParam {
func (*baseTestModel) GetTuningParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
ReadinessTimeout: time.Duration(30) * time.Minute,
}
}
func (*baseTestModel) SupportDistributedInference() bool {
return true
}
func (*baseTestModel) SupportTuning() bool {
return true
}

type testModel struct {
baseTestModel
}

func (*testModel) SupportDistributedInference() bool {
return false
}
func (*testModel) SupportTuning() bool {
return true

type testDistributedModel struct {
baseTestModel
}

type testDistributedModel struct{}
type testNoTensorParallelModel struct {
baseTestModel
}

func (*testDistributedModel) GetInferenceParameters() *model.PresetParam {
func (*testNoTensorParallelModel) GetInferenceParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
RuntimeParam: model.RuntimeParam{
DisableTensorParallelism: true,
VLLM: model.VLLMParam{
BaseCommand: "python3 /workspace/vllm/inference_api.py",
},
@@ -57,30 +73,23 @@ func (*testDistributedModel) GetInferenceParameters() *model.PresetParam {
ReadinessTimeout: time.Duration(30) * time.Minute,
}
}
func (*testDistributedModel) GetTuningParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
ReadinessTimeout: time.Duration(30) * time.Minute,
}
}
func (*testDistributedModel) SupportDistributedInference() bool {
return true
}
func (*testDistributedModel) SupportTuning() bool {
return true
func (*testNoTensorParallelModel) SupportDistributedInference() bool {
return false
}

func RegisterTestModel() {
var test testModel
plugin.KaitoModelRegister.Register(&plugin.Registration{
Name: "test-model",
Instance: &test,
Instance: &testModel{},
})

var testDistributed testDistributedModel
plugin.KaitoModelRegister.Register(&plugin.Registration{
Name: "test-distributed-model",
Instance: &testDistributed,
Instance: &testDistributedModel{},
})

plugin.KaitoModelRegister.Register(&plugin.Registration{
Name: "test-no-tensor-parallel-model",
Instance: &testNoTensorParallelModel{},
})
}
43 changes: 30 additions & 13 deletions pkg/workspace/inference/preset-inferences_test.go
@@ -46,7 +46,21 @@ func TestCreatePresetInference(t *testing.T) {
workload: "Deployment",
// No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams
// So expected cmd consists of shell command and inference file
expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2",
expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2 --served-model-name=mymodel",
hasAdapters: false,
},

"test-model-no-parallel/vllm": {
workspace: test.MockWorkspaceWithPresetVLLM,
nodeCount: 1,
modelName: "test-no-tensor-parallel-model",
callMocks: func(c *test.MockClient) {
c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil)
},
workload: "Deployment",
// No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams
// So expected cmd consists of shell command and inference file
expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py",
hasAdapters: false,
},

@@ -58,7 +72,7 @@
c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil)
},
workload: "Deployment",
expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2",
expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2 --served-model-name=mymodel",
hasAdapters: true,
expectedVolume: "adapter-volume",
},
@@ -148,7 +162,7 @@
createdWorkload = "StatefulSet"
}
if tc.workload != createdWorkload {
t.Errorf("%s: returned worklaod type is wrong", k)
t.Errorf("%s: returned workload type is wrong", k)
}

var workloadCmd string
@@ -162,7 +176,7 @@ func TestCreatePresetInference(t *testing.T) {
params := toParameterMap(strings.Split(workloadCmd, "--")[1:])

expectedMaincmd := strings.Split(tc.expectedCmd, "--")[0]
expectedParams := toParameterMap(strings.Split(workloadCmd, "--")[1:])
expectedParams := toParameterMap(strings.Split(tc.expectedCmd, "--")[1:])

if mainCmd != expectedMaincmd {
t.Errorf("%s main cmdline is not expected, got %s, expect %s ", k, workloadCmd, tc.expectedCmd)
@@ -204,16 +218,19 @@

func toParameterMap(in []string) map[string]string {
ret := make(map[string]string)
for _, each := range in {
r := strings.Split(each, "=")
k := r[0]
var v string
if len(r) == 1 {
v = ""
} else {
v = r[1]
for _, eachToken := range in {
for _, each := range strings.Split(eachToken, " ") {
each = strings.TrimSpace(each)
r := strings.Split(each, "=")
k := r[0]
var v string
if len(r) == 1 {
v = ""
} else {
v = r[1]
}
ret[k] = v
}
ret[k] = v
}
return ret
}
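
For illustration, assuming the reworked toParameterMap above, the test's expected
command can be tokenized like this (the empty key produced by a trailing space
appears on both sides of the comparison, so it does not affect the test):

```go
cmd := "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2 --served-model-name=mymodel"
params := toParameterMap(strings.Split(cmd, "--")[1:])
// params maps "tensor-parallel-size" -> "2" and "served-model-name" -> "mymodel"
// (plus an empty "" key left over from the space before the second flag).
```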
10 changes: 10 additions & 0 deletions presets/workspace/models/falcon/model.go
@@ -81,6 +81,11 @@ func (*falcon7b) GetInferenceParameters() *model.PresetParam {
ModelName: "falcon-7b",
ModelRunParams: falconRunParamsVLLM,
},
// vLLM requires the model's attention head count to be exactly divisible
// by the number of GPUs (the tensor parallel size).
// falcon-7b has 71 attention heads, which is a prime number,
// so tensor parallel inference is disabled for this preset.
DisableTensorParallelism: true,
},
ReadinessTimeout: time.Duration(30) * time.Minute,
Tag: PresetFalconTagMap["Falcon7B"],
@@ -138,6 +143,11 @@ func (*falcon7bInst) GetInferenceParameters() *model.PresetParam {
ModelName: "falcon-7b-instruct",
ModelRunParams: falconRunParamsVLLM,
},
// vLLM requires the model's attention head count to be exactly divisible
// by the number of GPUs (the tensor parallel size).
// falcon-7b-instruct has 71 attention heads, which is a prime number,
// so tensor parallel inference is disabled for this preset.
DisableTensorParallelism: true,
},
ReadinessTimeout: time.Duration(30) * time.Minute,
Tag: PresetFalconTagMap["Falcon7BInstruct"],
