Skip to content

Commit

Permalink
⚠️ Breaking Changes: Upgrade Scheduler Plugins version to v0.25.7 (kubeflow#1824)
Browse files Browse the repository at this point in the history

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
  • Loading branch information
tenzen-y authored Jun 9, 2023
1 parent 4043955 commit 5a4cf7c
Show file tree
Hide file tree
Showing 11 changed files with 25 additions and 29 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/integration-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: Setup Go
uses: actions/setup-go@v3
with:
go-version-file: go.mod

- name: Create k8s Kind Cluster
uses: helm/kind-action@v1.3.0
with:
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ require (
k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280
k8s.io/utils v0.0.0-20221128185143-99ec85e7a448
sigs.k8s.io/controller-runtime v0.14.6
sigs.k8s.io/scheduler-plugins v0.24.9
sigs.k8s.io/scheduler-plugins v0.25.7
sigs.k8s.io/yaml v1.3.0
volcano.sh/apis v1.2.0-k8s1.19.6
)
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -782,8 +782,8 @@ sigs.k8s.io/controller-runtime v0.14.6 h1:oxstGVvXGNnMvY7TAESYk+lzr6S3V5VFxQ6d92
sigs.k8s.io/controller-runtime v0.14.6/go.mod h1:WqIdsAY6JBsjfc/CqO0CORmNtoCtE4S6qbPc9s68h+0=
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k=
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
sigs.k8s.io/scheduler-plugins v0.24.9 h1:9oGtwk6uh7mZMCX8+O+PipQzBiRq9d2+E3xq1cn7zbc=
sigs.k8s.io/scheduler-plugins v0.24.9/go.mod h1:0u2b/0SwY2ozDhOD/f1S3e5IbStoDFLUK8yP5dJTaQ8=
sigs.k8s.io/scheduler-plugins v0.25.7 h1:2qSTXfHmzfFZJF9M9UHLiDXGdDXX+sUs/cn0dHbc4qk=
sigs.k8s.io/scheduler-plugins v0.25.7/go.mod h1:CKgZ1xu9WZdB3CMSzOjro/rtrBY/bQWMf6un2M9VNS4=
sigs.k8s.io/structured-merge-diff/v4 v4.0.1/go.mod h1:bJZC9H9iH24zzfZ/41RGcq60oK1F7G282QMXDPYydCw=
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE=
sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E=
Expand Down
4 changes: 2 additions & 2 deletions manifests/base/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ rules:
- update
- watch
- apiGroups:
- scheduling.sigs.k8s.io
- scheduling.volcano.sh
resources:
- podgroups
verbs:
Expand All @@ -261,7 +261,7 @@ rules:
- update
- watch
- apiGroups:
- scheduling.volcano.sh
- scheduling.x-k8s.io
resources:
- podgroups
verbs:
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller.v1/mpi/mpijob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ type MPIJobReconciler struct {
//+kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=rolebindings,verbs=list;watch;create;update
//+kubebuilder:rbac:groups="",resources=pods/exec,verbs=create
//+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.sigs.k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete

// Reconcile is part of the main kubernetes reconciliation loop which aims to
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller.v1/mxnet/mxjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ type MXJobReconciler struct {
//+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete
//+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.sigs.k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete

func (r *MXJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ type PaddleJobReconciler struct {
//+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete
//+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.sigs.k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete

// Reconcile is part of the main kubernetes reconciliation loop which aims to
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller.v1/pytorch/pytorchjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ type PyTorchJobReconciler struct {
//+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete
//+kubebuilder:rbac:groups=autoscaling,resources=horizontalpodautoscalers,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.sigs.k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete

// Reconcile is part of the main kubernetes reconciliation loop which aims to
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller.v1/tensorflow/tfjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ type TFJobReconciler struct {
//+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete
//+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.sigs.k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete

// Reconcile is part of the main kubernetes reconciliation loop which aims to
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller.v1/xgboost/xgboostjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ type XGBoostJobReconciler struct {
//+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete
//+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.sigs.k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete

// Reconcile reads the state of the cluster for an XGBoostJob object and makes changes based on the state read
Expand Down
27 changes: 9 additions & 18 deletions scripts/gha/setup-training-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# This shell script is used to build a cluster and create a namespace from our
# argo workflow


set -o errexit
set -o nounset
set -o pipefail
Expand All @@ -34,20 +30,14 @@ echo "Installing training operator manifests"
kustomize build . | kubectl apply -f -

if [ "${GANG_SCHEDULER_NAME}" = "scheduler-plugins" ]; then
echo "Installing Scheduler Plugins..."
# We need to use the latest Helm chart since the older Helm chart has bugs in RBAC.
git clone https://github.com/kubernetes-sigs/scheduler-plugins.git
pushd scheduler-plugins/manifests/install/charts

# Since https://github.com/kubernetes-sigs/scheduler-plugins/pull/526, scheduler-plugins has switched its API group to 'x-k8s.io'.
# So we must pin a specific commit to keep the older API group, 'sigs.k8s.io', available.
# Details: https://github.com/kubeflow/training-operator/issues/1769
# TODO: Once we support the new API group, we should switch the scheduler-plugins version.
git checkout df16b76a226e58b6961b30ba800e5a713d433c44
SCHEDULER_PLUGINS_VERSION=$(go list -m -f "{{.Version}}" sigs.k8s.io/scheduler-plugins)
git clone https://github.com/kubernetes-sigs/scheduler-plugins.git -b "${SCHEDULER_PLUGINS_VERSION}"

helm install scheduler-plugins as-a-second-scheduler/
popd
rm -rf scheduler-plugins
echo "Installing Scheduler Plugins ${SCHEDULER_PLUGINS_VERSION}..."
helm install scheduler-plugins scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ --create-namespace \
--namespace scheduler-plugins \
--set controller.image="registry.k8s.io/scheduler-plugins/controller:${SCHEDULER_PLUGINS_VERSION}" \
--set scheduler.image="registry.k8s.io/scheduler-plugins/kube-scheduler:${SCHEDULER_PLUGINS_VERSION}"

echo "Configure gang-scheduling using scheduler-plugins to training-operator"
kubectl patch -n kubeflow deployments training-operator --type='json' \
Expand All @@ -60,7 +50,8 @@ until kubectl get pods -n kubeflow | grep training-operator | grep 1/1 || [[ $TI
TIMEOUT=$(( TIMEOUT - 1 ))
done
if [ "${GANG_SCHEDULER_NAME}" = "scheduler-plugins" ]; then
kubectl wait pods --for=condition=ready -n scheduler-plugins --timeout "${TIMEOUT}s" --all
kubectl wait pods --for=condition=ready -n scheduler-plugins --timeout "${TIMEOUT}s" --all || \
(kubectl get pods -n scheduler-plugins && kubectl describe pods -n scheduler-plugins; exit 1)
fi

kubectl version
Expand Down

0 comments on commit 5a4cf7c

Please sign in to comment.