feat(pytorch): Support elastic training (#1453)
* chore: Update codegen and vendor

Signed-off-by: Ce Gao <ce.gao@outlook.com>

* fix: Update git config

Signed-off-by: Ce Gao <ce.gao@outlook.com>

* fix: Codegen

Signed-off-by: Ce Gao <ce.gao@outlook.com>

* feat: Add elastic examples

Signed-off-by: Ce Gao <ce.gao@outlook.com>

* chore: Update go mod

Signed-off-by: Ce Gao <ce.gao@outlook.com>

* chore: Update manifest

Signed-off-by: Ce Gao <ce.gao@outlook.com>

* feat: Implement elastic policy

Signed-off-by: Ce Gao <ce.gao@outlook.com>

* chore: Update vendor

Signed-off-by: Ce Gao <ce.gao@outlook.com>

* chore: Update swagger

Signed-off-by: Ce Gao <ce.gao@outlook.com>

* fix: Address comments

Signed-off-by: Ce Gao <ce.gao@outlook.com>
gaocegege authored Nov 26, 2021
1 parent 15712aa commit 3e11ac3
Showing 62 changed files with 3,997 additions and 138 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-go.yaml
@@ -23,7 +23,7 @@ jobs:
      - name: Setup Go
        uses: actions/setup-go@v2
        with:
-          go-version: 1.17.1
+          go-version: 1.17.2

      - name: Check Go modules
        run: |
5 changes: 5 additions & 0 deletions .gitignore
@@ -8,6 +8,7 @@ cover.out

# IDEs
.vscode/
__debug_bin

# Compiled python files.
*.pyc
@@ -43,3 +44,7 @@ hack/python-sdk/openapi-generator-cli.jar

# Coverage
cover.out

/training-operator

data/
54 changes: 54 additions & 0 deletions docs/api/pytorch_generated.asciidoc
@@ -22,6 +22,29 @@ Package v1 contains API Schema definitions for the kubeflow.org v1 API group

=== Definitions

[id="{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-pytorch-v1-elasticpolicy"]
==== ElasticPolicy



.Appears In:
****
- xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-pytorch-v1-pytorchjobspec[$$PyTorchJobSpec$$]
****

[cols="25a,75a", options="header"]
|===
| Field | Description
| *`backend`* __xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-pytorch-v1-rdzvbackend[$$RDZVBackend$$]__ |
| *`rdzvPort`* __integer__ |
| *`rdzvHost`* __string__ |
| *`rdzvId`* __string__ |
| *`rdzvConf`* __xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-pytorch-v1-rdzvconf[$$RDZVConf$$] array__ | RDZVConf contains additional rendezvous configuration (<key1>=<value1>,<key2>=<value2>,...).
| *`standalone`* __boolean__ | Start a local standalone rendezvous backend that is represented by a C10d TCP store on port 29400. Useful when launching a single-node, multi-worker job. If specified, --rdzv_backend, --rdzv_endpoint, and --rdzv_id are auto-assigned; any explicitly set values are ignored.
| *`nProcPerNode`* __integer__ | Number of workers per node; supported values: [auto, cpu, gpu, int].
|===
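
For orientation, here is a minimal sketch of how these fields surface under spec.elasticPolicy of a PyTorchJob. It mirrors the echo example added later in this commit; the values are illustrative, and note that the example manifests spell the backend field rdzvBackend:

elasticPolicy:
  rdzvBackend: c10d
  minReplicas: 1
  maxReplicas: 2
  maxRestarts: 100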


[id="{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-pytorch-v1-pytorchjob"]
==== PyTorchJob

@@ -78,7 +101,38 @@ PyTorchJobSpec is a desired state description of the PyTorchJob.
|===
| Field | Description
| *`runPolicy`* __xref:{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-runpolicy[$$RunPolicy$$]__ | RunPolicy encapsulates various runtime policies of the distributed training job, for example how to clean up resources and how long the job can stay active.
| *`elasticPolicy`* __xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-pytorch-v1-elasticpolicy[$$ElasticPolicy$$]__ |
| *`pytorchReplicaSpecs`* __object (keys:ReplicaType, values:ReplicaSpec)__ | A map of PyTorchReplicaType (type) to ReplicaSpec (value). Specifies the PyTorch cluster configuration. For example, { "Master": PyTorchReplicaSpec, "Worker": PyTorchReplicaSpec, }
|===


[id="{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-pytorch-v1-rdzvbackend"]
==== RDZVBackend (string)



.Appears In:
****
- xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-pytorch-v1-elasticpolicy[$$ElasticPolicy$$]
****



[id="{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-pytorch-v1-rdzvconf"]
==== RDZVConf



.Appears In:
****
- xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-pytorch-v1-elasticpolicy[$$ElasticPolicy$$]
****

[cols="25a,75a", options="header"]
|===
| Field | Description
| *`key`* __string__ |
| *`value`* __string__ |
|===
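
RDZVConf entries are plain key/value pairs that end up in the <key1>=<value1>,<key2>=<value2> form described above. A hedged sketch of how they might be written in a manifest (the join_timeout key is purely illustrative, not part of this commit):

elasticPolicy:
  rdzvConf:
    - key: join_timeout
      value: "600"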


8 changes: 8 additions & 0 deletions examples/pytorch/elastic/echo/Dockerfile
@@ -0,0 +1,8 @@
FROM python:3.8-buster
WORKDIR /workspace
RUN pip install torch==1.10.0 numpy
# TODO Replace this with the PIP version when available
ADD echo.py echo.py
ENV PYTHONPATH /workspace
ENV ALLOW_NONE_AUTHENTICATION yes
ENTRYPOINT ["python", "-m", "torch.distributed.run"]
45 changes: 45 additions & 0 deletions examples/pytorch/elastic/echo/echo.py
@@ -0,0 +1,45 @@
#!/usr/bin/env python3
import io
import os
import pprint
import sys
import time

import torch.distributed as dist


if __name__ == "__main__":

    env_dict = {
        k: os.environ[k]
        for k in (
            "LOCAL_RANK",
            "RANK",
            "GROUP_RANK",
            "WORLD_SIZE",
            "MASTER_ADDR",
            "MASTER_PORT",
            "TORCHELASTIC_RESTART_COUNT",
            "TORCHELASTIC_MAX_RESTARTS",
        )
    }

    with io.StringIO() as buff:
        print("======================================================", file=buff)
        print(
            f"Environment variables set by the agent on PID {os.getpid()}:", file=buff
        )
        pprint.pprint(env_dict, stream=buff)
        print("======================================================", file=buff)
        print(buff.getvalue())
    sys.stdout.flush()

    dist.init_process_group(backend="gloo")
    dist.barrier()

    print(
        (
            f"On PID {os.getpid()}, after init process group, "
            f"rank={dist.get_rank()}, world_size = {dist.get_world_size()}\n"
        )
    )
29 changes: 29 additions & 0 deletions examples/pytorch/elastic/echo/echo.yaml
@@ -0,0 +1,29 @@
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
name: elastic-example-echo
spec:
elasticPolicy:
rdzvBackend: c10d
minReplicas: 1
maxReplicas: 2
maxRestarts: 100
pytorchReplicaSpecs:
Worker:
replicas: 2
template:
spec:
containers:
- name: pytorch
image: kubeflow/pytorch-elastic-example-echo:1.0.0
imagePullPolicy: IfNotPresent
env:
- name: LOGLEVEL
value: DEBUG
command:
- python
- -m
- torch.distributed.run
- --rdzv_backend=c10d
- ./echo.py

74 changes: 74 additions & 0 deletions examples/pytorch/elastic/etcd.yaml
@@ -0,0 +1,74 @@
apiVersion: v1
kind: Service
metadata:
  name: etcd-client
spec:
  ports:
    - name: etcd-client-port
      port: 2379
      protocol: TCP
      targetPort: 2379
  selector:
    app: etcd

---

apiVersion: v1
kind: Pod
metadata:
  labels:
    app: etcd
    etcd_node: etcd-server
  name: etcd-server
spec:
  containers:
    - command:
        - /usr/local/bin/etcd
        - --data-dir
        - /var/lib/etcd
        - --enable-v2
        - --name
        - etcd-server
        - --initial-advertise-peer-urls
        - http://etcd-server:2380
        - --listen-peer-urls
        - http://0.0.0.0:2380
        - --listen-client-urls
        - http://0.0.0.0:2379
        - --advertise-client-urls
        - http://etcd-server:2379
        - --initial-cluster
        - etcd-server=http://etcd-server:2380
        - --initial-cluster-state
        - new
      image: quay.io/coreos/etcd:latest
      name: etcd-server
      ports:
        - containerPort: 2379
          name: client
          protocol: TCP
        - containerPort: 2380
          name: server
          protocol: TCP
  restartPolicy: Always

---

apiVersion: v1
kind: Service
metadata:
  labels:
    etcd_node: etcd-server
  name: etcd-server
spec:
  ports:
    - name: client
      port: 2379
      protocol: TCP
      targetPort: 2379
    - name: server
      port: 2380
      protocol: TCP
      targetPort: 2380
  selector:
    etcd_node: etcd-server
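
The manifests above provision a single-node etcd that can serve as the rendezvous backend for elastic jobs. As a hedged sketch (not part of this commit), an elasticPolicy could point at the etcd-client Service roughly as follows; the backend name, replica bounds, and restart count are assumptions:

elasticPolicy:
  rdzvBackend: etcd
  rdzvHost: etcd-client
  rdzvPort: 2379
  minReplicas: 1
  maxReplicas: 3
  maxRestarts: 100
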
1 change: 1 addition & 0 deletions examples/pytorch/elastic/imagenet/.dockerignore
@@ -0,0 +1 @@
data
23 changes: 23 additions & 0 deletions examples/pytorch/elastic/imagenet/Dockerfile
@@ -0,0 +1,23 @@
ARG BASE_IMAGE=pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
FROM $BASE_IMAGE

# install utilities and dependencies
RUN pip install classy-vision

WORKDIR /workspace

# download imagenet tiny for data
RUN apt-get -q update && apt-get -q install -y wget unzip
RUN wget -q http://cs231n.stanford.edu/tiny-imagenet-200.zip && unzip -q tiny-imagenet-200.zip -d data && rm tiny-imagenet-200.zip

COPY . ./examples
RUN chmod -R u+x ./examples/bin
ENV PATH=/workspace/examples/bin:${PATH}

# create a template classy project in /workspace/classy_vision
# (see https://classyvision.ai/#quickstart)
RUN classy-project classy_vision

USER root
ENTRYPOINT ["python", "-m", "torch.distributed.run"]
CMD ["--help"]
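
Because this image's ENTRYPOINT is already python -m torch.distributed.run, a PyTorchJob only needs to supply launcher flags, the training script, and its arguments as container args. A hedged sketch; the image tag, script path, and flags are assumptions rather than part of this commit:

containers:
  - name: pytorch
    image: kubeflow/pytorch-elastic-example-imagenet:1.0.0
    args:
      - --rdzv_backend=c10d
      - /workspace/examples/imagenet.py
      - --arch=resnet18
      - --epochs=20
      - /workspace/data/tiny-imagenet-200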
