From d13e8895e8cc1c8965a3df6633db5d0f288abb10 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 6 Jun 2019 18:55:26 -0700 Subject: [PATCH 1/5] Debugging --- src/ray/raylet/scheduling_resources.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ray/raylet/scheduling_resources.cc b/src/ray/raylet/scheduling_resources.cc index 895535a9a7f0..7afe1e3caab9 100644 --- a/src/ray/raylet/scheduling_resources.cc +++ b/src/ray/raylet/scheduling_resources.cc @@ -169,7 +169,7 @@ void ResourceSet::SubtractResourcesStrict(const ResourceSet &other) { const std::string &resource_label = resource_pair.first; const FractionalResourceQuantity &resource_capacity = resource_pair.second; RAY_CHECK(resource_capacity_.count(resource_label) == 1) - << "Attempt to acquire unknown resource: " << resource_label; + << "Attempt to acquire unknown resource: " << resource_label << " capacity " << resource_capacity.ToDouble(); resource_capacity_[resource_label] -= resource_capacity; // Ensure that quantity is positive. Note, we have to have the check before From 0abcd0711d358ba9097c5809c1fbc07ac724780c Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 6 Jun 2019 23:23:23 -0700 Subject: [PATCH 2/5] Prevent ResourceSet with zero CPUs from appearing. 
--- src/ray/raylet/scheduling_resources.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/ray/raylet/scheduling_resources.cc b/src/ray/raylet/scheduling_resources.cc index 7afe1e3caab9..5b917c63cd21 100644 --- a/src/ray/raylet/scheduling_resources.cc +++ b/src/ray/raylet/scheduling_resources.cc @@ -76,7 +76,11 @@ ResourceSet::ResourceSet() {} ResourceSet::ResourceSet( const std::unordered_map<std::string, FractionalResourceQuantity> &resource_map) - : resource_capacity_(resource_map) {} + : resource_capacity_(resource_map) { + for (auto const &resource_pair : resource_map) { + RAY_CHECK(resource_pair.second > 0); + } + } ResourceSet::ResourceSet(const std::unordered_map<std::string, double> &resource_map) { for (auto const &resource_pair : resource_map) { @@ -169,7 +173,8 @@ void ResourceSet::SubtractResourcesStrict(const ResourceSet &other) { const std::string &resource_label = resource_pair.first; const FractionalResourceQuantity &resource_capacity = resource_pair.second; RAY_CHECK(resource_capacity_.count(resource_label) == 1) - << "Attempt to acquire unknown resource: " << resource_label << " capacity " << resource_capacity.ToDouble(); + << "Attempt to acquire unknown resource: " << resource_label << " capacity " + << resource_capacity.ToDouble(); resource_capacity_[resource_label] -= resource_capacity; // Ensure that quantity is positive. 
Note, we have to have the check before @@ -233,8 +238,10 @@ FractionalResourceQuantity ResourceSet::GetResource( const ResourceSet ResourceSet::GetNumCpus() const { ResourceSet cpu_resource_set; - cpu_resource_set.resource_capacity_[kCPU_ResourceLabel] = - GetResource(kCPU_ResourceLabel); + const FractionalResourceQuantity cpu_quantity = GetResource(kCPU_ResourceLabel); + if (cpu_quantity > 0) { + cpu_resource_set.resource_capacity_[kCPU_ResourceLabel] = cpu_quantity; + } return cpu_resource_set; } From bb485723c2bb394813baabd2f3632b44113a6c82 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 7 Jun 2019 15:59:22 -0700 Subject: [PATCH 3/5] Fix 0 cpu case for actors. --- python/ray/tests/test_basic.py | 12 +++++++++++- src/ray/raylet/node_manager.cc | 6 +++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py index 50aeca025362..45ad095885ae 100644 --- a/python/ray/tests/test_basic.py +++ b/python/ray/tests/test_basic.py @@ -1878,13 +1878,23 @@ def test(self): def test_zero_cpus(shutdown_only): ray.init(num_cpus=0) + # We should be able to execute a task that requires 0 CPU resources. @ray.remote(num_cpus=0) def f(): return 1 - # The task should be able to execute. ray.get(f.remote()) + # We should be able to create an actor that requires 0 CPU resources. 
+ @ray.remote(num_cpus=0) + class Actor(object): + def method(self): + pass + + a = Actor.remote() + x = a.method.remote() + ray.get(x) + def test_zero_cpus_actor(ray_start_cluster): cluster = ray_start_cluster diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index e3fd9a0df09f..5d1fcbbe0702 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -1814,9 +1814,9 @@ bool NodeManager::AssignTask(const Task &task) { cluster_resource_map_[my_client_id].Acquire(spec.GetRequiredResources()); if (spec.IsActorCreationTask()) { - // Check that we are not placing an actor creation task on a node with 0 CPUs. - RAY_CHECK(cluster_resource_map_[my_client_id].GetTotalResources().GetResourceMap().at( - kCPU_ResourceLabel) != 0); + // Check that the actor's placement resource requirements are satisfied. + RAY_CHECK(spec.GetRequiredPlacementResources().IsSubset( + cluster_resource_map_[my_client_id].GetTotalResources())); worker->SetLifetimeResourceIds(acquired_resources); } else { worker->SetTaskResourceIds(acquired_resources); From 28d73f671e0eb17c1d8688ed8281e27f124505ab Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 7 Jun 2019 16:00:38 -0700 Subject: [PATCH 4/5] linting --- src/ray/raylet/scheduling_resources.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ray/raylet/scheduling_resources.cc b/src/ray/raylet/scheduling_resources.cc index 5b917c63cd21..cdc17307755c 100644 --- a/src/ray/raylet/scheduling_resources.cc +++ b/src/ray/raylet/scheduling_resources.cc @@ -77,10 +77,10 @@ ResourceSet::ResourceSet() {} ResourceSet::ResourceSet( const std::unordered_map<std::string, FractionalResourceQuantity> &resource_map) : resource_capacity_(resource_map) { - for (auto const &resource_pair : resource_map) { - RAY_CHECK(resource_pair.second > 0); - } - } + for (auto const &resource_pair : resource_map) { + RAY_CHECK(resource_pair.second > 0); + } +} ResourceSet::ResourceSet(const std::unordered_map<std::string, double> &resource_map) { for (auto 
const &resource_pair : resource_map) { From b3277ec990645e790120e80b36545d9d678d0263 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 7 Jun 2019 16:09:00 -0700 Subject: [PATCH 5/5] Fix test by lengthening buffer. --- python/ray/tests/test_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py index 45ad095885ae..7f1f78d1b5c4 100644 --- a/python/ray/tests/test_basic.py +++ b/python/ray/tests/test_basic.py @@ -1754,7 +1754,7 @@ def f(n): def g(n): time.sleep(n) - time_buffer = 0.5 + time_buffer = 2 start_time = time.time() ray.get([f.remote(0.5), g.remote(0.5)])