[tune] Add timeout to retry_fn to catch hanging syncs #28155

Merged
13 commits merged on Sep 2, 2022
110 changes: 78 additions & 32 deletions python/ray/tune/tests/test_utils.py
@@ -1,45 +1,91 @@
import pytest

from ray.tune.search.variant_generator import format_vars
from ray.tune.utils.util import retry_fn


def test_format_vars():
    # Format brackets correctly
    assert (
        format_vars(
            {
                ("a", "b", "c"): 8.1234567,
                ("a", "b", "d"): [7, 8],
                ("a", "b", "e"): [[[3, 4]]],
            }
        )
        == "c=8.12345,d=7_8,e=3_4"
    )
    # Sorted by full keys, but only last key is reported
    assert (
        format_vars(
            {
                ("a", "c", "x"): [7, 8],
                ("a", "b", "x"): 8.1234567,
            }
        )
        == "x=8.12345,x=7_8"
    )
    # Filter out invalid chars. It's ok to have empty keys or values.
    assert (
        format_vars(
            {
                ("a c?x"): " <;%$ok ",
                ("some"): " ",
            }
        )
        == "a_c_x=ok,some="
    )


def test_retry_fn_repeat(tmpdir):
    success = tmpdir / "success"
    marker = tmpdir / "marker"

    def _fail_once():
        if marker.exists():
            success.write_text(".", encoding="utf-8")
            return
        marker.write_text(".", encoding="utf-8")
        raise RuntimeError("Failing")

    assert not success.exists()
    assert not marker.exists()

    assert retry_fn(
        fn=_fail_once,
        exception_type=RuntimeError,
        sleep_time=0,
    )

    assert success.exists()
    assert marker.exists()


def test_retry_fn_timeout(tmpdir):
    success = tmpdir / "success"
    marker = tmpdir / "marker"

    def _fail_once():
        if marker.exists():
            success.write_text(".", encoding="utf-8")
            return
        marker.write_text(".", encoding="utf-8")
        raise RuntimeError("Failing")

    assert not success.exists()
    assert not marker.exists()

    assert not retry_fn(
        fn=_fail_once, exception_type=RuntimeError, sleep_time=5, timeout=0.1
    )

    assert not success.exists()
    assert marker.exists()


if __name__ == "__main__":
    import pytest
    import sys

    sys.exit(pytest.main(["-v", __file__]))
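The two retry tests above pin down the new return-value contract of retry_fn. As a rough usage sketch built on the signature shown in the util.py diff below (sync_to_storage is a hypothetical stand-in, not part of this PR):

from ray.tune.utils.util import retry_fn


def sync_to_storage() -> None:
    # Hypothetical stand-in for a cloud sync call that may raise or hang.
    ...


# True if one attempt succeeded; False if every attempt failed or the
# call did not finish within the timeout.
success = retry_fn(
    fn=sync_to_storage,
    exception_type=RuntimeError,
    num_retries=3,
    sleep_time=1,
    timeout=600,
)
if not success:
    print("Sync did not complete; continuing without blocking training.")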
11 changes: 11 additions & 0 deletions python/ray/tune/trainable/trainable.py
@@ -61,6 +61,14 @@
SETUP_TIME_THRESHOLD = 10


def _sync_timeout() -> Optional[float]:
    sync_timeout = float(os.environ.get("TUNE_SYNC_TIMEOUT", "600"))
    if sync_timeout == 0:
        return None

    return sync_timeout


@PublicAPI
class Trainable:
"""Abstract class for trainable models, functions, etc.
@@ -517,6 +525,7 @@ def _maybe_save_to_cloud(self, checkpoint_dir: str) -> bool:
            subprocess.CalledProcessError,
            num_retries=3,
            sleep_time=1,
            timeout=_sync_timeout(),
        )
        return True

@@ -551,6 +560,7 @@ def _maybe_load_from_cloud(self, checkpoint_path: str) -> bool:
            subprocess.CalledProcessError,
            num_retries=3,
            sleep_time=1,
            timeout=_sync_timeout(),
        )

        return True
@@ -724,6 +734,7 @@ def delete_checkpoint(self, checkpoint_path: Union[str, Checkpoint]):
            subprocess.CalledProcessError,
            num_retries=3,
            sleep_time=1,
            timeout=_sync_timeout(),
        )

        if os.path.exists(checkpoint_dir):
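For reference, a minimal sketch of how the new timeout would be configured in practice, assuming TUNE_SYNC_TIMEOUT is read via _sync_timeout() as shown above (the default is 600 seconds and a value of 0 disables the timeout):

import os

# Allow each cloud sync up to 60 seconds before it is treated as hanging;
# "0" would disable the timeout entirely (the default is 600 seconds).
os.environ["TUNE_SYNC_TIMEOUT"] = "60"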
34 changes: 24 additions & 10 deletions python/ray/tune/utils/util.py
@@ -2,11 +2,13 @@
import glob
import inspect
import logging
import multiprocessing
import os
import threading
import time
from collections import defaultdict
from datetime import datetime
from numbers import Number
from threading import Thread
from typing import Dict, List, Union, Type, Callable, Any, Optional

@@ -124,18 +126,30 @@ def stop(self):
@DeveloperAPI
def retry_fn(
    fn: Callable[[], Any],
    exception_type: Type[Exception] = Exception,
    num_retries: int = 3,
    sleep_time: int = 1,
    timeout: Optional[Number] = None,
) -> bool:
    def _retry_fn():
        for i in range(num_retries):
            try:
                fn()
            except exception_type as e:
                logger.warning(e)
                time.sleep(sleep_time)
            else:
                return

    proc = multiprocessing.Process(target=_retry_fn)
    proc.start()
    proc.join(timeout=timeout)
Member:
now that you have a thread, imagine eventually we checkpoint on the side while the training just keeps going 🤯 😄

One nit: I also think the timeout should be per retry (i.e. timeout = num_retries * timeout here). Otherwise the actual timeout depends on how many retries are set here (although, admittedly, num_retries is not configurable anyway).

Contributor Author:
Yeah, I thought about this, but I kept a global timeout because it is a) simpler/cleaner to implement, and b) we basically want to define a maximum time we are willing to block training, so I think we should be fine with this. Let me know if you prefer a per-retry timeout.

Contributor Author:
Ah, actually, as discussed: let's have a timeout per retry. Otherwise, if the first sync hangs we will not try again. Updated the PR.

    if proc.exitcode is None:
        proc.terminate()
        return False

    return proc.exitcode == 0


@ray.remote
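For illustration only, a sketch of the per-retry timeout behavior the review thread converged on, not the exact code merged in the final revision. The retry_fn_per_attempt name is hypothetical; like the diff above, it assumes a fork-based multiprocessing start method, and for simplicity it retries on any failing attempt instead of filtering by exception type:

import multiprocessing
import time
from typing import Any, Callable, Optional


def retry_fn_per_attempt(
    fn: Callable[[], Any],
    num_retries: int = 3,
    sleep_time: int = 1,
    timeout: Optional[float] = None,
) -> bool:
    # Apply the timeout to each attempt rather than to the whole retry loop,
    # so a single hanging sync does not eat the budget of the remaining retries.
    for _ in range(num_retries):
        proc = multiprocessing.Process(target=fn)
        proc.start()
        proc.join(timeout=timeout)

        if proc.is_alive():
            # This attempt hung: kill it and move on to the next retry.
            proc.terminate()
            proc.join()
        elif proc.exitcode == 0:
            return True

        time.sleep(sleep_time)

    return False

Whether the timeout applies per attempt or globally changes how long a single hanging sync can block training; per the final review comment, the PR was updated to the per-retry behavior.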