Added AuroraClusterMonitor.setServerCheckDelayMillis

This resolves a TODO note by making the frequency at which the server master / slave status is checked configurable. Though the default of two checks a second should be good in most cases, I can see where a system with a small client count may want to reduce the delay further, or where a larger service may want to increase it. Together with the last commit (which introduced setClientInfo configuration), this resolves #8.
threadly · Jan 5, 2019 · 9714e86 · 9714e86
1 parent e431b36
commit 9714e86
Show file tree

Hide file tree

Showing 3 changed files with 92 additions and 14 deletions.
diff --git a/arcCommon/src/main/java/org/threadly/db/aurora/AuroraClusterMonitor.java b/arcCommon/src/main/java/org/threadly/db/aurora/AuroraClusterMonitor.java
@@ -16,8 +16,9 @@
 
 import org.threadly.concurrent.CentralThreadlyPool;
 import org.threadly.concurrent.ReschedulingOperation;
-import org.threadly.concurrent.SubmitterScheduler;
+import org.threadly.concurrent.SchedulerService;
 import org.threadly.db.aurora.DelegateAuroraDriver.IllegalDriverStateException;
+import org.threadly.util.ArgumentVerifier;
 
 /**
  * Class which monitors a "cluster" of aurora servers.  It is expected that for each given cluster
@@ -32,16 +33,39 @@
 public class AuroraClusterMonitor {
   protected static final Logger LOG = Logger.getLogger(AuroraClusterMonitor.class.getSimpleName());
 
-  protected static final int CHECK_FREQUENCY_MILLIS = 500;  // TODO - make configurable
   protected static final int MAXIMUM_THREAD_POOL_SIZE = 64;
-  protected static final SubmitterScheduler MONITOR_SCHEDULER;
+  protected static final SchedulerService MONITOR_SCHEDULER;
   protected static final ConcurrentMap<AuroraServersKey, AuroraClusterMonitor> MONITORS;
+  private static volatile long CHECK_FREQUENCY_MILLIS = 500;
 
   static {
     MONITOR_SCHEDULER = CentralThreadlyPool.threadPool(MAXIMUM_THREAD_POOL_SIZE, "auroraMonitor");
 
     MONITORS = new ConcurrentHashMap<>();
   }
+
+  /**
+   * Sets or updates the delay between individual server status checks.  Reducing this from the 
+   * default of 500ms can allow failover events to be discovered faster.  Since this is done on a 
+   * per-client basis, it is recommended not to make this too small or it can significantly impact 
+   * server load when there are a lot of clients.  It is worth being aware that server checks will 
+   * be expedited anyway if the driver discovers potential server stability issues.
+   * <p>
+   * The new delay is stored and every monitor currently held in {@code MONITORS} has its server 
+   * checks rescheduled with it.  Setting the value that is already in effect does nothing.
+   * <p>
+   * NOTE(review): the stored delay is updated before the monitors are rescheduled, so if a 
+   * reschedule throws {@link IllegalStateException} part way through, the remaining monitors keep 
+   * the old delay and a retry with the same value is skipped as "unchanged" -- confirm this is 
+   * acceptable.
+   * 
+   * @param millis The milliseconds between server checks, must be greater than zero
+   * @throws IllegalArgumentException If {@code millis} is not greater than zero
+   * @throws IllegalStateException If a server monitor could not be unscheduled for rescheduling
+   */
+  public static void setServerCheckDelayMillis(long millis) {
+    ArgumentVerifier.assertGreaterThanZero(millis, "millis");
+
+    synchronized (AuroraClusterMonitor.class) {  // rescheduling must not run concurrently
+      if (CHECK_FREQUENCY_MILLIS != millis) {  // unchanged value means nothing to reschedule
+        CHECK_FREQUENCY_MILLIS = millis;
+
+        for (AuroraClusterMonitor acm : MONITORS.values()) {
+          acm.clusterStateChecker.updateServerCheckDelayMillis(millis);
+        }
+      }
+    }
+  }
 
   /**
    * Return a monitor instance for a given set of servers.  This instance will be consistent as
@@ -71,7 +95,7 @@ protected static AuroraClusterMonitor getMonitor(DelegateAuroraDriver driver, Au
   protected final ClusterChecker clusterStateChecker;
   private final AtomicLong replicaIndex;  // used to distribute replica reads
 
-  protected AuroraClusterMonitor(SubmitterScheduler scheduler, long checkIntervalMillis,
+  protected AuroraClusterMonitor(SchedulerService scheduler, long checkIntervalMillis,
                                  DelegateAuroraDriver driver, AuroraServer[] clusterServers) {
     clusterStateChecker = new ClusterChecker(scheduler, checkIntervalMillis, driver, clusterServers);
     replicaIndex = new AtomicLong();
@@ -216,14 +240,14 @@ public boolean equals(Object o) {
    * (and thus will witness the final cluster state).
    */
   protected static class ClusterChecker extends ReschedulingOperation {
-    protected final SubmitterScheduler scheduler;
+    protected final SchedulerService scheduler;
     protected final Map<AuroraServer, ServerMonitor> allServers;
     protected final List<AuroraServer> secondaryServers;
     protected final AtomicReference<AuroraServer> masterServer;
     protected final CopyOnWriteArrayList<AuroraServer> serversWaitingExpeditiedCheck;
     private volatile boolean initialized = false; // starts false to avoid updates while constructor is running
 
-    protected ClusterChecker(SubmitterScheduler scheduler, long checkIntervalMillis,
+    protected ClusterChecker(SchedulerService scheduler, long checkIntervalMillis,
                              DelegateAuroraDriver driver, AuroraServer[] clusterServers) {
       super(scheduler, 0);
 
@@ -250,12 +274,7 @@ protected ClusterChecker(SubmitterScheduler scheduler, long checkIntervalMillis,
           scheduler.execute(monitor);
         }
 
-        scheduler.scheduleAtFixedRate(monitor,
-                                      // hopefully well distributed hash code will distribute
-                                      // these tasks so that they are not all checked at once
-                                      // we convert to a long to avoid a possible overflow at Integer.MIN_VALUE
-                                      Math.abs((long)System.identityHashCode(monitor)) % checkIntervalMillis,
-                                      checkIntervalMillis);
+        scheduleMonitor(monitor, checkIntervalMillis);
       }
       if (masterServer.get() == null) {
         LOG.warning("No master server found!  Will use read only servers till one becomes master");
@@ -266,7 +285,7 @@ protected ClusterChecker(SubmitterScheduler scheduler, long checkIntervalMillis,
     }
 
     // used in testing
-    protected ClusterChecker(SubmitterScheduler scheduler, long checkIntervalMillis,
+    protected ClusterChecker(SchedulerService scheduler, 
                              Map<AuroraServer, ServerMonitor> clusterServers) {
       super(scheduler, 0);
 
@@ -279,6 +298,34 @@ protected ClusterChecker(SubmitterScheduler scheduler, long checkIntervalMillis,
       initialized = true;
     }
 
+
+    /**
+     * Update how often monitors check the individual servers status.  Each monitor is removed 
+     * from the scheduler and then scheduled again using the new delay.
+     * <p>
+     * This can NOT be called concurrently, 
+     * {@link AuroraClusterMonitor#setServerCheckDelayMillis(long)} currently guards this.
+     * <p>
+     * NOTE(review): a failed removal throws immediately, so monitors visited earlier are already 
+     * on the new delay while the ones not yet visited keep the old one -- confirm callers can 
+     * tolerate that.
+     * 
+     * @param millis The delay between runs for monitoring server status  
+     * @throws IllegalStateException If a monitor could not be removed from the scheduler
+     */
+    protected void updateServerCheckDelayMillis(long millis) {
+      for (ServerMonitor sm : allServers.values()) {
+        if (scheduler.remove(sm)) {
+          scheduleMonitor(sm, millis);
+        } else {
+          throw new IllegalStateException("Could not unschedule monitor: " + sm);
+        }
+      }
+    }
+
+    protected void scheduleMonitor(ServerMonitor monitor, long checkIntervalMillis) {
+      scheduler.scheduleAtFixedRate(monitor,
+                                    // initial delay: a hopefully well distributed hash code will distribute
+                                    // these tasks so that they are not all checked at once (interval must be > 0 for the modulo)
+                                    // we convert to a long to avoid a possible overflow at Integer.MIN_VALUE
+                                    Math.abs((long)System.identityHashCode(monitor)) % checkIntervalMillis,
+                                    checkIntervalMillis);
+    }
+
     protected void expediteServerCheck(ServerMonitor serverMonitor) {
       if (serversWaitingExpeditiedCheck.addIfAbsent(serverMonitor.server)) {
         scheduler.execute(() -> {
@@ -374,6 +421,11 @@ protected ServerMonitor(DelegateAuroraDriver driver, AuroraServer server,
       masterServer = false;
     }
 
+    @Override
+    public String toString() {
+      return (masterServer ? "m:" : "r:") + server;  // "m:" when flagged as master, otherwise "r:"
+    }
+
     protected void reconnect() throws SQLException {
       Connection newConnection = 
           driver.connect(server.hostAndPortString() +

diff --git a/arcCommon/src/test/java/org/threadly/db/aurora/AuroraClusterMonitorClusterCheckerTest.java b/arcCommon/src/test/java/org/threadly/db/aurora/AuroraClusterMonitorClusterCheckerTest.java
@@ -16,6 +16,7 @@
 import org.threadly.db.aurora.AuroraClusterMonitor.ClusterChecker;
 import org.threadly.db.aurora.AuroraClusterMonitor.ServerMonitor;
 import org.threadly.test.concurrent.TestableScheduler;
+import org.threadly.util.ExceptionHandler;
 
 public class AuroraClusterMonitorClusterCheckerTest {
   private static final TestableScheduler SERVER_MONITOR_SCHEDULER;
@@ -42,7 +43,7 @@ public class AuroraClusterMonitorClusterCheckerTest {
   @Before
   public void setup() {
     testScheduler = new TestableScheduler();
-    clusterChecker = new ClusterChecker(testScheduler, 1000, CLUSTER_SERVERS);
+    clusterChecker = new ClusterChecker(testScheduler, CLUSTER_SERVERS);
     SERVER_MONITOR_SCHEDULER.clearTasks();
     for (ServerMonitor sm : CLUSTER_SERVERS.values()) {
       ((TestServerMonitor)sm).resetState();
@@ -55,6 +56,18 @@ public void cleanup() {
     clusterChecker = null;
   }
 
+  @Test
+  public void updateServerCheckDelayMillisTest() {
+    for (ServerMonitor sm : CLUSTER_SERVERS.values()) {
+      clusterChecker.scheduleMonitor(sm, 500);  // start every monitor on a 500ms interval
+    }
+    // replace the 500ms schedule of every monitor with a 100ms one
+    clusterChecker.updateServerCheckDelayMillis(100);
+    /* Every monitor should now run within 100ms (assumes advance() returns the executed task count).
+     * NOTE(review): the initial delay is hash based, a monitor whose delay works out to 0 might run
+     * a second time exactly at the 100ms mark -- confirm this can not make the count flaky. */
+    assertEquals(CLUSTER_SERVERS.size(), 
+                 testScheduler.advance(100, ExceptionHandler.IGNORE_HANDLER));
+  }
+
   @Test
   public void expediteServerCheckTest() {
     AuroraServer testServer = new AuroraServer("host1", new Properties());

diff --git a/arcCommon/src/test/java/org/threadly/db/aurora/AuroraClusterMonitorTest.java b/arcCommon/src/test/java/org/threadly/db/aurora/AuroraClusterMonitorTest.java
@@ -0,0 +1,13 @@
+package org.threadly.db.aurora;
+
+import static org.junit.Assert.fail;
+
+import org.junit.Test;
+
+public class AuroraClusterMonitorTest {
+  @Test (expected = IllegalArgumentException.class)
+  public void setServerCheckDelayMillisFail() {
+    AuroraClusterMonitor.setServerCheckDelayMillis(0);  // zero must be rejected as a delay
+    fail("Exception expected");  // only reached if nothing was thrown; the AssertionError is not the expected type
+  }
+}