Skip to content

Commit

Permalink
Added AuroraClusterMonitor.setServerCheckDelayMillis
Browse files Browse the repository at this point in the history
This resolves a TODO note by making the frequency at which the server master / slave status is checked configurable.  Though the default of twice a second should be good in most cases, I can see where a system with a small client count may want to reduce the delay further, or where a larger service may want to increase it.

Between this commit and the last commit (which introduced setClientInfo configuration), this resolves #8
  • Loading branch information
jentfoo committed Jan 5, 2019
1 parent e431b36 commit 9714e86
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@

import org.threadly.concurrent.CentralThreadlyPool;
import org.threadly.concurrent.ReschedulingOperation;
import org.threadly.concurrent.SubmitterScheduler;
import org.threadly.concurrent.SchedulerService;
import org.threadly.db.aurora.DelegateAuroraDriver.IllegalDriverStateException;
import org.threadly.util.ArgumentVerifier;

/**
* Class which monitors a "cluster" of aurora servers. It is expected that for each given cluster
Expand All @@ -32,16 +33,39 @@
public class AuroraClusterMonitor {
protected static final Logger LOG = Logger.getLogger(AuroraClusterMonitor.class.getSimpleName());

protected static final int CHECK_FREQUENCY_MILLIS = 500; // TODO - make configurable
protected static final int MAXIMUM_THREAD_POOL_SIZE = 64;
protected static final SubmitterScheduler MONITOR_SCHEDULER;
protected static final SchedulerService MONITOR_SCHEDULER;
protected static final ConcurrentMap<AuroraServersKey, AuroraClusterMonitor> MONITORS;
private static volatile long CHECK_FREQUENCY_MILLIS = 500;

// One-time class initialization of the shared static state declared above.
static {
// Scheduler used for the monitoring tasks, obtained from the central threadly pool using
// MAXIMUM_THREAD_POOL_SIZE and the name "auroraMonitor".
MONITOR_SCHEDULER = CentralThreadlyPool.threadPool(MAXIMUM_THREAD_POOL_SIZE, "auroraMonitor");

// Registry of monitor instances keyed by their server set; starts out empty.
MONITORS = new ConcurrentHashMap<>();
}

/**
 * Sets or updates the delay between individual server status checks.  The default is 500ms;
 * lowering it allows failover events to be discovered sooner.  Because every client performs
 * these checks on its own, a very small delay can noticeably increase server load when there
 * are a lot of clients, so avoid making it smaller than needed.  Note that checks are already
 * expedited when the driver notices potential server stability issues.
 * <p>
 * The new delay is stored for later use and pushed to every monitor currently registered.
 * Calling this with the delay that is already configured does nothing.
 *
 * @param millis The milliseconds between server checks, must be greater than zero
 * @throws IllegalArgumentException if {@code millis} is not greater than zero
 */
public static void setServerCheckDelayMillis(long millis) {
  ArgumentVerifier.assertGreaterThanZero(millis, "millis");

  // class level lock so that concurrent updates can not interleave their rescheduling
  synchronized (AuroraClusterMonitor.class) {
    if (CHECK_FREQUENCY_MILLIS == millis) {
      return; // already using the requested delay, nothing to reschedule
    }

    CHECK_FREQUENCY_MILLIS = millis;

    for (AuroraClusterMonitor clusterMonitor : MONITORS.values()) {
      clusterMonitor.clusterStateChecker.updateServerCheckDelayMillis(millis);
    }
  }
}

/**
* Return a monitor instance for a given set of servers. This instance will be consistent as
Expand Down Expand Up @@ -71,7 +95,7 @@ protected static AuroraClusterMonitor getMonitor(DelegateAuroraDriver driver, Au
protected final ClusterChecker clusterStateChecker;
private final AtomicLong replicaIndex; // used to distribute replica reads

protected AuroraClusterMonitor(SubmitterScheduler scheduler, long checkIntervalMillis,
protected AuroraClusterMonitor(SchedulerService scheduler, long checkIntervalMillis,
DelegateAuroraDriver driver, AuroraServer[] clusterServers) {
clusterStateChecker = new ClusterChecker(scheduler, checkIntervalMillis, driver, clusterServers);
replicaIndex = new AtomicLong();
Expand Down Expand Up @@ -216,14 +240,14 @@ public boolean equals(Object o) {
* (and thus will witness the final cluster state).
*/
protected static class ClusterChecker extends ReschedulingOperation {
protected final SubmitterScheduler scheduler;
protected final SchedulerService scheduler;
protected final Map<AuroraServer, ServerMonitor> allServers;
protected final List<AuroraServer> secondaryServers;
protected final AtomicReference<AuroraServer> masterServer;
protected final CopyOnWriteArrayList<AuroraServer> serversWaitingExpeditiedCheck;
private volatile boolean initialized = false; // starts false to avoid updates while constructor is running

protected ClusterChecker(SubmitterScheduler scheduler, long checkIntervalMillis,
protected ClusterChecker(SchedulerService scheduler, long checkIntervalMillis,
DelegateAuroraDriver driver, AuroraServer[] clusterServers) {
super(scheduler, 0);

Expand All @@ -250,12 +274,7 @@ protected ClusterChecker(SubmitterScheduler scheduler, long checkIntervalMillis,
scheduler.execute(monitor);
}

scheduler.scheduleAtFixedRate(monitor,
// hopefully well distributed hash code will distribute
// these tasks so that they are not all checked at once
// we convert to a long to avoid a possible overflow at Integer.MIN_VALUE
Math.abs((long)System.identityHashCode(monitor)) % checkIntervalMillis,
checkIntervalMillis);
scheduleMonitor(monitor, checkIntervalMillis);
}
if (masterServer.get() == null) {
LOG.warning("No master server found! Will use read only servers till one becomes master");
Expand All @@ -266,7 +285,7 @@ protected ClusterChecker(SubmitterScheduler scheduler, long checkIntervalMillis,
}

// used in testing
protected ClusterChecker(SubmitterScheduler scheduler, long checkIntervalMillis,
protected ClusterChecker(SchedulerService scheduler,
Map<AuroraServer, ServerMonitor> clusterServers) {
super(scheduler, 0);

Expand All @@ -279,6 +298,34 @@ protected ClusterChecker(SubmitterScheduler scheduler, long checkIntervalMillis,
initialized = true;
}


/**
 * Update how often monitors check the individual servers status.
 * <p>
 * Every monitor is removed from the scheduler and then scheduled again using the new delay.
 * A monitor which can not be removed is left untouched (so it is never scheduled twice), but
 * the remaining monitors are still updated before the failure is reported.
 * <p>
 * This can NOT be called concurrently,
 * {@link AuroraClusterMonitor#setServerCheckDelayMillis(long)} currently guards this.
 *
 * @param millis The delay between runs for monitoring server status
 * @throws IllegalStateException if one or more monitors could not be unscheduled
 */
protected void updateServerCheckDelayMillis(long millis) {
  // lazily allocated, only needed when a removal fails
  StringBuilder notUnscheduled = null;
  for (ServerMonitor sm : allServers.values()) {
    if (scheduler.remove(sm)) {
      scheduleMonitor(sm, millis);
    } else if (notUnscheduled == null) {
      // don't abort here, otherwise the monitors after this one would keep the old delay
      notUnscheduled = new StringBuilder().append(sm);
    } else {
      notUnscheduled.append(", ").append(sm);
    }
  }

  if (notUnscheduled != null) {
    throw new IllegalStateException("Could not unschedule monitor: " + notUnscheduled);
  }
}

/**
 * Schedules the provided monitor on this checker's scheduler to run at a fixed rate.
 * <p>
 * The first run is offset by the monitor's identity hash code modulo the interval, so the
 * initial delay is always less than {@code checkIntervalMillis}.
 *
 * @param monitor The server monitor to be scheduled
 * @param checkIntervalMillis The milliseconds between runs, also bounds the initial delay
 */
protected void scheduleMonitor(ServerMonitor monitor, long checkIntervalMillis) {
  // widened to a long before Math.abs to avoid a possible overflow at Integer.MIN_VALUE
  long identityHash = System.identityHashCode(monitor);
  // hopefully well distributed hash code will distribute these tasks
  // so that they are not all checked at once
  long initialDelayMillis = Math.abs(identityHash) % checkIntervalMillis;

  scheduler.scheduleAtFixedRate(monitor, initialDelayMillis, checkIntervalMillis);
}

protected void expediteServerCheck(ServerMonitor serverMonitor) {
if (serversWaitingExpeditiedCheck.addIfAbsent(serverMonitor.server)) {
scheduler.execute(() -> {
Expand Down Expand Up @@ -374,6 +421,11 @@ protected ServerMonitor(DelegateAuroraDriver driver, AuroraServer server,
masterServer = false;
}

/**
 * Describes this monitor as the monitored server prefixed with {@code "m:"} when it is
 * currently flagged as the master, or {@code "r:"} when it is not.
 *
 * @return The prefixed string form of the monitored server
 */
@Override
public String toString() {
  final String rolePrefix;
  if (masterServer) {
    rolePrefix = "m:";
  } else {
    rolePrefix = "r:";
  }
  return rolePrefix + server;
}

protected void reconnect() throws SQLException {
Connection newConnection =
driver.connect(server.hostAndPortString() +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import org.threadly.db.aurora.AuroraClusterMonitor.ClusterChecker;
import org.threadly.db.aurora.AuroraClusterMonitor.ServerMonitor;
import org.threadly.test.concurrent.TestableScheduler;
import org.threadly.util.ExceptionHandler;

public class AuroraClusterMonitorClusterCheckerTest {
private static final TestableScheduler SERVER_MONITOR_SCHEDULER;
Expand All @@ -42,7 +43,7 @@ public class AuroraClusterMonitorClusterCheckerTest {
@Before
public void setup() {
testScheduler = new TestableScheduler();
clusterChecker = new ClusterChecker(testScheduler, 1000, CLUSTER_SERVERS);
clusterChecker = new ClusterChecker(testScheduler, CLUSTER_SERVERS);
SERVER_MONITOR_SCHEDULER.clearTasks();
for (ServerMonitor sm : CLUSTER_SERVERS.values()) {
((TestServerMonitor)sm).resetState();
Expand All @@ -55,6 +56,18 @@ public void cleanup() {
clusterChecker = null;
}

@Test
public void updateServerCheckDelayMillisTest() {
  // put every monitor on a 500ms schedule so that there is something to be replaced
  CLUSTER_SERVERS.values().forEach((sm) -> clusterChecker.scheduleMonitor(sm, 500));

  clusterChecker.updateServerCheckDelayMillis(100);

  // rescheduled at 100ms the initial delay is below 100ms,
  // so every monitor is expected to have run once time has advanced by 100ms
  assertEquals(CLUSTER_SERVERS.size(),
               testScheduler.advance(100, ExceptionHandler.IGNORE_HANDLER));
}

@Test
public void expediteServerCheckTest() {
AuroraServer testServer = new AuroraServer("host1", new Properties());
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package org.threadly.db.aurora;

import static org.junit.Assert.fail;

import org.junit.Test;

/**
 * Tests for the static configuration exposed by {@link AuroraClusterMonitor}.
 */
public class AuroraClusterMonitorTest {
  /**
   * A server check delay of zero must be rejected with an {@link IllegalArgumentException}.
   */
  @Test
  public void setServerCheckDelayMillisFail() {
    try {
      AuroraClusterMonitor.setServerCheckDelayMillis(0);
    } catch (IllegalArgumentException expected) {
      return; // rejected as required
    }
    fail("Exception expected");
  }
}

0 comments on commit 9714e86

Please sign in to comment.