Skip to content

Commit

Permalink
CLOUDSTACK-9782: Improve scheduling of jobs
Browse files Browse the repository at this point in the history
- Removed three bg thread tasks, uses FSM event-trigger based scheduling
- On successful recovery, kicks VM HA
- Improves overall HA scheduling and task submission, lower DB access

Signed-off-by: Rohit Yadav <rohit.yadav@shapeblue.com>
  • Loading branch information
rohityadavcloud committed Aug 30, 2017
1 parent c0b33db commit d2c3408
Show file tree
Hide file tree
Showing 13 changed files with 206 additions and 216 deletions.
4 changes: 4 additions & 0 deletions api/src/org/apache/cloudstack/ha/HAConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,10 @@ enum Event {
ActivityCheckFailureUnderThresholdRatio,
PowerCycle,
Recovered,
RetryRecovery,
RecoveryWaitPeriodTimeout,
RecoveryOperationThresholdExceeded,
RetryFencing,
Fenced;

public Long getServerId() {
Expand Down Expand Up @@ -123,6 +125,7 @@ public String getDescription() {

FSM.addTransition(Recovering, Event.Disabled, Disabled);
FSM.addTransition(Recovering, Event.Ineligible, Ineligible);
FSM.addTransition(Recovering, Event.RetryRecovery, Recovering);
FSM.addTransition(Recovering, Event.Recovered, Recovered);
FSM.addTransition(Recovering, Event.RecoveryOperationThresholdExceeded, Fencing);

Expand All @@ -132,6 +135,7 @@ public String getDescription() {

FSM.addTransition(Fencing, Event.Disabled, Disabled);
FSM.addTransition(Fencing, Event.Ineligible, Ineligible);
FSM.addTransition(Fencing, Event.RetryFencing, Fencing);
FSM.addTransition(Fencing, Event.Fenced, Fenced);

FSM.addTransition(Fenced, Event.Disabled, Disabled);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ public Boolean checkingHB() {
OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser();
String result = cmd.execute(parser);
s_logger.debug("KVMHAChecker pool: " + pool._poolIp);
s_logger.debug("KVMHAChecker reture: " + result);
s_logger.debug("KVMHAChecker result: " + result);
s_logger.debug("KVMHAChecker parser: " + parser.getLine());
if (result == null && parser.getLine().contains("> DEAD <")) {
s_logger.debug("read heartbeat failed: ");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ public HAResource.ResourceSubType resourceSubType() {

@Override
public boolean isEligible(final Host host) {
if (host == null) {
return false;
}
final SimulatorHAState haState = hostHAStateMap.get(host.getId());
return !isInMaintenanceMode(host) && !isDisabled(host) && haState != null
&& Hypervisor.HypervisorType.Simulator.equals(host.getHypervisorType());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
<dependency>
<groupId>br.com.autonomiccs</groupId>
<artifactId>apache-cloudstack-java-client</artifactId>
<version>1.0.4</version>
<version>1.0.5</version>
</dependency>
</dependencies>
</project>
309 changes: 133 additions & 176 deletions server/src/org/apache/cloudstack/ha/HAManagerImpl.java

Large diffs are not rendered by default.

11 changes: 7 additions & 4 deletions server/src/org/apache/cloudstack/ha/HAResourceCounter.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ public long getRecoveryCounter() {
}

public synchronized void incrActivityCounter(final boolean isFailure) {
lastActivityCheckTimestamp = System.currentTimeMillis();
activityCheckCounter.incrementAndGet();
if (isFailure) {
activityCheckFailureCounter.incrementAndGet();
Expand Down Expand Up @@ -71,8 +70,12 @@ public boolean hasActivityThresholdExceeded(final double failureRatio) {
return activityCheckFailureCounter.get() > (activityCheckCounter.get() * failureRatio);
}

public boolean canPerformActivityCheck(final Long activityCheckInterval) {
return lastActivityCheckTimestamp == null || (System.currentTimeMillis() - lastActivityCheckTimestamp) > (activityCheckInterval * 1000);
public synchronized boolean canPerformActivityCheck(final Long activityCheckInterval) {
if (lastActivityCheckTimestamp == null || (System.currentTimeMillis() - lastActivityCheckTimestamp) > (activityCheckInterval * 1000)) {
lastActivityCheckTimestamp = System.currentTimeMillis();
return true;
}
return false;
}

public boolean canRecheckActivity(final Long maxDegradedPeriod) {
Expand Down Expand Up @@ -121,7 +124,7 @@ public void setFenceFuture(final Future<Boolean> future) {
fenceFuture = future;
}

public boolean lastFencingCompleted() {
public boolean canAttemptFencing() {
return fenceFuture == null || fenceFuture.isDone();
}

Expand Down
9 changes: 5 additions & 4 deletions server/src/org/apache/cloudstack/ha/provider/HAProvider.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,11 @@

package org.apache.cloudstack.ha.provider;

import com.cloud.utils.component.Adapter;

import org.apache.cloudstack.ha.HAConfig;
import org.apache.cloudstack.ha.HAResource;
import org.joda.time.DateTime;

import org.apache.cloudstack.ha.HAResource;
import com.cloud.utils.component.Adapter;

public interface HAProvider<R extends HAResource> extends Adapter {

Expand Down Expand Up @@ -57,7 +56,9 @@ enum HAProviderConfig {

boolean fence(R r) throws HAFenceException;

void setFenced(R r);
void fenceSubResources(R r);

void enableMaintenance(R r);

void sendAlert(R r, HAConfig.HAState nextState);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ public boolean isInMaintenanceMode(final Host host) {
}

@Override
public void setFenced(final Host r) {
public void fenceSubResources(final Host r) {
if (r.getState() != Status.Down) {
try {
LOG.debug("Trying to disconnect the host without investigation and scheduling HA for the VMs on host id=" + r.getId());
Expand All @@ -80,11 +80,15 @@ public void setFenced(final Host r) {
} catch (Exception e) {
LOG.error("Failed to disconnect host and schedule HA restart of VMs after fencing the host: ", e);
}
try {
resourceManager.resourceStateTransitTo(r, ResourceState.Event.InternalEnterMaintenance, ManagementServerNode.getManagementServerId());
} catch (NoTransitionException e) {
LOG.error("Failed to put host in maintenance mode after host-ha fencing and scheduling VM-HA: ", e);
}
}
}

@Override
public void enableMaintenance(final Host r) {
try {
resourceManager.resourceStateTransitTo(r, ResourceState.Event.InternalEnterMaintenance, ManagementServerNode.getManagementServerId());
} catch (NoTransitionException e) {
LOG.error("Failed to put host in maintenance mode after host-ha fencing and scheduling VM-HA: ", e);
}
}

Expand Down
25 changes: 13 additions & 12 deletions server/src/org/apache/cloudstack/ha/task/ActivityCheckTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@

package org.apache.cloudstack.ha.task;

import java.util.concurrent.ExecutorService;

import javax.inject.Inject;

import org.apache.cloudstack.ha.HAConfig;
import org.apache.cloudstack.ha.HAManager;
import org.apache.cloudstack.ha.HAResource;
Expand All @@ -25,11 +29,7 @@
import org.apache.cloudstack.ha.provider.HAProvider;
import org.apache.cloudstack.ha.provider.HAProvider.HAProviderConfig;
import org.apache.log4j.Logger;

import javax.inject.Inject;

import org.joda.time.DateTime;
import java.util.concurrent.ExecutorService;

public class ActivityCheckTask extends BaseHATask {

Expand All @@ -38,22 +38,24 @@ public class ActivityCheckTask extends BaseHATask {
@Inject
private HAManager haManager;

private final long disconnectTime;
private long disconnectTime;
private long maxActivityChecks;
private double activityCheckFailureRatio;

public ActivityCheckTask(final HAResource resource, final HAProvider<HAResource> haProvider, final HAConfig haConfig, final HAProvider.HAProviderConfig haProviderConfig,
final ExecutorService executor, final long disconnectTime) {
super(resource, haProvider, haConfig, haProviderConfig, executor);
this.disconnectTime = disconnectTime;
this.maxActivityChecks = (Long)haProvider.getConfigValue(HAProviderConfig.MaxActivityChecks, resource);
this.activityCheckFailureRatio = (Double)haProvider.getConfigValue(HAProviderConfig.ActivityCheckFailureRatio, resource);
}

public boolean performAction() throws HACheckerException {
return getHaProvider().hasActivity(getResource(), new DateTime(disconnectTime));
}

public void processResult(boolean result, Throwable t) {
public synchronized void processResult(boolean result, Throwable t) {
final HAConfig haConfig = getHaConfig();
final HAProvider<HAResource> haProvider = getHaProvider();
final HAResource resource = getResource();
final HAResourceCounter counter = haManager.getHACounter(haConfig.getResourceId(), haConfig.getResourceType());

if (t != null && t instanceof HACheckerException) {
Expand All @@ -64,18 +66,17 @@ public void processResult(boolean result, Throwable t) {

counter.incrActivityCounter(!result);

long maxActivityChecks = (Long)haProvider.getConfigValue(HAProviderConfig.MaxActivityChecks, resource);
if (counter.getActivityCheckCounter() < maxActivityChecks) {
haManager.transitionHAState(HAConfig.Event.TooFewActivityCheckSamples, haConfig);
return;
}

double activityCheckFailureRatio = (Double)haProvider.getConfigValue(HAProviderConfig.ActivityCheckFailureRatio, resource);
if (counter.hasActivityThresholdExceeded(activityCheckFailureRatio)) {
haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureOverThresholdRatio, haConfig);
} else {
haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio, haConfig);
counter.markResourceDegraded();
if (haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio, haConfig)) {
counter.markResourceDegraded();
}
}
counter.resetActivityCounter();
}
Expand Down
23 changes: 16 additions & 7 deletions server/src/org/apache/cloudstack/ha/task/BaseHATask.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,21 @@

package org.apache.cloudstack.ha.task;

import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.cloudstack.ha.HAConfig;
import org.apache.cloudstack.ha.HAResource;
import org.apache.cloudstack.ha.provider.HACheckerException;
import org.apache.cloudstack.ha.provider.HAFenceException;
import org.apache.cloudstack.ha.provider.HAProvider;
import org.apache.cloudstack.ha.provider.HARecoveryException;
import org.apache.log4j.Logger;

import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.joda.time.DateTime;

public abstract class BaseHATask implements Callable<Boolean> {
public static final Logger LOG = Logger.getLogger(BaseHATask.class);
Expand All @@ -40,6 +41,7 @@ public abstract class BaseHATask implements Callable<Boolean> {
private final HAConfig haConfig;
private final ExecutorService executor;
private Long timeout;
private DateTime created;

public BaseHATask(final HAResource resource, final HAProvider<HAResource> haProvider, final HAConfig haConfig, final HAProvider.HAProviderConfig haProviderConfig,
final ExecutorService executor) {
Expand All @@ -48,6 +50,7 @@ public BaseHATask(final HAResource resource, final HAProvider<HAResource> haProv
this.haConfig = haConfig;
this.executor = executor;
this.timeout = (Long)haProvider.getConfigValue(haProviderConfig, resource);
this.created = new DateTime();
}

public HAProvider<HAResource> getHaProvider() {
Expand All @@ -74,6 +77,9 @@ public boolean performAction() throws HACheckerException, HAFenceException, HARe

@Override
public Boolean call() {
if (new DateTime().minusHours(1).isAfter(getCreated())) {
return false;
}
final Future<Boolean> future = executor.submit(new Callable<Boolean>() {
@Override
public Boolean call() throws HACheckerException, HAFenceException, HARecoveryException {
Expand All @@ -99,4 +105,7 @@ public Boolean call() throws HACheckerException, HAFenceException, HARecoveryExc
return result;
}

public DateTime getCreated() {
return created;
}
}
3 changes: 2 additions & 1 deletion server/src/org/apache/cloudstack/ha/task/FenceTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ public void processResult(boolean result, Throwable e) {
if (result) {
counter.resetRecoveryCounter();
haManager.transitionHAState(HAConfig.Event.Fenced, haConfig);
getHaProvider().setFenced(getResource());
getHaProvider().fenceSubResources(getResource());
getHaProvider().enableMaintenance(getResource());
}
getHaProvider().sendAlert(getResource(), HAConfig.HAState.Fencing);
}
Expand Down
13 changes: 10 additions & 3 deletions server/src/org/apache/cloudstack/ha/task/RecoveryTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,18 @@

package org.apache.cloudstack.ha.task;

import java.util.concurrent.ExecutorService;

import javax.inject.Inject;

import org.apache.cloudstack.ha.HAConfig;
import org.apache.cloudstack.ha.HAManager;
import org.apache.cloudstack.ha.HAResource;
import org.apache.cloudstack.ha.HAResourceCounter;
import org.apache.cloudstack.ha.provider.HACheckerException;
import org.apache.cloudstack.ha.provider.HAProvider;
import org.apache.cloudstack.ha.provider.HARecoveryException;

import javax.inject.Inject;
import java.util.concurrent.ExecutorService;

public class RecoveryTask extends BaseHATask {

@Inject
Expand All @@ -43,8 +45,13 @@ public boolean performAction() throws HACheckerException, HARecoveryException {

public void processResult(boolean result, Throwable e) {
final HAConfig haConfig = getHaConfig();
final HAResourceCounter counter = haManager.getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
counter.incrRecoveryCounter();
counter.resetActivityCounter();

if (result) {
haManager.transitionHAState(HAConfig.Event.Recovered, haConfig);
getHaProvider().fenceSubResources(getResource());
}
getHaProvider().sendAlert(getResource(), HAConfig.HAState.Recovering);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ private void checkOutOfBandManagementEnabledByZoneClusterHost(final Host host) {
}

public boolean isOutOfBandManagementEnabled(final Host host) {
return isOutOfBandManagementEnabledForZone(host.getDataCenterId())
return host != null && isOutOfBandManagementEnabledForZone(host.getDataCenterId())
&& isOutOfBandManagementEnabledForCluster(host.getClusterId())
&& isOutOfBandManagementEnabledForHost(host.getId());
}
Expand Down

0 comments on commit d2c3408

Please sign in to comment.