diff --git a/api/src/org/apache/cloudstack/ha/HAConfig.java b/api/src/org/apache/cloudstack/ha/HAConfig.java
index 36fe11c410d9..95b5c9bdfd7b 100644
--- a/api/src/org/apache/cloudstack/ha/HAConfig.java
+++ b/api/src/org/apache/cloudstack/ha/HAConfig.java
@@ -47,8 +47,10 @@ enum Event {
ActivityCheckFailureUnderThresholdRatio,
PowerCycle,
Recovered,
+ RetryRecovery,
RecoveryWaitPeriodTimeout,
RecoveryOperationThresholdExceeded,
+ RetryFencing,
Fenced;
public Long getServerId() {
@@ -123,6 +125,7 @@ public String getDescription() {
FSM.addTransition(Recovering, Event.Disabled, Disabled);
FSM.addTransition(Recovering, Event.Ineligible, Ineligible);
+ FSM.addTransition(Recovering, Event.RetryRecovery, Recovering);
FSM.addTransition(Recovering, Event.Recovered, Recovered);
FSM.addTransition(Recovering, Event.RecoveryOperationThresholdExceeded, Fencing);
@@ -132,6 +135,7 @@ public String getDescription() {
FSM.addTransition(Fencing, Event.Disabled, Disabled);
FSM.addTransition(Fencing, Event.Ineligible, Ineligible);
+ FSM.addTransition(Fencing, Event.RetryFencing, Fencing);
FSM.addTransition(Fencing, Event.Fenced, Fenced);
FSM.addTransition(Fenced, Event.Disabled, Disabled);
diff --git a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java
index c99670ceaff7..3905b1ede304 100644
--- a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java
+++ b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java
@@ -54,7 +54,7 @@ public Boolean checkingHB() {
OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser();
String result = cmd.execute(parser);
s_logger.debug("KVMHAChecker pool: " + pool._poolIp);
- s_logger.debug("KVMHAChecker reture: " + result);
+ s_logger.debug("KVMHAChecker result: " + result);
s_logger.debug("KVMHAChecker parser: " + parser.getLine());
if (result == null && parser.getLine().contains("> DEAD <")) {
s_logger.debug("read heartbeat failed: ");
diff --git a/plugins/hypervisors/simulator/src/org/apache/cloudstack/ha/SimulatorHAProvider.java b/plugins/hypervisors/simulator/src/org/apache/cloudstack/ha/SimulatorHAProvider.java
index 3c3e92f6fffd..02f4e6531157 100644
--- a/plugins/hypervisors/simulator/src/org/apache/cloudstack/ha/SimulatorHAProvider.java
+++ b/plugins/hypervisors/simulator/src/org/apache/cloudstack/ha/SimulatorHAProvider.java
@@ -72,6 +72,9 @@ public HAResource.ResourceSubType resourceSubType() {
@Override
public boolean isEligible(final Host host) {
+ if (host == null) {
+ return false;
+ }
final SimulatorHAState haState = hostHAStateMap.get(host.getId());
return !isInMaintenanceMode(host) && !isDisabled(host) && haState != null
&& Hypervisor.HypervisorType.Simulator.equals(host.getHypervisorType());
diff --git a/plugins/outofbandmanagement-drivers/nested-cloudstack/pom.xml b/plugins/outofbandmanagement-drivers/nested-cloudstack/pom.xml
index 34a631a837ad..6759610ab4c2 100644
--- a/plugins/outofbandmanagement-drivers/nested-cloudstack/pom.xml
+++ b/plugins/outofbandmanagement-drivers/nested-cloudstack/pom.xml
@@ -40,7 +40,7 @@
br.com.autonomiccs
apache-cloudstack-java-client
- 1.0.4
+ 1.0.5
diff --git a/server/src/org/apache/cloudstack/ha/HAManagerImpl.java b/server/src/org/apache/cloudstack/ha/HAManagerImpl.java
index ad3438b9e600..c2ba528068fe 100644
--- a/server/src/org/apache/cloudstack/ha/HAManagerImpl.java
+++ b/server/src/org/apache/cloudstack/ha/HAManagerImpl.java
@@ -17,32 +17,20 @@
package org.apache.cloudstack.ha;
-import com.cloud.cluster.ClusterManagerListener;
-import com.cloud.cluster.ManagementServerHost;
-import com.cloud.dc.ClusterDetailsDao;
-import com.cloud.dc.ClusterDetailsVO;
-import com.cloud.dc.DataCenter;
-import com.cloud.dc.DataCenterDetailVO;
-import com.cloud.dc.dao.DataCenterDetailsDao;
-import com.cloud.domain.Domain;
-import com.cloud.event.ActionEvent;
-import com.cloud.event.ActionEventUtils;
-import com.cloud.event.EventTypes;
-import com.cloud.ha.Investigator;
-import com.cloud.host.Host;
-import com.cloud.host.Status;
-import com.cloud.host.dao.HostDao;
-import com.cloud.org.Cluster;
-import com.cloud.utils.component.ComponentContext;
-import com.cloud.utils.component.ManagerBase;
-import com.cloud.utils.component.PluggableService;
-import com.cloud.utils.db.Transaction;
-import com.cloud.utils.db.TransactionCallback;
-import com.cloud.utils.db.TransactionStatus;
-import com.cloud.utils.exception.CloudRuntimeException;
-import com.cloud.utils.fsm.NoTransitionException;
-import com.google.common.base.Preconditions;
-import com.google.common.base.Strings;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+import javax.inject.Inject;
+import javax.naming.ConfigurationException;
+
import org.apache.cloudstack.api.ApiErrorCode;
import org.apache.cloudstack.api.ServerApiException;
import org.apache.cloudstack.api.command.admin.ha.ConfigureHAForHostCmd;
@@ -71,20 +59,36 @@
import org.apache.cloudstack.utils.identity.ManagementServerNode;
import org.apache.log4j.Logger;
-import javax.inject.Inject;
-import javax.naming.ConfigurationException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-import java.util.concurrent.ThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
+import com.cloud.cluster.ClusterManagerListener;
+import com.cloud.cluster.ManagementServerHost;
+import com.cloud.dc.ClusterDetailsDao;
+import com.cloud.dc.ClusterDetailsVO;
+import com.cloud.dc.DataCenter;
+import com.cloud.dc.DataCenterDetailVO;
+import com.cloud.dc.dao.DataCenterDetailsDao;
+import com.cloud.domain.Domain;
+import com.cloud.event.ActionEvent;
+import com.cloud.event.ActionEventUtils;
+import com.cloud.event.EventTypes;
+import com.cloud.ha.Investigator;
+import com.cloud.host.Host;
+import com.cloud.host.Status;
+import com.cloud.host.dao.HostDao;
+import com.cloud.org.Cluster;
+import com.cloud.utils.component.ComponentContext;
+import com.cloud.utils.component.ManagerBase;
+import com.cloud.utils.component.PluggableService;
+import com.cloud.utils.db.Transaction;
+import com.cloud.utils.db.TransactionCallback;
+import com.cloud.utils.db.TransactionStatus;
+import com.cloud.utils.exception.CloudRuntimeException;
+import com.cloud.utils.fsm.NoTransitionException;
+import com.cloud.utils.fsm.StateListener;
+import com.cloud.utils.fsm.StateMachine2;
+import com.google.common.base.Preconditions;
+import com.google.common.base.Strings;
-public final class HAManagerImpl extends ManagerBase implements HAManager, ClusterManagerListener, PluggableService, Configurable {
+public final class HAManagerImpl extends ManagerBase implements HAManager, ClusterManagerListener, PluggableService, Configurable, StateListener {
public static final Logger LOG = Logger.getLogger(HAManagerImpl.class);
@Inject
@@ -307,7 +311,7 @@ public Status getHostStatus(final Host host) {
LOG.debug("HA: Agent is available/suspect/checking Up " + host.getId());
}
return Status.Down;
- } else if (haConfig.getState() == HAConfig.HAState.Degraded || haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() == HAConfig.HAState.Recovered || haConfig.getState() == HAConfig.HAState.Fencing) {
+ } else if (haConfig.getState() == HAConfig.HAState.Degraded || haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() == HAConfig.HAState.Fencing) {
if (LOG.isDebugEnabled()){
LOG.debug("HA: Agent is disconnected " + host.getId());
}
@@ -455,23 +459,84 @@ public List> getCommands() {
return cmdList;
}
- //////////////////////////////////////////////////////////////////
- //////////////// Clustered Manager Listeners /////////////////////
- //////////////////////////////////////////////////////////////////
+ //////////////////////////////////////////////////////
+ //////////////// Event Listeners /////////////////////
+ //////////////////////////////////////////////////////
@Override
public void onManagementNodeJoined(List extends ManagementServerHost> nodeList, long selfNodeId) {
-
}
@Override
public void onManagementNodeLeft(List extends ManagementServerHost> nodeList, long selfNodeId) {
-
}
@Override
public void onManagementNodeIsolated() {
+ }
+
+ /**
+ * Submits the HA task matching the state currently reported by {@code haConfig}: an activity
+ * check for Checking, a recovery attempt for Recovering, a fence operation for Fencing.
+ * Both state-transition callbacks of this class delegate to this method.
+ *
+ * @param haConfig HA configuration whose current state selects the task to submit
+ * @param status transition status handed over by the callback; nothing is done when it is false
+ * @return false when {@code status} is false, the ownership check fails, the resource or provider
+ * cannot be resolved, or the recovery counter has reached MaxRecoveryAttempts; true otherwise,
+ * including states for which no task is submitted
+ */
+ private boolean processHAStateChange(final HAConfig haConfig, final boolean status) {
+ if (!status || !checkHAOwnership(haConfig)) {
+ return false;
+ }
+
+ final HAResource resource = validateAndFindHAResource(haConfig);
+ if (resource == null) {
+ return false;
+ }
+
+ final HAProvider haProvider = validateAndFindHAProvider(haConfig, resource);
+ if (haProvider == null) {
+ return false;
+ }
+
+ final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
+
+ // Perform activity checks
+ if (haConfig.getState() == HAConfig.HAState.Checking) {
+ final ActivityCheckTask job = ComponentContext.inject(new ActivityCheckTask(resource, haProvider, haConfig,
+ HAProviderConfig.ActivityCheckTimeout, activityCheckExecutor, counter.getSuspectTimeStamp()));
+ activityCheckExecutor.submit(job);
+ }
+
+ // Attempt recovery
+ if (haConfig.getState() == HAConfig.HAState.Recovering) {
+ // Stop submitting recovery tasks once the configured attempt limit has been reached.
+ // NOTE(review): no check that a previously submitted recovery task has finished is visible
+ // here, whereas the fencing retry in the poll task is gated by canAttemptFencing() -
+ // confirm that overlapping recovery tasks for one resource are acceptable.
+ if (counter.getRecoveryCounter() >= (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) {
+ return false;
+ }
+ final RecoveryTask task = ComponentContext.inject(new RecoveryTask(resource, haProvider, haConfig,
+ HAProviderConfig.RecoveryTimeout, recoveryExecutor));
+ final Future recoveryFuture = recoveryExecutor.submit(task);
+ counter.setRecoveryFuture(recoveryFuture);
+ }
+
+ // Fencing
+ if (haConfig.getState() == HAConfig.HAState.Fencing) {
+ final FenceTask task = ComponentContext.inject(new FenceTask(resource, haProvider, haConfig,
+ HAProviderConfig.FenceTimeout, fenceExecutor));
+ final Future fenceFuture = fenceExecutor.submit(task);
+ // The counter keeps the future; canAttemptFencing() reports true again once it is done.
+ counter.setFenceFuture(fenceFuture);
+ }
+ return true;
+ }
+
+ /**
+ * Pre-transition callback of the state listener. Acts only on self-transitions (old state equals
+ * new state) whose state is neither Suspect nor Checking; for those it delegates to
+ * processHAStateChange. Every other transition returns false without doing anything.
+ *
+ * @param oldState state before the transition
+ * @param event event that triggered the transition (not used in this method)
+ * @param newState state after the transition
+ * @param haConfig HA configuration being transitioned
+ * @param status transition status, forwarded to processHAStateChange
+ * @param opaque caller-supplied object (not used in this method)
+ * @return false for transitions that are skipped here, otherwise the result of processHAStateChange
+ */
+ @Override
+ public boolean preStateTransitionEvent(final HAConfig.HAState oldState, final HAConfig.Event event, final HAConfig.HAState newState, final HAConfig haConfig, final boolean status, final Object opaque) {
+ // NOTE(review): presumably self-transitions such as RetryRecovery/RetryFencing are handled here
+ // because the post-transition callback is not reached for them - confirm against the state machine/DAO.
+ if (oldState != newState || newState == HAConfig.HAState.Suspect || newState == HAConfig.HAState.Checking) {
+ return false;
+ }
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("HA state pre-transition:: new state=" + newState + ", old state=" + oldState + ", for resource id=" + haConfig.getResourceId() + ", status=" + status + ", ha config state=" + haConfig.getState());
+ }
+ return processHAStateChange(haConfig, status);
+ }
+ /**
+ * Post-transition callback of the state listener. Logs the transition at trace level and
+ * delegates unconditionally to processHAStateChange.
+ *
+ * @param transition the transition that was applied; used only for the trace message
+ * @param haConfig HA configuration that was transitioned
+ * @param status transition status, forwarded to processHAStateChange
+ * @param opaque caller-supplied object (not used in this method)
+ * @return the result of processHAStateChange
+ */
+ @Override
+ public boolean postStateTransitionEvent(final StateMachine2.Transition transition, final HAConfig haConfig, final boolean status, final Object opaque) {
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("HA state post-transition:: new state=" + transition.getToState() + ", old state=" + transition.getCurrentState() + ", for resource id=" + haConfig.getResourceId() + ", status=" + status + ", ha config state=" + haConfig.getState());
+ }
+ return processHAStateChange(haConfig, status);
}
///////////////////////////////////////////////////
@@ -523,10 +588,8 @@ public boolean configure(final String name, final Map params) th
0L, TimeUnit.MILLISECONDS,
new ArrayBlockingQueue(fenceOperationQueueSize, true), new ThreadPoolExecutor.CallerRunsPolicy());
- pollManager.submitTask(new HealthCheckPollTask());
- pollManager.submitTask(new ActivityCheckPollTask());
- pollManager.submitTask(new RecoveryPollTask());
- pollManager.submitTask(new FencingPollTask());
+ pollManager.submitTask(new HAManagerBgPollTask());
+ HAConfig.HAState.getStateMachine().registerListener(this);
LOG.debug("HA manager has been configured");
return true;
@@ -559,7 +622,7 @@ public ConfigKey>[] getConfigKeys() {
//////////////// Poll Tasks /////////////////////
/////////////////////////////////////////////////
- private final class HealthCheckPollTask extends ManagedContextRunnable implements BackgroundPollTask {
+ private final class HAManagerBgPollTask extends ManagedContextRunnable implements BackgroundPollTask {
@Override
protected void runInContext() {
try {
@@ -582,20 +645,6 @@ protected void runInContext() {
continue;
}
- final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
-
- if (haConfig.getState() == HAConfig.HAState.Suspect) {
- if (counter.canPerformActivityCheck((Long)(haProvider.getConfigValue(HAProviderConfig.MaxActivityCheckInterval, resource)))) {
- transitionHAState(HAConfig.Event.PerformActivityCheck, haConfig);
- }
- }
-
- if (haConfig.getState() == HAConfig.HAState.Degraded) {
- if (counter.canRecheckActivity((Long)(haProvider.getConfigValue(HAProviderConfig.MaxDegradedWaitTimeout, resource)))) {
- transitionHAState(HAConfig.Event.PeriodicRecheckResourceActivity, haConfig);
- }
- }
-
switch (haConfig.getState()) {
case Available:
case Suspect:
@@ -608,136 +657,44 @@ protected void runInContext() {
default:
break;
}
- }
- } catch (Throwable t) {
- LOG.error("Error trying to perform health checks in HA manager", t);
- }
- }
- }
- private final class ActivityCheckPollTask extends ManagedContextRunnable implements BackgroundPollTask {
- @Override
- protected void runInContext() {
- try {
- if (LOG.isTraceEnabled()) {
- LOG.trace("HA activity check task is running...");
- }
- final List haConfigList = new ArrayList(haConfigDao.listAll());
- for (final HAConfig haConfig : haConfigList) {
- if (!checkHAOwnership(haConfig)) {
- continue;
- }
-
- final HAResource resource = validateAndFindHAResource(haConfig);
- if (resource == null) {
- continue;
- }
-
- final HAProvider haProvider = validateAndFindHAProvider(haConfig, resource);
- if (haProvider == null) {
- continue;
- }
-
- if (haConfig.getState() == HAConfig.HAState.Checking) {
- final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
- final ActivityCheckTask job = ComponentContext.inject(new ActivityCheckTask(resource, haProvider, haConfig,
- HAProviderConfig.ActivityCheckTimeout, activityCheckExecutor, counter.getSuspectTimeStamp()));
- activityCheckExecutor.submit(job);
- }
- }
- } catch (Throwable t) {
- LOG.error("Error trying to perform activity checks in HA manager", t);
- }
- }
- }
-
- private final class RecoveryPollTask extends ManagedContextRunnable implements BackgroundPollTask {
- @Override
- protected void runInContext() {
- try {
- if (LOG.isTraceEnabled()) {
- LOG.trace("HA recovery task is running...");
- }
- final List haConfigList = new ArrayList(haConfigDao.listAll());
- for (final HAConfig haConfig : haConfigList) {
- if (!checkHAOwnership(haConfig)) {
- continue;
- }
+ final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
- final HAResource resource = validateAndFindHAResource(haConfig);
- if (resource == null) {
- continue;
+ if (haConfig.getState() == HAConfig.HAState.Suspect) {
+ if (counter.canPerformActivityCheck((Long)(haProvider.getConfigValue(HAProviderConfig.MaxActivityCheckInterval, resource)))) {
+ transitionHAState(HAConfig.Event.PerformActivityCheck, haConfig);
+ }
}
- final HAProvider haProvider = validateAndFindHAProvider(haConfig, resource);
- if (haProvider == null) {
- continue;
+ if (haConfig.getState() == HAConfig.HAState.Degraded) {
+ if (counter.canRecheckActivity((Long)(haProvider.getConfigValue(HAProviderConfig.MaxDegradedWaitTimeout, resource)))) {
+ transitionHAState(HAConfig.Event.PeriodicRecheckResourceActivity, haConfig);
+ }
}
- final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
if (haConfig.getState() == HAConfig.HAState.Recovering) {
- if (counter.canAttemptRecovery()) {
- if (counter.getRecoveryCounter() >= (Long)(haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) {
- transitionHAState(HAConfig.Event.RecoveryOperationThresholdExceeded, haConfig);
- continue;
- }
-
- final RecoveryTask task = ComponentContext.inject(new RecoveryTask(resource, haProvider, haConfig,
- HAProviderConfig.RecoveryTimeout, recoveryExecutor));
- final Future recoveryFuture = recoveryExecutor.submit(task);
- counter.setRecoveryFuture(recoveryFuture);
- counter.incrRecoveryCounter();
+ if (counter.getRecoveryCounter() >= (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) {
+ transitionHAState(HAConfig.Event.RecoveryOperationThresholdExceeded, haConfig);
+ } else {
+ transitionHAState(HAConfig.Event.RetryRecovery, haConfig);
}
}
+
if (haConfig.getState() == HAConfig.HAState.Recovered) {
counter.markRecoveryStarted();
if (counter.canExitRecovery((Long)(haProvider.getConfigValue(HAProviderConfig.RecoveryWaitTimeout, resource)))) {
- transitionHAState(HAConfig.Event.RecoveryWaitPeriodTimeout, haConfig);
- counter.markRecoveryCompleted();
+ if (transitionHAState(HAConfig.Event.RecoveryWaitPeriodTimeout, haConfig)) {
+ counter.markRecoveryCompleted();
+ }
}
}
- }
- } catch (Throwable t) {
- LOG.error("Error trying to perform recovery operation in HA manager", t);
- }
- }
- }
-
- private final class FencingPollTask extends ManagedContextRunnable implements BackgroundPollTask {
- @Override
- protected void runInContext() {
- try {
- if (LOG.isTraceEnabled()) {
- LOG.trace("HA fencing task is running...");
- }
- final List haConfigList = new ArrayList(haConfigDao.listAll());
- for (final HAConfig haConfig : haConfigList) {
- if (!checkHAOwnership(haConfig)) {
- continue;
- }
- final HAResource resource = validateAndFindHAResource(haConfig);
- if (resource == null) {
- continue;
- }
-
- final HAProvider haProvider = validateAndFindHAProvider(haConfig, resource);
- if (haProvider == null) {
- continue;
- }
-
- final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
- if (counter.lastFencingCompleted()) {
- if (haConfig.getState() == HAConfig.HAState.Fencing) {
- final FenceTask task = ComponentContext.inject(new FenceTask(resource, haProvider, haConfig,
- HAProviderConfig.FenceTimeout, fenceExecutor));
- final Future fenceFuture = fenceExecutor.submit(task);
- counter.setFenceFuture(fenceFuture);
- }
+ if (haConfig.getState() == HAConfig.HAState.Fencing && counter.canAttemptFencing()) {
+ transitionHAState(HAConfig.Event.RetryFencing, haConfig);
}
}
} catch (Throwable t) {
- LOG.error("Error trying to perform fencing operation in HA manager", t);
+ LOG.error("Error trying to perform health checks in HA manager", t);
}
}
}
diff --git a/server/src/org/apache/cloudstack/ha/HAResourceCounter.java b/server/src/org/apache/cloudstack/ha/HAResourceCounter.java
index f955fd2f8fda..f493f6926e07 100644
--- a/server/src/org/apache/cloudstack/ha/HAResourceCounter.java
+++ b/server/src/org/apache/cloudstack/ha/HAResourceCounter.java
@@ -41,7 +41,6 @@ public long getRecoveryCounter() {
}
public synchronized void incrActivityCounter(final boolean isFailure) {
- lastActivityCheckTimestamp = System.currentTimeMillis();
activityCheckCounter.incrementAndGet();
if (isFailure) {
activityCheckFailureCounter.incrementAndGet();
@@ -71,8 +70,12 @@ public boolean hasActivityThresholdExceeded(final double failureRatio) {
return activityCheckFailureCounter.get() > (activityCheckCounter.get() * failureRatio);
}
- public boolean canPerformActivityCheck(final Long activityCheckInterval) {
- return lastActivityCheckTimestamp == null || (System.currentTimeMillis() - lastActivityCheckTimestamp) > (activityCheckInterval * 1000);
+ /**
+ * Reports whether a new activity check may start and, when it may, records the current time as
+ * the last activity check timestamp. A check is allowed when no timestamp has been recorded yet
+ * or more than {@code activityCheckInterval} seconds have passed since the recorded one.
+ *
+ * @param activityCheckInterval minimum spacing between checks, in seconds (multiplied by 1000
+ * before being compared with a millisecond difference)
+ * @return true when a check may be performed now, false otherwise
+ */
+ public synchronized boolean canPerformActivityCheck(final Long activityCheckInterval) {
+ if (lastActivityCheckTimestamp == null || (System.currentTimeMillis() - lastActivityCheckTimestamp) > (activityCheckInterval * 1000)) {
+ // Side effect: a positive answer also starts the next interval.
+ lastActivityCheckTimestamp = System.currentTimeMillis();
+ return true;
+ }
+ return false;
}
public boolean canRecheckActivity(final Long maxDegradedPeriod) {
@@ -121,7 +124,7 @@ public void setFenceFuture(final Future future) {
fenceFuture = future;
}
- public boolean lastFencingCompleted() {
+ /**
+ * Reports whether a fence operation may be submitted.
+ *
+ * @return true when no fence future has been recorded or the recorded future is done
+ */
+ public boolean canAttemptFencing() {
return fenceFuture == null || fenceFuture.isDone();
}
diff --git a/server/src/org/apache/cloudstack/ha/provider/HAProvider.java b/server/src/org/apache/cloudstack/ha/provider/HAProvider.java
index bcc590c965ff..9a7f27c003ec 100644
--- a/server/src/org/apache/cloudstack/ha/provider/HAProvider.java
+++ b/server/src/org/apache/cloudstack/ha/provider/HAProvider.java
@@ -17,12 +17,11 @@
package org.apache.cloudstack.ha.provider;
-import com.cloud.utils.component.Adapter;
-
import org.apache.cloudstack.ha.HAConfig;
+import org.apache.cloudstack.ha.HAResource;
import org.joda.time.DateTime;
-import org.apache.cloudstack.ha.HAResource;
+import com.cloud.utils.component.Adapter;
public interface HAProvider extends Adapter {
@@ -57,7 +56,9 @@ enum HAProviderConfig {
boolean fence(R r) throws HAFenceException;
- void setFenced(R r);
+ void fenceSubResources(R r);
+
+ void enableMaintenance(R r);
void sendAlert(R r, HAConfig.HAState nextState);
diff --git a/server/src/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java b/server/src/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java
index 43aa20015fae..966c2843e65a 100644
--- a/server/src/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java
+++ b/server/src/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java
@@ -71,7 +71,7 @@ public boolean isInMaintenanceMode(final Host host) {
}
@Override
- public void setFenced(final Host r) {
+ public void fenceSubResources(final Host r) {
if (r.getState() != Status.Down) {
try {
LOG.debug("Trying to disconnect the host without investigation and scheduling HA for the VMs on host id=" + r.getId());
@@ -80,11 +80,15 @@ public void setFenced(final Host r) {
} catch (Exception e) {
LOG.error("Failed to disconnect host and schedule HA restart of VMs after fencing the host: ", e);
}
- try {
- resourceManager.resourceStateTransitTo(r, ResourceState.Event.InternalEnterMaintenance, ManagementServerNode.getManagementServerId());
- } catch (NoTransitionException e) {
- LOG.error("Failed to put host in maintenance mode after host-ha fencing and scheduling VM-HA: ", e);
- }
+ }
+ }
+
+ /**
+ * Requests the InternalEnterMaintenance resource-state transition for the given host, passing
+ * the id returned by ManagementServerNode.getManagementServerId(). A NoTransitionException is
+ * logged and not rethrown.
+ *
+ * @param r host to move into maintenance
+ */
+ @Override
+ public void enableMaintenance(final Host r) {
+ try {
+ resourceManager.resourceStateTransitTo(r, ResourceState.Event.InternalEnterMaintenance, ManagementServerNode.getManagementServerId());
+ } catch (NoTransitionException e) {
+ LOG.error("Failed to put host in maintenance mode after host-ha fencing and scheduling VM-HA: ", e);
}
}
diff --git a/server/src/org/apache/cloudstack/ha/task/ActivityCheckTask.java b/server/src/org/apache/cloudstack/ha/task/ActivityCheckTask.java
index ab8af6124a7f..24f969632623 100644
--- a/server/src/org/apache/cloudstack/ha/task/ActivityCheckTask.java
+++ b/server/src/org/apache/cloudstack/ha/task/ActivityCheckTask.java
@@ -17,6 +17,10 @@
package org.apache.cloudstack.ha.task;
+import java.util.concurrent.ExecutorService;
+
+import javax.inject.Inject;
+
import org.apache.cloudstack.ha.HAConfig;
import org.apache.cloudstack.ha.HAManager;
import org.apache.cloudstack.ha.HAResource;
@@ -25,11 +29,7 @@
import org.apache.cloudstack.ha.provider.HAProvider;
import org.apache.cloudstack.ha.provider.HAProvider.HAProviderConfig;
import org.apache.log4j.Logger;
-
-import javax.inject.Inject;
-
import org.joda.time.DateTime;
-import java.util.concurrent.ExecutorService;
public class ActivityCheckTask extends BaseHATask {
@@ -38,22 +38,24 @@ public class ActivityCheckTask extends BaseHATask {
@Inject
private HAManager haManager;
- private final long disconnectTime;
+ private long disconnectTime;
+ private long maxActivityChecks;
+ private double activityCheckFailureRatio;
+ /**
+ * Creates an activity check task. MaxActivityChecks and ActivityCheckFailureRatio are read from
+ * the provider here, at construction time, and kept in fields for later use by processResult.
+ *
+ * @param resource resource to check
+ * @param haProvider provider that performs the check and supplies configuration values
+ * @param haConfig HA configuration of the resource
+ * @param haProviderConfig provider config key forwarded to the base task constructor
+ * @param executor executor forwarded to the base task constructor
+ * @param disconnectTime timestamp later wrapped in a DateTime and passed to hasActivity
+ * (presumably epoch milliseconds - TODO confirm against the caller)
+ */
public ActivityCheckTask(final HAResource resource, final HAProvider haProvider, final HAConfig haConfig, final HAProvider.HAProviderConfig haProviderConfig,
final ExecutorService executor, final long disconnectTime) {
super(resource, haProvider, haConfig, haProviderConfig, executor);
this.disconnectTime = disconnectTime;
+ this.maxActivityChecks = (Long)haProvider.getConfigValue(HAProviderConfig.MaxActivityChecks, resource);
+ this.activityCheckFailureRatio = (Double)haProvider.getConfigValue(HAProviderConfig.ActivityCheckFailureRatio, resource);
}
public boolean performAction() throws HACheckerException {
return getHaProvider().hasActivity(getResource(), new DateTime(disconnectTime));
}
- public void processResult(boolean result, Throwable t) {
+ public synchronized void processResult(boolean result, Throwable t) {
final HAConfig haConfig = getHaConfig();
- final HAProvider haProvider = getHaProvider();
- final HAResource resource = getResource();
final HAResourceCounter counter = haManager.getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
if (t != null && t instanceof HACheckerException) {
@@ -64,18 +66,17 @@ public void processResult(boolean result, Throwable t) {
counter.incrActivityCounter(!result);
- long maxActivityChecks = (Long)haProvider.getConfigValue(HAProviderConfig.MaxActivityChecks, resource);
if (counter.getActivityCheckCounter() < maxActivityChecks) {
haManager.transitionHAState(HAConfig.Event.TooFewActivityCheckSamples, haConfig);
return;
}
- double activityCheckFailureRatio = (Double)haProvider.getConfigValue(HAProviderConfig.ActivityCheckFailureRatio, resource);
if (counter.hasActivityThresholdExceeded(activityCheckFailureRatio)) {
haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureOverThresholdRatio, haConfig);
} else {
- haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio, haConfig);
- counter.markResourceDegraded();
+ if (haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio, haConfig)) {
+ counter.markResourceDegraded();
+ }
}
counter.resetActivityCounter();
}
diff --git a/server/src/org/apache/cloudstack/ha/task/BaseHATask.java b/server/src/org/apache/cloudstack/ha/task/BaseHATask.java
index 3ed873880260..9c878092a546 100644
--- a/server/src/org/apache/cloudstack/ha/task/BaseHATask.java
+++ b/server/src/org/apache/cloudstack/ha/task/BaseHATask.java
@@ -17,6 +17,13 @@
package org.apache.cloudstack.ha.task;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
import org.apache.cloudstack.ha.HAConfig;
import org.apache.cloudstack.ha.HAResource;
import org.apache.cloudstack.ha.provider.HACheckerException;
@@ -24,13 +31,7 @@
import org.apache.cloudstack.ha.provider.HAProvider;
import org.apache.cloudstack.ha.provider.HARecoveryException;
import org.apache.log4j.Logger;
-
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
+import org.joda.time.DateTime;
public abstract class BaseHATask implements Callable {
public static final Logger LOG = Logger.getLogger(BaseHATask.class);
@@ -40,6 +41,7 @@ public abstract class BaseHATask implements Callable {
private final HAConfig haConfig;
private final ExecutorService executor;
private Long timeout;
+ private DateTime created;
public BaseHATask(final HAResource resource, final HAProvider haProvider, final HAConfig haConfig, final HAProvider.HAProviderConfig haProviderConfig,
final ExecutorService executor) {
@@ -48,6 +50,7 @@ public BaseHATask(final HAResource resource, final HAProvider haProv
this.haConfig = haConfig;
this.executor = executor;
this.timeout = (Long)haProvider.getConfigValue(haProviderConfig, resource);
+ this.created = new DateTime();
}
public HAProvider getHaProvider() {
@@ -74,6 +77,9 @@ public boolean performAction() throws HACheckerException, HAFenceException, HARe
@Override
public Boolean call() {
+ if (new DateTime().minusHours(1).isAfter(getCreated())) {
+ return false;
+ }
final Future future = executor.submit(new Callable() {
@Override
public Boolean call() throws HACheckerException, HAFenceException, HARecoveryException {
@@ -99,4 +105,7 @@ public Boolean call() throws HACheckerException, HAFenceException, HARecoveryExc
return result;
}
+ /**
+ * Returns the time at which this task was constructed; call() compares it with the current
+ * time to skip tasks created more than one hour ago.
+ *
+ * @return creation time of this task
+ */
+ public DateTime getCreated() {
+ return created;
+ }
}
diff --git a/server/src/org/apache/cloudstack/ha/task/FenceTask.java b/server/src/org/apache/cloudstack/ha/task/FenceTask.java
index d9fd62c164c2..700d6b8eef35 100644
--- a/server/src/org/apache/cloudstack/ha/task/FenceTask.java
+++ b/server/src/org/apache/cloudstack/ha/task/FenceTask.java
@@ -48,7 +48,8 @@ public void processResult(boolean result, Throwable e) {
if (result) {
counter.resetRecoveryCounter();
haManager.transitionHAState(HAConfig.Event.Fenced, haConfig);
- getHaProvider().setFenced(getResource());
+ getHaProvider().fenceSubResources(getResource());
+ getHaProvider().enableMaintenance(getResource());
}
getHaProvider().sendAlert(getResource(), HAConfig.HAState.Fencing);
}
diff --git a/server/src/org/apache/cloudstack/ha/task/RecoveryTask.java b/server/src/org/apache/cloudstack/ha/task/RecoveryTask.java
index b4eb863fbfc6..446dd5339efb 100644
--- a/server/src/org/apache/cloudstack/ha/task/RecoveryTask.java
+++ b/server/src/org/apache/cloudstack/ha/task/RecoveryTask.java
@@ -17,16 +17,18 @@
package org.apache.cloudstack.ha.task;
+import java.util.concurrent.ExecutorService;
+
+import javax.inject.Inject;
+
import org.apache.cloudstack.ha.HAConfig;
import org.apache.cloudstack.ha.HAManager;
import org.apache.cloudstack.ha.HAResource;
+import org.apache.cloudstack.ha.HAResourceCounter;
import org.apache.cloudstack.ha.provider.HACheckerException;
import org.apache.cloudstack.ha.provider.HAProvider;
import org.apache.cloudstack.ha.provider.HARecoveryException;
-import javax.inject.Inject;
-import java.util.concurrent.ExecutorService;
-
public class RecoveryTask extends BaseHATask {
@Inject
@@ -43,8 +45,13 @@ public boolean performAction() throws HACheckerException, HARecoveryException {
+ /**
+ * Handles the outcome of a recovery attempt. The recovery counter is incremented and the
+ * activity counter reset for every processed result, successful or not. On success the
+ * Recovered event is fired and the provider is asked to fence sub-resources. An alert for the
+ * Recovering state is sent in all cases.
+ *
+ * @param result true when the recovery action succeeded
+ * @param e failure cause, if any (not used in this method)
+ */
public void processResult(boolean result, Throwable e) {
final HAConfig haConfig = getHaConfig();
+ final HAResourceCounter counter = haManager.getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
+ counter.incrRecoveryCounter();
+ counter.resetActivityCounter();
+
if (result) {
+ // NOTE(review): the result of transitionHAState is ignored, so fenceSubResources runs even
+ // if the transition to Recovered was rejected - confirm this is intended.
haManager.transitionHAState(HAConfig.Event.Recovered, haConfig);
+ getHaProvider().fenceSubResources(getResource());
}
getHaProvider().sendAlert(getResource(), HAConfig.HAState.Recovering);
}
diff --git a/server/src/org/apache/cloudstack/outofbandmanagement/OutOfBandManagementServiceImpl.java b/server/src/org/apache/cloudstack/outofbandmanagement/OutOfBandManagementServiceImpl.java
index fe58c64d8f22..7b09d297939c 100644
--- a/server/src/org/apache/cloudstack/outofbandmanagement/OutOfBandManagementServiceImpl.java
+++ b/server/src/org/apache/cloudstack/outofbandmanagement/OutOfBandManagementServiceImpl.java
@@ -267,7 +267,7 @@ private void checkOutOfBandManagementEnabledByZoneClusterHost(final Host host) {
}
+ /**
+ * Reports whether out-of-band management is enabled for the host at every level.
+ *
+ * @param host host to check; a null host yields false
+ * @return true only when the host is non-null and out-of-band management is enabled for its
+ * zone, its cluster and the host itself
+ */
public boolean isOutOfBandManagementEnabled(final Host host) {
- return isOutOfBandManagementEnabledForZone(host.getDataCenterId())
+ return host != null && isOutOfBandManagementEnabledForZone(host.getDataCenterId())
&& isOutOfBandManagementEnabledForCluster(host.getClusterId())
&& isOutOfBandManagementEnabledForHost(host.getId());
}