[SPARK-17675] [CORE] Expand Blacklist for TaskSets #15249
Changes from 62 commits
BlacklistTracker.scala (new file)
@@ -0,0 +1,114 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.scheduler

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config
import org.apache.spark.util.Utils

private[scheduler] object BlacklistTracker extends Logging {

  private val DEFAULT_TIMEOUT = "1h"
Review comment: Is this the default blacklist timeout for a node/executor (before it is re-enabled)? Of course, this was specific to our cluster/jobs :-) I would like to know whether the job/cluster characteristics were different for this value (or whether it comes from some other experiments/config).

Reply: (longer top-level comment responding to this)
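For context on the discussion above: the getBlacklistTimeout method below falls back to this 1-hour default when no timeout conf is set. A minimal sketch of overriding it, assuming the literal key string for BLACKLIST_TIMEOUT_CONF (hypothetical; the real key lives in org.apache.spark.internal.config and is not shown in this diff):

    import org.apache.spark.SparkConf

    // Hypothetical override of the default blacklist timeout; the key name
    // "spark.blacklist.timeout" is an assumption inferred from the constant.
    val conf = new SparkConf().set("spark.blacklist.timeout", "30m")
    // With no timeout conf set at all, getBlacklistTimeout returns
    // Utils.timeStringAsMs(DEFAULT_TIMEOUT), i.e. one hour in milliseconds.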

  /**
   * Returns true if the blacklist is enabled, based on checking the configuration in the following
   * order:
   * 1. Is it specifically enabled or disabled?
   * 2. Is it enabled via the legacy timeout conf?
   * 3. Default is off
   */
  def isBlacklistEnabled(conf: SparkConf): Boolean = {
    conf.get(config.BLACKLIST_ENABLED) match {
      case Some(enabled) =>
        enabled
      case None =>
        // if they've got a non-zero setting for the legacy conf, always enable the blacklist,
        // otherwise, use the default.
        val legacyKey = config.BLACKLIST_LEGACY_TIMEOUT_CONF.key
        conf.get(config.BLACKLIST_LEGACY_TIMEOUT_CONF).exists { legacyTimeout =>
          if (legacyTimeout == 0) {
            logWarning(s"Turning off blacklisting due to legacy configuration: $legacyKey == 0")
            false
          } else {
            logWarning(s"Turning on blacklisting due to legacy configuration: $legacyKey > 0")
            true
          }
        }
    }
  }
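A quick illustration of the precedence the scaladoc above describes (a hedged sketch: the literal key strings below are assumptions inferred from the BLACKLIST_ENABLED and BLACKLIST_LEGACY_TIMEOUT_CONF constants, not confirmed by this diff):

    import org.apache.spark.SparkConf

    val conf = new SparkConf()
    // Assumed legacy key; any non-zero value turns blacklisting on...
    conf.set("spark.scheduler.executorTaskBlacklistTime", "10000")
    assert(BlacklistTracker.isBlacklistEnabled(conf))
    // ...but an explicit setting (assumed key) always wins.
    conf.set("spark.blacklist.enabled", "false")
    assert(!BlacklistTracker.isBlacklistEnabled(conf))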

  def getBlacklistTimeout(conf: SparkConf): Long = {
    conf.get(config.BLACKLIST_TIMEOUT_CONF).getOrElse {
      conf.get(config.BLACKLIST_LEGACY_TIMEOUT_CONF).getOrElse {
        Utils.timeStringAsMs(DEFAULT_TIMEOUT)
      }
    }
  }

  /**
   * Verify that blacklist configurations are consistent; if not, throw an exception. Should only
   * be called if blacklisting is enabled.
   *
   * The configuration for the blacklist is expected to adhere to a few invariants. Default
   * values follow these rules of course, but users may unwittingly change one configuration
   * without making the corresponding adjustment elsewhere. This ensures we fail-fast when
   * there are such misconfigurations.
   */
  def validateBlacklistConfs(conf: SparkConf): Unit = {

    def mustBePos(k: String, v: String): Unit = {
      throw new IllegalArgumentException(s"$k was $v, but must be > 0.")
    }

    Seq(
      config.MAX_TASK_ATTEMPTS_PER_EXECUTOR,
      config.MAX_TASK_ATTEMPTS_PER_NODE,
      config.MAX_FAILURES_PER_EXEC_STAGE,
      config.MAX_FAILED_EXEC_PER_NODE_STAGE
    ).foreach { config =>
      val v = conf.get(config)
      if (v <= 0) {
        mustBePos(config.key, v.toString)
      }
    }

    val timeout = getBlacklistTimeout(conf)
    if (timeout <= 0) {
      // first, figure out where the timeout came from, to include the right conf in the message.
      conf.get(config.BLACKLIST_TIMEOUT_CONF) match {
Review comment: the duplicate code here and in getBlacklistTimeout makes me a little nervous. Do you think it's too verbose to have a method getBlacklistTimeoutConfigAndValue that returns (config_name, timeout), and then that method could be called here and by getBlacklistTimeout? If you think that seems like overkill, ignore this comment.
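For illustration, the helper suggested in the comment above might look roughly like this (a sketch of the proposed refactor, not code from this PR; only the method name comes from the comment):

    // Hypothetical helper: resolve the timeout once, together with the key of
    // the conf it came from, so validateBlacklistConfs and getBlacklistTimeout
    // share one lookup instead of duplicating the fallback chain.
    private def getBlacklistTimeoutConfigAndValue(conf: SparkConf): (String, Long) = {
      conf.get(config.BLACKLIST_TIMEOUT_CONF)
        .map(t => (config.BLACKLIST_TIMEOUT_CONF.key, t))
        .orElse(conf.get(config.BLACKLIST_LEGACY_TIMEOUT_CONF)
          .map(t => (config.BLACKLIST_LEGACY_TIMEOUT_CONF.key, t)))
        .getOrElse((config.BLACKLIST_TIMEOUT_CONF.key, Utils.timeStringAsMs(DEFAULT_TIMEOUT)))
    }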
        case Some(t) =>
          mustBePos(config.BLACKLIST_TIMEOUT_CONF.key, timeout.toString)
        case None =>
          mustBePos(config.BLACKLIST_LEGACY_TIMEOUT_CONF.key, timeout.toString)
      }
    }

    val maxTaskFailures = conf.get(config.MAX_TASK_FAILURES)
    val maxNodeAttempts = conf.get(config.MAX_TASK_ATTEMPTS_PER_NODE)
Review comment: should we also check that maxNodeAttempts is >= max exec attempts?

Reply: I think that would be OK, actually -- it is the same as turning executor blacklisting off. Gets back to the question of what we think a user might reasonably want. An alternative would be to add the check, and use

Reply: Ok cool, thanks for the explanation -- that makes sense and seems fine to leave as-is.
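For concreteness, the extra check being discussed might look like the sketch below (hypothetical, not part of this PR; it warns rather than failing, which is one of the alternatives the reply alludes to):

    // Hypothetical additional validation: if per-node attempts don't exceed
    // per-executor attempts, executor-level blacklisting within a task set
    // can never trigger before node-level blacklisting does.
    val maxExecAttempts = conf.get(config.MAX_TASK_ATTEMPTS_PER_EXECUTOR)
    if (maxNodeAttempts < maxExecAttempts) {
      logWarning(s"${config.MAX_TASK_ATTEMPTS_PER_NODE.key} < " +
        s"${config.MAX_TASK_ATTEMPTS_PER_EXECUTOR.key}; this effectively disables " +
        "executor blacklisting within a task set.")
    }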

    if (maxNodeAttempts >= maxTaskFailures) {
      throw new IllegalArgumentException(s"${config.MAX_TASK_ATTEMPTS_PER_NODE.key} " +
        s"( = ${maxNodeAttempts}) was >= ${config.MAX_TASK_FAILURES.key} " +
        s"( = ${maxTaskFailures} ). Though blacklisting is enabled, with this configuration, " +
        s"Spark will not be robust to one bad node. Decrease " +
        s"${config.MAX_TASK_ATTEMPTS_PER_NODE.key}, increase ${config.MAX_TASK_FAILURES.key}, " +
        s"or disable blacklisting with ${config.BLACKLIST_ENABLED.key}")
    }
  }
}
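To make the fail-fast invariant concrete, here is a hedged example of a configuration validateBlacklistConfs would reject (spark.task.maxFailures appears later in this diff; the node-attempts key is an assumption inferred from MAX_TASK_ATTEMPTS_PER_NODE):

    import org.apache.spark.SparkConf

    // Hypothetical misconfiguration: a single bad node can exhaust every
    // allowed attempt, so validation throws IllegalArgumentException.
    val conf = new SparkConf()
      .set("spark.task.maxFailures", "2")
      .set("spark.blacklist.task.maxTaskAttemptsPerNode", "2")  // assumed key
    // BlacklistTracker.validateBlacklistConfs(conf)  // would throw "... was >= ..."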
ExecutorFailuresInTaskSet.scala (new file)
@@ -0,0 +1,50 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.scheduler

import scala.collection.mutable.HashMap

/**
 * Small helper for tracking failed tasks for blacklisting purposes. Info on all failures on one
 * executor, within one task set.
 */
private[scheduler] class ExecutorFailuresInTaskSet(val node: String) {
  /**
   * Mapping from index of the tasks in the taskset, to the number of times it has failed on this
   * executor.
   */
  val taskToFailureCount = HashMap[Int, Int]()

  def updateWithFailure(taskIndex: Int): Unit = {
    val prevFailureCount = taskToFailureCount.getOrElse(taskIndex, 0)
    taskToFailureCount(taskIndex) = prevFailureCount + 1
  }

  def numUniqueTasksWithFailures: Int = taskToFailureCount.size

  /**
   * Return the number of times this executor has failed on the given task index.
   */
  def getNumTaskFailures(index: Int): Int = {
    taskToFailureCount.getOrElse(index, 0)
  }

  override def toString(): String = {
    s"numUniqueTasksWithFailures = $numUniqueTasksWithFailures; " +
      s"tasksToFailureCount = $taskToFailureCount"
  }
}
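A short usage sketch of this helper (hypothetical values, just to show the counting semantics):

    // Record two failures of task index 3 and one of task index 7 on an
    // executor that lives on host "host-1" (names are made up).
    val failures = new ExecutorFailuresInTaskSet(node = "host-1")
    failures.updateWithFailure(3)
    failures.updateWithFailure(3)
    failures.updateWithFailure(7)
    assert(failures.getNumTaskFailures(3) == 2)
    assert(failures.numUniqueTasksWithFailures == 2)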
TaskSchedulerImpl.scala
@@ -22,14 +22,14 @@ import java.util.{Timer, TimerTask}
 import java.util.concurrent.TimeUnit
 import java.util.concurrent.atomic.AtomicLong

-import scala.collection.mutable.ArrayBuffer
-import scala.collection.mutable.HashMap
-import scala.collection.mutable.HashSet
 import scala.collection.Set
+import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
 import scala.util.Random

 import org.apache.spark._
 import org.apache.spark.TaskState.TaskState
 import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config
 import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
 import org.apache.spark.scheduler.TaskLocality.TaskLocality
 import org.apache.spark.scheduler.local.LocalSchedulerBackend
@@ -57,7 +57,7 @@ private[spark] class TaskSchedulerImpl(
     isLocal: Boolean = false)
   extends TaskScheduler with Logging
 {
-  def this(sc: SparkContext) = this(sc, sc.conf.getInt("spark.task.maxFailures", 4))
+  def this(sc: SparkContext) = this(sc, sc.conf.get(config.MAX_TASK_FAILURES))

   val conf = sc.conf

@@ -100,7 +100,7 @@ private[spark] class TaskSchedulerImpl(

   // The set of executors we have on each host; this is used to compute hostsAlive, which
   // in turn is used to decide when we can attain data locality on a given host
-  protected val executorsByHost = new HashMap[String, HashSet[String]]
+  protected val hostToExecutors = new HashMap[String, HashSet[String]]

   protected val hostsByRack = new HashMap[String, HashSet[String]]

@@ -243,8 +243,8 @@ private[spark] class TaskSchedulerImpl(
       }
     }
     manager.parent.removeSchedulable(manager)
-    logInfo("Removed TaskSet %s, whose tasks have all completed, from pool %s"
-      .format(manager.taskSet.id, manager.parent.name))
+    logInfo(s"Removed TaskSet ${manager.taskSet.id}, whose tasks have all completed, from pool" +
+      s" ${manager.parent.name}")
Review comment: Just curious - are we preferring string interpolation to format?

Reply: yes, I think the general idea is while touching something, update to string interpolation; I've seen that in a lot of PRs. (In this particular case, I could leave it -- probably I had touched the log msg somewhere along the way and then backed out.)

Reply: I've seen this where people are using more string interpolation, but I never saw discussion on this or anything. Do you know if there is a specific reason for this? Performance difference, or someone just decided it should be standard?

Reply: I don't think there was ever an official discussion; it just seemed like there was a gradual switch till it became the norm. I think the motivation was readability, not performance.
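For reference, a minimal illustration of the two styles under discussion (hypothetical values; both produce the same string):

    val id = "1.0"
    val pool = "default"
    // format-style placeholders, as in the old line above
    val viaFormat = "Removed TaskSet %s, whose tasks have all completed, from pool %s"
      .format(id, pool)
    // Scala string interpolation, as in the new line -- generally easier to read
    val viaInterp = s"Removed TaskSet $id, whose tasks have all completed, from pool $pool"
    assert(viaFormat == viaInterp)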
   }

   private def resourceOfferSingleTaskSet(

@@ -291,11 +291,11 @@ private[spark] class TaskSchedulerImpl(
     // Also track if new executor is added
     var newExecAvail = false
     for (o <- offers) {
-      if (!executorsByHost.contains(o.host)) {
-        executorsByHost(o.host) = new HashSet[String]()
+      if (!hostToExecutors.contains(o.host)) {
+        hostToExecutors(o.host) = new HashSet[String]()
       }
       if (!executorIdToTaskCount.contains(o.executorId)) {
-        executorsByHost(o.host) += o.executorId
+        hostToExecutors(o.host) += o.executorId
         executorAdded(o.executorId, o.host)
         executorIdToHost(o.executorId) = o.host
         executorIdToTaskCount(o.executorId) = 0

@@ -334,7 +334,7 @@ private[spark] class TaskSchedulerImpl(
       } while (launchedTaskAtCurrentMaxLocality)
     }
     if (!launchedAnyTask) {
-      taskSet.abortIfCompletelyBlacklisted(executorIdToHost.keys)
+      taskSet.abortIfCompletelyBlacklisted(hostToExecutors)
     }
   }

@@ -542,10 +542,10 @@ private[spark] class TaskSchedulerImpl(
     executorIdToTaskCount -= executorId

     val host = executorIdToHost(executorId)
-    val execs = executorsByHost.getOrElse(host, new HashSet)
+    val execs = hostToExecutors.getOrElse(host, new HashSet)
     execs -= executorId
     if (execs.isEmpty) {
-      executorsByHost -= host
+      hostToExecutors -= host
       for (rack <- getRackForHost(host); hosts <- hostsByRack.get(rack)) {
         hosts -= host
         if (hosts.isEmpty) {

@@ -565,11 +565,11 @@ private[spark] class TaskSchedulerImpl(
   }

   def getExecutorsAliveOnHost(host: String): Option[Set[String]] = synchronized {
-    executorsByHost.get(host).map(_.toSet)
+    hostToExecutors.get(host).map(_.toSet)
   }

   def hasExecutorsAliveOnHost(host: String): Boolean = synchronized {
-    executorsByHost.contains(host)
+    hostToExecutors.contains(host)
   }

   def hasHostAliveOnRack(rack: String): Boolean = synchronized {

@@ -662,5 +662,4 @@ private[spark] object TaskSchedulerImpl {

     retval.toList
   }
-
 }
Review comment: super nit: uppercase comes before lowercase (so Logging should be before config).