-
Notifications
You must be signed in to change notification settings - Fork 28.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-17443][SPARK-11035] Stop Spark Application if launcher goes down and use reflection #15009
Changes from 4 commits
8986aa9
99b1d1b
70a67fb
bac5cd2
38a7e3a
67f8de7
57defa9
eaa1bca
a3250ac
58c6bac
0a57684
1fe498b
25c3258
14050f5
92e4445
64a21b3
1e6311a
c207b30
ad20ccc
6a7ba5b
3cb8e85
3091105
2fdcec9
64a0e45
4357107
2707d21
cc2c0be
82df055
99d8c29
41cf6da
b098ecd
b66243d
bc99435
517fed0
a3d18b4
c17f15f
026d026
fe5b5d6
677edf7
ee3f24a
30b460c
7323200
0cfd4a7
8609874
ab50dd8
14c6365
04e56fc
7ee465f
3f060b6
f1b49d8
2996fb1
a311721
d906072
81fd297
10513ec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,38 +17,71 @@ | |
|
||
package org.apache.spark.launcher | ||
|
||
import java.io.IOException | ||
import java.net.{InetAddress, Socket} | ||
|
||
import org.apache.spark.SPARK_VERSION | ||
import org.apache.spark.internal.Logging | ||
import org.apache.spark.launcher.LauncherProtocol._ | ||
import org.apache.spark.util.{ThreadUtils, Utils} | ||
import org.apache.spark.util.{ShutdownHookManager, ThreadUtils, Utils} | ||
|
||
/** | ||
* A class that can be used to talk to a launcher server. Users should extend this class to | ||
* provide implementation for the abstract methods. | ||
* | ||
* See `LauncherServer` for an explanation of how launcher communication works. | ||
*/ | ||
private[spark] abstract class LauncherBackend { | ||
private[spark] abstract class LauncherBackend extends Logging { | ||
|
||
private var clientThread: Thread = _ | ||
private var connection: BackendConnection = _ | ||
private var lastState: SparkAppHandle.State = _ | ||
private var stopFlag: Boolean = false | ||
@volatile private var _isConnected = false | ||
|
||
def connect(): Unit = { | ||
val port = sys.env.get(LauncherProtocol.ENV_LAUNCHER_PORT).map(_.toInt) | ||
val secret = sys.env.get(LauncherProtocol.ENV_LAUNCHER_SECRET) | ||
val stopFlag = sys.env.get(LauncherProtocol.ENV_LAUNCHER_STOP_FLAG).map(_.toBoolean) | ||
if (port != None && secret != None) { | ||
val s = new Socket(InetAddress.getLoopbackAddress(), port.get) | ||
connection = new BackendConnection(s) | ||
connection.send(new Hello(secret.get, SPARK_VERSION)) | ||
clientThread = LauncherBackend.threadFactory.newThread(connection) | ||
clientThread.start() | ||
_isConnected = true | ||
if (stopFlag != None) { | ||
connect(port.get, secret.get, stopFlag.get) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can't we just set this.stopFlag here and have one connect method since that is all this one is doing? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ignore this I didn't see it being called from Client There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of the |
||
} else { | ||
connect(port.get, secret.get) | ||
} | ||
} | ||
} | ||
|
||
def connect(port: Int, secret: String): Unit = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This method (but not the code!) seems redundant. If you had just |
||
val s = new Socket(InetAddress.getLoopbackAddress(), port) | ||
connection = new BackendConnection(s) | ||
connection.send(new Hello(secret, SPARK_VERSION)) | ||
clientThread = LauncherBackend.threadFactory.newThread(connection) | ||
clientThread.start() | ||
_isConnected = true | ||
if (stopFlag) { | ||
logDebug("Adding shutdown hook") // force eager creation of logger | ||
var _shutdownHookRef = ShutdownHookManager.addShutdownHook( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're not using |
||
ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () => | ||
logInfo("Invoking onStopRequest() from shutdown hook") | ||
try { | ||
if (_isConnected && stopFlag) { | ||
onStopRequest() | ||
} | ||
} | ||
catch { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: move to previous line |
||
case anotherIOE: IOException => | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why |
||
logError("Error while running LauncherBackend shutdownHook...", anotherIOE) | ||
} | ||
} | ||
} | ||
} | ||
|
||
def connect(port: Int, secret: String, stopFlag: Boolean): Unit = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You haven't addressed my previous feedback here: #15009 (comment) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This addressed now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You haven't addressed it, one way or another. You don't need all these |
||
this.stopFlag = stopFlag | ||
connect(port, secret) | ||
} | ||
|
||
def close(): Unit = { | ||
if (connection != null) { | ||
try { | ||
|
@@ -71,6 +104,9 @@ private[spark] abstract class LauncherBackend { | |
if (connection != null && lastState != state) { | ||
connection.send(new SetState(state)) | ||
lastState = state | ||
if (!_isConnected && stopFlag) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we still need this if its being done in close() now? |
||
fireStopRequest() | ||
} | ||
} | ||
} | ||
|
||
|
@@ -110,12 +146,14 @@ private[spark] abstract class LauncherBackend { | |
override def close(): Unit = { | ||
try { | ||
super.close() | ||
if (!_isConnected && stopFlag) { | ||
fireStopRequest() | ||
} | ||
} finally { | ||
onDisconnected() | ||
_isConnected = false | ||
} | ||
} | ||
|
||
} | ||
|
||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.launcher; | ||
|
||
import java.io.IOException; | ||
import java.lang.reflect.Method; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.logging.Level; | ||
import java.util.logging.Logger; | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: too many blank lines |
||
public abstract class AbstractSparkAppHandle implements SparkAppHandle { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These classes should not be public; only |
||
private static final Logger LOG = Logger.getLogger(AbstractSparkAppHandle.class.getName()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nevermind, forgot this is java.util.logging not slf4j... |
||
protected final String secret; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: please add a blank line between static and non-static fields. |
||
protected final LauncherServer server; | ||
protected boolean disposed; | ||
protected List<Listener> listeners; | ||
protected State state; | ||
private LauncherConnection connection; | ||
private String appId; | ||
OutputRedirector redirector; | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove extra blank newlines |
||
public AbstractSparkAppHandle(LauncherServer server, String secret) { | ||
this.server = server; | ||
this.secret = secret; | ||
this.state = State.UNKNOWN; | ||
} | ||
|
||
@Override | ||
public synchronized void addListener(Listener l) { | ||
if (listeners == null) { | ||
listeners = new ArrayList<>(); | ||
} | ||
listeners.add(l); | ||
} | ||
|
||
@Override | ||
public State getState() { | ||
return state; | ||
} | ||
|
||
@Override | ||
public String getAppId() { | ||
return appId; | ||
} | ||
|
||
@Override | ||
public void stop() { | ||
CommandBuilderUtils.checkState(connection != null, "Application is still not connected."); | ||
try { | ||
connection.send(new LauncherProtocol.Stop()); | ||
} catch (IOException ioe) { | ||
throw new RuntimeException(ioe); | ||
} | ||
} | ||
|
||
@Override | ||
public synchronized void disconnect() { | ||
if (!disposed) { | ||
disposed = true; | ||
if (connection != null) { | ||
try { | ||
connection.close(); | ||
} catch (IOException ioe) { | ||
// no-op. | ||
} | ||
} | ||
server.unregister(this); | ||
if (redirector != null) { | ||
redirector.stop(); | ||
} | ||
} | ||
} | ||
|
||
String getSecret() { | ||
return secret; | ||
} | ||
|
||
void setConnection(LauncherConnection connection) { | ||
this.connection = connection; | ||
} | ||
|
||
LauncherServer getServer() { | ||
return server; | ||
} | ||
|
||
LauncherConnection getConnection() { | ||
return connection; | ||
} | ||
|
||
void setState(State s) { | ||
if (!state.isFinal()) { | ||
state = s; | ||
fireEvent(false); | ||
} else { | ||
LOG.log(Level.WARNING, "Backend requested transition from final state {0} to {1}.", | ||
new Object[]{state, s}); | ||
} | ||
} | ||
|
||
void setAppId(String appId) { | ||
this.appId = appId; | ||
fireEvent(true); | ||
} | ||
|
||
private synchronized void fireEvent(boolean isInfoChanged) { | ||
if (listeners != null) { | ||
for (Listener l : listeners) { | ||
if (isInfoChanged) { | ||
l.infoChanged(this); | ||
} else { | ||
l.stateChanged(this); | ||
} | ||
} | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,71 +27,14 @@ | |
/** | ||
* Handle implementation for monitoring apps started as a child process. | ||
*/ | ||
class ChildProcAppHandle implements SparkAppHandle { | ||
class ChildProcAppHandle extends AbstractSparkAppHandle { | ||
|
||
private static final Logger LOG = Logger.getLogger(ChildProcAppHandle.class.getName()); | ||
|
||
private final String secret; | ||
private final LauncherServer server; | ||
|
||
private Process childProc; | ||
private boolean disposed; | ||
private LauncherConnection connection; | ||
private List<Listener> listeners; | ||
private State state; | ||
private String appId; | ||
private OutputRedirector redirector; | ||
|
||
ChildProcAppHandle(String secret, LauncherServer server) { | ||
this.secret = secret; | ||
this.server = server; | ||
this.state = State.UNKNOWN; | ||
} | ||
|
||
@Override | ||
public synchronized void addListener(Listener l) { | ||
if (listeners == null) { | ||
listeners = new ArrayList<>(); | ||
} | ||
listeners.add(l); | ||
} | ||
|
||
@Override | ||
public State getState() { | ||
return state; | ||
} | ||
|
||
@Override | ||
public String getAppId() { | ||
return appId; | ||
} | ||
|
||
@Override | ||
public void stop() { | ||
CommandBuilderUtils.checkState(connection != null, "Application is still not connected."); | ||
try { | ||
connection.send(new LauncherProtocol.Stop()); | ||
} catch (IOException ioe) { | ||
throw new RuntimeException(ioe); | ||
} | ||
} | ||
|
||
@Override | ||
public synchronized void disconnect() { | ||
if (!disposed) { | ||
disposed = true; | ||
if (connection != null) { | ||
try { | ||
connection.close(); | ||
} catch (IOException ioe) { | ||
// no-op. | ||
} | ||
} | ||
server.unregister(this); | ||
if (redirector != null) { | ||
redirector.stop(); | ||
} | ||
} | ||
super(server, secret); | ||
} | ||
|
||
@Override | ||
|
@@ -103,67 +46,22 @@ public synchronized void kill() { | |
try { | ||
childProc.exitValue(); | ||
} catch (IllegalThreadStateException e) { | ||
// Child is still alive. Try to use Java 8's "destroyForcibly()" if available, | ||
// fall back to the old API if it's not there. | ||
try { | ||
Method destroy = childProc.getClass().getMethod("destroyForcibly"); | ||
destroy.invoke(childProc); | ||
} catch (Exception inner) { | ||
childProc.destroy(); | ||
} catch (Exception inner) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will never happen. You're breaking the existing functionality. See my previous comment (https://github.com/apache/spark/pull/15009/files#r84788125). |
||
// no-op | ||
} | ||
} finally { | ||
childProc = null; | ||
} | ||
} | ||
} | ||
|
||
String getSecret() { | ||
return secret; | ||
} | ||
|
||
void setChildProc(Process childProc, String loggerName) { | ||
this.childProc = childProc; | ||
this.redirector = new OutputRedirector(childProc.getInputStream(), loggerName, | ||
SparkLauncher.REDIRECTOR_FACTORY); | ||
} | ||
|
||
void setConnection(LauncherConnection connection) { | ||
this.connection = connection; | ||
} | ||
|
||
LauncherServer getServer() { | ||
return server; | ||
} | ||
|
||
LauncherConnection getConnection() { | ||
return connection; | ||
} | ||
|
||
void setState(State s) { | ||
if (!state.isFinal()) { | ||
state = s; | ||
fireEvent(false); | ||
} else { | ||
LOG.log(Level.WARNING, "Backend requested transition from final state {0} to {1}.", | ||
new Object[] { state, s }); | ||
} | ||
} | ||
|
||
void setAppId(String appId) { | ||
this.appId = appId; | ||
fireEvent(true); | ||
} | ||
|
||
private synchronized void fireEvent(boolean isInfoChanged) { | ||
if (listeners != null) { | ||
for (Listener l : listeners) { | ||
if (isInfoChanged) { | ||
l.infoChanged(this); | ||
} else { | ||
l.stateChanged(this); | ||
} | ||
} | ||
} | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: type is not necessary. Also, not a big fan of the name of this variable, it should be more descriptive. Like
stopOnShutdown
.