Skip to content

Commit d6ce440

Browse files
shangm2NikhilCollooru
authored andcommitted
Revert "Use a safe eventLoop wrapper to avoid thread shutdown and fail the task correctly"
This reverts commit 48f2fb1.
1 parent 84efc90 commit d6ce440

File tree

3 files changed

+20
-165
lines changed

3 files changed

+20
-165
lines changed

presto-main/pom.xml

-5
Original file line numberDiff line numberDiff line change
@@ -516,11 +516,6 @@
516516
<artifactId>netty-transport</artifactId>
517517
</dependency>
518518

519-
<dependency>
520-
<groupId>io.netty</groupId>
521-
<artifactId>netty-common</artifactId>
522-
</dependency>
523-
524519
<dependency>
525520
<groupId>com.squareup.okhttp3</groupId>
526521
<artifactId>mockwebserver</artifactId>

presto-main/src/main/java/com/facebook/presto/server/remotetask/HttpRemoteTask.java

+16-111
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@
7676
import com.sun.management.ThreadMXBean;
7777
import io.airlift.units.DataSize;
7878
import io.airlift.units.Duration;
79+
import io.netty.channel.EventLoop;
7980
import it.unimi.dsi.fastutil.longs.LongArrayList;
8081
import org.joda.time.DateTime;
8182

@@ -142,7 +143,7 @@
142143
* This class now uses an event loop concurrency model to eliminate the need for explicit synchronization:
143144
* <ul>
144145
* <li>All mutable state access and modifications are performed on a single dedicated event loop thread</li>
145-
* <li>External threads submit operations to the event loop using {@code safeExecuteOnEventLoop()}</li>
146+
* <li>External threads submit operations to the event loop using {@code taskEventLoop.execute()}</li>
146147
* <li>The event loop serializes all operations, eliminating race conditions without using locks</li>
147148
* </ul>
148149
* <p>
@@ -230,9 +231,9 @@ public final class HttpRemoteTask
230231
private final boolean taskUpdateSizeTrackingEnabled;
231232
private final SchedulerStatsTracker schedulerStatsTracker;
232233

233-
private final HttpRemoteTaskFactory.SafeEventLoop taskEventLoop;
234+
private final EventLoop taskEventLoop;
234235

235-
public static HttpRemoteTask createHttpRemoteTask(
236+
public HttpRemoteTask(
236237
Session session,
237238
TaskId taskId,
238239
String nodeId,
@@ -268,84 +269,7 @@ public static HttpRemoteTask createHttpRemoteTask(
268269
HandleResolver handleResolver,
269270
ConnectorTypeSerdeManager connectorTypeSerdeManager,
270271
SchedulerStatsTracker schedulerStatsTracker,
271-
HttpRemoteTaskFactory.SafeEventLoop taskEventLoop)
272-
{
273-
HttpRemoteTask task = new HttpRemoteTask(session,
274-
taskId,
275-
nodeId,
276-
location,
277-
remoteLocation,
278-
planFragment,
279-
initialSplits,
280-
outputBuffers,
281-
httpClient,
282-
maxErrorDuration,
283-
taskStatusRefreshMaxWait,
284-
taskInfoRefreshMaxWait,
285-
taskInfoUpdateInterval,
286-
summarizeTaskInfo,
287-
taskStatusCodec,
288-
taskInfoCodec,
289-
taskInfoJsonCodec,
290-
taskUpdateRequestCodec,
291-
planFragmentCodec,
292-
metadataUpdatesCodec,
293-
nodeStatsTracker,
294-
stats,
295-
binaryTransportEnabled,
296-
thriftTransportEnabled,
297-
taskInfoThriftTransportEnabled,
298-
thriftProtocol,
299-
tableWriteInfo,
300-
maxTaskUpdateSizeInBytes,
301-
metadataManager,
302-
queryManager,
303-
taskUpdateRequestSize,
304-
taskUpdateSizeTrackingEnabled,
305-
handleResolver,
306-
connectorTypeSerdeManager,
307-
schedulerStatsTracker,
308-
taskEventLoop);
309-
task.initialize();
310-
return task;
311-
}
312-
313-
private HttpRemoteTask(Session session,
314-
TaskId taskId,
315-
String nodeId,
316-
URI location,
317-
URI remoteLocation,
318-
PlanFragment planFragment,
319-
Multimap<PlanNodeId, Split> initialSplits,
320-
OutputBuffers outputBuffers,
321-
HttpClient httpClient,
322-
Duration maxErrorDuration,
323-
Duration taskStatusRefreshMaxWait,
324-
Duration taskInfoRefreshMaxWait,
325-
Duration taskInfoUpdateInterval,
326-
boolean summarizeTaskInfo,
327-
Codec<TaskStatus> taskStatusCodec,
328-
Codec<TaskInfo> taskInfoCodec,
329-
Codec<TaskInfo> taskInfoJsonCodec,
330-
Codec<TaskUpdateRequest> taskUpdateRequestCodec,
331-
Codec<PlanFragment> planFragmentCodec,
332-
Codec<MetadataUpdates> metadataUpdatesCodec,
333-
NodeStatsTracker nodeStatsTracker,
334-
RemoteTaskStats stats,
335-
boolean binaryTransportEnabled,
336-
boolean thriftTransportEnabled,
337-
boolean taskInfoThriftTransportEnabled,
338-
Protocol thriftProtocol,
339-
TableWriteInfo tableWriteInfo,
340-
int maxTaskUpdateSizeInBytes,
341-
MetadataManager metadataManager,
342-
QueryManager queryManager,
343-
DecayCounter taskUpdateRequestSize,
344-
boolean taskUpdateSizeTrackingEnabled,
345-
HandleResolver handleResolver,
346-
ConnectorTypeSerdeManager connectorTypeSerdeManager,
347-
SchedulerStatsTracker schedulerStatsTracker,
348-
HttpRemoteTaskFactory.SafeEventLoop taskEventLoop)
272+
EventLoop taskEventLoop)
349273
{
350274
requireNonNull(session, "session is null");
351275
requireNonNull(taskId, "taskId is null");
@@ -468,11 +392,7 @@ private HttpRemoteTask(Session session,
468392
handleResolver,
469393
connectorTypeSerdeManager,
470394
thriftProtocol);
471-
}
472395

473-
// this is a separate method to ensure that the `this` reference is not leaked during construction
474-
private void initialize()
475-
{
476396
taskStatusFetcher.addStateChangeListener(newStatus -> {
477397
verify(taskEventLoop.inEventLoop());
478398

@@ -487,7 +407,7 @@ private void initialize()
487407
});
488408

489409
updateTaskStats();
490-
safeExecuteOnEventLoop(this::updateSplitQueueSpace);
410+
taskEventLoop.execute(this::updateSplitQueueSpace);
491411
}
492412

493413
public PlanFragment getPlanFragment()
@@ -528,7 +448,7 @@ public URI getRemoteTaskLocation()
528448
@Override
529449
public void start()
530450
{
531-
safeExecuteOnEventLoop(() -> {
451+
taskEventLoop.execute(() -> {
532452
// to start we just need to trigger an update
533453
started = true;
534454
scheduleUpdate();
@@ -548,7 +468,7 @@ public void addSplits(Multimap<PlanNodeId, Split> splitsBySource)
548468
return;
549469
}
550470

551-
safeExecuteOnEventLoop(() -> {
471+
taskEventLoop.execute(() -> {
552472
boolean updateNeeded = false;
553473
for (Entry<PlanNodeId, Collection<Split>> entry : splitsBySource.asMap().entrySet()) {
554474
PlanNodeId sourceId = entry.getKey();
@@ -585,7 +505,7 @@ public void addSplits(Multimap<PlanNodeId, Split> splitsBySource)
585505
@Override
586506
public void noMoreSplits(PlanNodeId sourceId)
587507
{
588-
safeExecuteOnEventLoop(() -> {
508+
taskEventLoop.execute(() -> {
589509
if (noMoreSplits.containsKey(sourceId)) {
590510
return;
591511
}
@@ -599,7 +519,7 @@ public void noMoreSplits(PlanNodeId sourceId)
599519
@Override
600520
public void noMoreSplits(PlanNodeId sourceId, Lifespan lifespan)
601521
{
602-
safeExecuteOnEventLoop(() -> {
522+
taskEventLoop.execute(() -> {
603523
if (pendingNoMoreSplitsForLifespan.put(sourceId, lifespan)) {
604524
needsUpdate = true;
605525
scheduleUpdate();
@@ -614,7 +534,7 @@ public void setOutputBuffers(OutputBuffers newOutputBuffers)
614534
return;
615535
}
616536

617-
safeExecuteOnEventLoop(() -> {
537+
taskEventLoop.execute(() -> {
618538
if (newOutputBuffers.getVersion() > outputBuffers.getVersion()) {
619539
outputBuffers = newOutputBuffers;
620540
needsUpdate = true;
@@ -779,7 +699,7 @@ public ListenableFuture<?> whenSplitQueueHasSpace(long weightThreshold)
779699
return immediateFuture(null);
780700
}
781701
SettableFuture<?> future = SettableFuture.create();
782-
safeExecuteOnEventLoop(() -> {
702+
taskEventLoop.execute(() -> {
783703
if (whenSplitQueueHasSpaceThreshold.isPresent()) {
784704
checkArgument(weightThreshold == whenSplitQueueHasSpaceThreshold.getAsLong(), "Multiple split queue space notification thresholds not supported");
785705
}
@@ -949,7 +869,7 @@ private void scheduleUpdate()
949869

950870
private void sendUpdate()
951871
{
952-
safeExecuteOnEventLoop(() -> {
872+
taskEventLoop.execute(() -> {
953873
TaskStatus taskStatus = getTaskStatus();
954874
// don't update if the task hasn't been started yet or if it is already finished
955875
if (!started || !needsUpdate || taskStatus.getState().isDone()) {
@@ -1072,7 +992,7 @@ private TaskSource getSource(PlanNodeId planNodeId)
1072992
@Override
1073993
public void cancel()
1074994
{
1075-
safeExecuteOnEventLoop(() -> {
995+
taskEventLoop.execute(() -> {
1076996
TaskStatus taskStatus = getTaskStatus();
1077997
if (taskStatus.getState().isDone()) {
1078998
return;
@@ -1092,7 +1012,7 @@ public void cancel()
10921012

10931013
private void cleanUpTask()
10941014
{
1095-
safeExecuteOnEventLoop(() -> {
1015+
taskEventLoop.execute(() -> {
10961016
checkState(getTaskStatus().getState().isDone(), "attempt to clean up a task that is not done yet");
10971017

10981018
// clear pending splits to free memory
@@ -1140,7 +1060,7 @@ public void abort()
11401060

11411061
private void abort(TaskStatus status)
11421062
{
1143-
safeExecuteOnEventLoop(() -> {
1063+
taskEventLoop.execute(() -> {
11441064
checkState(status.getState().isDone(), "cannot abort task with an incomplete status");
11451065

11461066
taskStatusFetcher.updateTaskStatus(status);
@@ -1402,19 +1322,4 @@ public void onFailure(Throwable throwable)
14021322
onFailureTaskInfo(throwable, this.action, this.request, this.cleanupBackoff);
14031323
}
14041324
}
1405-
1406-
/***
1407-
* Wrap the task execution on event loop to fail the entire task on any failure.
1408-
*/
1409-
private void safeExecuteOnEventLoop(Runnable r)
1410-
{
1411-
taskEventLoop.execute(() -> {
1412-
try {
1413-
r.run();
1414-
}
1415-
catch (Throwable t) {
1416-
failTask(t);
1417-
}
1418-
});
1419-
}
14201325
}

presto-main/src/main/java/com/facebook/presto/server/remotetask/HttpRemoteTaskFactory.java

+4-49
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
import com.facebook.airlift.json.Codec;
1818
import com.facebook.airlift.json.JsonCodec;
1919
import com.facebook.airlift.json.smile.SmileCodec;
20-
import com.facebook.airlift.log.Logger;
2120
import com.facebook.airlift.stats.DecayCounter;
2221
import com.facebook.airlift.stats.ExponentialDecay;
2322
import com.facebook.drift.codec.ThriftCodec;
@@ -50,25 +49,20 @@
5049
import com.google.common.collect.Multimap;
5150
import com.google.common.util.concurrent.ThreadFactoryBuilder;
5251
import io.airlift.units.Duration;
53-
import io.netty.channel.DefaultEventLoop;
5452
import io.netty.channel.DefaultEventLoopGroup;
55-
import io.netty.channel.EventLoop;
5653
import io.netty.channel.EventLoopGroup;
5754
import org.weakref.jmx.Managed;
5855

5956
import javax.annotation.PreDestroy;
6057
import javax.inject.Inject;
6158

62-
import java.util.concurrent.Executor;
63-
6459
import static com.facebook.presto.server.thrift.ThriftCodecWrapper.wrapThriftCodec;
6560
import static java.lang.Math.toIntExact;
6661
import static java.util.Objects.requireNonNull;
6762

6863
public class HttpRemoteTaskFactory
6964
implements RemoteTaskFactory
7065
{
71-
private static final Logger log = Logger.get(HttpRemoteTaskFactory.class);
7266
private final HttpClient httpClient;
7367
private final LocationFactory locationFactory;
7468
private final Codec<TaskStatus> taskStatusCodec;
@@ -95,7 +89,8 @@ public class HttpRemoteTaskFactory
9589
private final QueryManager queryManager;
9690
private final DecayCounter taskUpdateRequestSize;
9791
private final boolean taskUpdateSizeTrackingEnabled;
98-
private final EventLoopGroup eventLoopGroup;
92+
private final EventLoopGroup eventLoopGroup = new DefaultEventLoopGroup(Runtime.getRuntime().availableProcessors(),
93+
new ThreadFactoryBuilder().setNameFormat("task-event-loop-%s").setDaemon(true).build());
9994

10095
@Inject
10196
public HttpRemoteTaskFactory(
@@ -174,15 +169,6 @@ else if (binaryTransportEnabled) {
174169

175170
this.taskUpdateRequestSize = new DecayCounter(ExponentialDecay.oneMinute());
176171
this.taskUpdateSizeTrackingEnabled = taskConfig.isTaskUpdateSizeTrackingEnabled();
177-
this.eventLoopGroup = new DefaultEventLoopGroup(config.getRemoteTaskMaxCallbackThreads(),
178-
new ThreadFactoryBuilder().setNameFormat("task-event-loop-%s").setDaemon(true).build())
179-
{
180-
@Override
181-
protected EventLoop newChild(Executor executor, Object... args)
182-
{
183-
return new SafeEventLoop(this, executor);
184-
}
185-
};
186172
}
187173

188174
@Managed
@@ -210,7 +196,7 @@ public RemoteTask createRemoteTask(
210196
TableWriteInfo tableWriteInfo,
211197
SchedulerStatsTracker schedulerStatsTracker)
212198
{
213-
return HttpRemoteTask.createHttpRemoteTask(
199+
return new HttpRemoteTask(
214200
session,
215201
taskId,
216202
node.getNodeIdentifier(),
@@ -246,37 +232,6 @@ public RemoteTask createRemoteTask(
246232
handleResolver,
247233
connectorTypeSerdeManager,
248234
schedulerStatsTracker,
249-
(SafeEventLoop) eventLoopGroup.next());
250-
}
251-
252-
/***
253-
* One observation about event loop is if submitted task fails, it could kill the thread but the event loop group will not create a new one.
254-
* Here, we wrap it as safe event loop so that if any submitted job fails, we chose to log the error and fail the entire task.
255-
*/
256-
static class SafeEventLoop
257-
extends DefaultEventLoop
258-
{
259-
public SafeEventLoop(EventLoopGroup parent, Executor executor)
260-
{
261-
super(parent, executor);
262-
}
263-
264-
@Override
265-
protected void run()
266-
{
267-
do {
268-
Runnable task = takeTask();
269-
if (task != null) {
270-
try {
271-
runTask(task);
272-
}
273-
catch (Throwable t) {
274-
log.error("Error executing task on event loop", t);
275-
}
276-
updateLastExecutionTime();
277-
}
278-
}
279-
while (!this.confirmShutdown());
280-
}
235+
eventLoopGroup.next());
281236
}
282237
}

0 commit comments

Comments
 (0)