-
Notifications
You must be signed in to change notification settings - Fork 67
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: Fix failure expiry race condition
Motivation A bug was encountered whereby model load failure records would not be cleaned up after expiration while there were continuous (failing) requests for that model, meaning no reload attempt occurred and those requests failed indefinitely. Modifications - Add unit test to reproduce the problem, which required - Making expiry time configurable - Modifying the behaviour of the dummy model so that triggered loading failures only occur on the first attempt - Add some log statements - Fix the main bug: When a failed cache entry without any record in the registry is encountered during a load attempt, remove it rather than treating as a load failure - Fix some other logic related to failure expiry - Use a shorter expiry time (2/3 of configured time) for recently-used models Result Failures expire as intended in all situations Signed-off-by: Nick Hill <nickhill@us.ibm.com>
- Loading branch information
Showing
5 changed files
with
223 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
140 changes: 140 additions & 0 deletions
140
src/test/java/com/ibm/watson/modelmesh/ModelMeshFailureExpiryTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
/* | ||
* Copyright 2022 IBM Corporation | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not | ||
* use this file except in compliance with the License. You may obtain a copy | ||
* of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
* License for the specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package com.ibm.watson.modelmesh; | ||
|
||
import com.ibm.watson.modelmesh.api.ModelInfo; | ||
import com.ibm.watson.modelmesh.api.ModelMeshGrpc; | ||
import com.ibm.watson.modelmesh.api.ModelMeshGrpc.ModelMeshBlockingStub; | ||
import com.ibm.watson.modelmesh.api.ModelStatusInfo; | ||
import com.ibm.watson.modelmesh.api.ModelStatusInfo.ModelStatus; | ||
import com.ibm.watson.modelmesh.api.RegisterModelRequest; | ||
import com.ibm.watson.modelmesh.api.UnregisterModelRequest; | ||
import com.ibm.watson.modelmesh.example.api.ExamplePredictorGrpc; | ||
import com.ibm.watson.modelmesh.example.api.ExamplePredictorGrpc.ExamplePredictorBlockingStub; | ||
import com.ibm.watson.modelmesh.example.api.Predictor.PredictRequest; | ||
import com.ibm.watson.modelmesh.example.api.Predictor.PredictResponse; | ||
import io.grpc.ManagedChannel; | ||
import io.grpc.StatusRuntimeException; | ||
import io.grpc.netty.NettyChannelBuilder; | ||
import org.junit.jupiter.api.BeforeEach; | ||
import org.junit.jupiter.api.Test; | ||
|
||
import java.util.concurrent.ExecutorService; | ||
import java.util.concurrent.Executors; | ||
import java.util.concurrent.atomic.AtomicBoolean; | ||
|
||
import static com.ibm.watson.modelmesh.ModelMeshEnvVars.LOAD_FAILURE_EXPIRY_ENV_VAR; | ||
import static org.junit.jupiter.api.Assertions.*; | ||
|
||
/** | ||
* Test load failure expiry while model is receiving continuous stream of inference requests | ||
*/ | ||
public class ModelMeshFailureExpiryTest extends SingleInstanceModelMeshTest { | ||
|
||
@Override | ||
@BeforeEach | ||
public void initialize() throws Exception { | ||
System.setProperty("tas.janitor_freq_secs", "1"); // set janitor to run every second | ||
System.setProperty(LOAD_FAILURE_EXPIRY_ENV_VAR, "2000"); // expire failures in 2 seconds | ||
try { | ||
super.initialize(); | ||
} finally { | ||
System.clearProperty("tas.janitor_freq_secs"); | ||
System.clearProperty(LOAD_FAILURE_EXPIRY_ENV_VAR); | ||
} | ||
} | ||
|
||
@Test | ||
public void failureExpiryTest() throws Exception { | ||
ExecutorService es = Executors.newSingleThreadExecutor(); | ||
|
||
ManagedChannel channel = NettyChannelBuilder.forAddress("localhost", 8088) | ||
.usePlaintext().build(); | ||
String modelId = "myFailExpireModel"; | ||
ModelMeshBlockingStub manageModels = ModelMeshGrpc.newBlockingStub(channel); | ||
|
||
final AtomicBoolean stop = new AtomicBoolean(); | ||
try { | ||
ExamplePredictorBlockingStub useModels = ExamplePredictorGrpc.newBlockingStub(channel); | ||
|
||
String errorMessage = "load should fail with this error"; | ||
|
||
// add a model which should fail to load | ||
ModelStatusInfo statusInfo = manageModels.registerModel(RegisterModelRequest.newBuilder() | ||
.setModelId(modelId).setModelInfo(ModelInfo.newBuilder().setType("ExampleType") | ||
.setPath("FAIL_" + errorMessage).build()) // special prefix to trigger load in dummy runtime | ||
.setLoadNow(true).setSync(true).build()); | ||
|
||
System.out.println("registerModel returned: " + statusInfo); | ||
assertEquals(ModelStatus.LOADING_FAILED, statusInfo.getStatus()); | ||
|
||
|
||
final PredictRequest req = PredictRequest.newBuilder().setText("predict me!").build(); | ||
|
||
es.execute(() -> { | ||
while (!stop.get()) try { | ||
forModel(useModels, modelId).predict(req); | ||
} catch (StatusRuntimeException sre) { | ||
// ignore | ||
} finally { | ||
try { | ||
Thread.sleep(400L); | ||
} catch (InterruptedException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
}); | ||
|
||
// call predict on the model | ||
try { | ||
forModel(useModels, modelId).predict(req); | ||
fail("failed model should fail predict"); | ||
} catch (StatusRuntimeException sre) { | ||
assertTrue(sre.getStatus().getDescription().endsWith(errorMessage)); | ||
} | ||
|
||
Thread.sleep(1000); | ||
|
||
// not expired yet | ||
try { | ||
forModel(useModels, modelId).predict(req); | ||
fail("failed model should fail predict"); | ||
} catch (StatusRuntimeException sre) { | ||
assertTrue(sre.getStatus().getDescription().endsWith(errorMessage)); | ||
} | ||
|
||
Thread.sleep(3000); | ||
|
||
// failure should now be expired and it should load successfully second time | ||
// per logic in dummy Model class | ||
|
||
PredictResponse response = forModel(useModels, modelId).predict(req); | ||
System.out.println("predict returned: " + response.getResultsList()); | ||
assertEquals(1.0, response.getResults(0).getConfidence(), 0); | ||
assertEquals("classification for predict me! by model " + modelId, | ||
response.getResults(0).getCategory()); | ||
|
||
} finally { | ||
stop.set(true); | ||
manageModels.unregisterModel(UnregisterModelRequest.newBuilder() | ||
.setModelId(modelId).build()); | ||
channel.shutdown(); | ||
es.shutdown(); | ||
} | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters