|
23 | 23 | import org.apache.flink.runtime.checkpoint.channel.InputChannelInfo; |
24 | 24 | import org.apache.flink.runtime.checkpoint.channel.ResultSubpartitionInfo; |
25 | 25 | import org.apache.flink.runtime.concurrent.ManuallyTriggeredScheduledExecutor; |
| 26 | +import org.apache.flink.runtime.executiongraph.ExecutionAttemptID; |
26 | 27 | import org.apache.flink.runtime.executiongraph.ExecutionGraph; |
27 | 28 | import org.apache.flink.runtime.executiongraph.ExecutionVertex; |
28 | 29 | import org.apache.flink.runtime.jobgraph.JobVertexID; |
29 | 30 | import org.apache.flink.runtime.jobgraph.OperatorID; |
30 | 31 | import org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint; |
| 32 | +import org.apache.flink.runtime.persistence.PossibleInconsistentStateException; |
31 | 33 | import org.apache.flink.runtime.state.InputChannelStateHandle; |
32 | 34 | import org.apache.flink.runtime.state.KeyedStateHandle; |
33 | 35 | import org.apache.flink.runtime.state.OperatorStateHandle; |
34 | 36 | import org.apache.flink.runtime.state.OperatorStreamStateHandle; |
35 | 37 | import org.apache.flink.runtime.state.ResultSubpartitionStateHandle; |
36 | 38 | import org.apache.flink.runtime.state.StreamStateHandle; |
| 39 | +import org.apache.flink.util.FlinkRuntimeException; |
37 | 40 | import org.apache.flink.util.TestLogger; |
38 | 41 | import org.apache.flink.util.function.TriConsumerWithException; |
39 | 42 |
|
| 43 | +import org.junit.Rule; |
40 | 44 | import org.junit.Test; |
| 45 | +import org.junit.rules.TemporaryFolder; |
41 | 46 |
|
42 | 47 | import java.util.Collections; |
43 | 48 | import java.util.List; |
| 49 | +import java.util.concurrent.Executor; |
| 50 | +import java.util.concurrent.atomic.AtomicInteger; |
44 | 51 |
|
| 52 | +import static org.hamcrest.CoreMatchers.is; |
45 | 53 | import static org.junit.Assert.assertEquals; |
46 | 54 | import static org.junit.Assert.assertFalse; |
| 55 | +import static org.junit.Assert.assertThat; |
47 | 56 | import static org.junit.Assert.assertTrue; |
48 | 57 | import static org.junit.Assert.fail; |
49 | 58 | import static org.mockito.Mockito.mock; |
|
54 | 63 | /** Tests for failure of checkpoint coordinator. */ |
55 | 64 | public class CheckpointCoordinatorFailureTest extends TestLogger { |
56 | 65 |
|
| 66 | + @Rule public TemporaryFolder tmpFolder = new TemporaryFolder(); |
| 67 | + |
57 | 68 | /** |
58 | 69 | * Tests that a failure while storing a completed checkpoint in the completed checkpoint store |
59 | 70 | * will properly fail the originating pending checkpoint and clean up the completed checkpoint. |
@@ -166,6 +177,81 @@ public void testFailingCompletedCheckpointStoreAdd() throws Exception { |
166 | 177 | .discardState(); |
167 | 178 | } |
168 | 179 |
|
| 180 | + @Test |
| 181 | + public void testCleanupForGenericFailure() throws Exception { |
| 182 | + testStoringFailureHandling(new FlinkRuntimeException("Expected exception"), 1); |
| 183 | + } |
| 184 | + |
| 185 | + @Test |
| 186 | + public void testCleanupOmissionForPossibleInconsistentStateException() throws Exception { |
| 187 | + testStoringFailureHandling(new PossibleInconsistentStateException(), 0); |
| 188 | + } |
| 189 | + |
| 190 | + private void testStoringFailureHandling(Exception failure, int expectedCleanupCalls) |
| 191 | + throws Exception { |
| 192 | + final JobVertexID jobVertexID1 = new JobVertexID(); |
| 193 | + |
| 194 | + final ExecutionGraph graph = |
| 195 | + new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder() |
| 196 | + .addJobVertex(jobVertexID1) |
| 197 | + .build(); |
| 198 | + |
| 199 | + final ExecutionVertex vertex = graph.getJobVertex(jobVertexID1).getTaskVertices()[0]; |
| 200 | + final ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId(); |
| 201 | + |
| 202 | + final StandaloneCheckpointIDCounter checkpointIDCounter = |
| 203 | + new StandaloneCheckpointIDCounter(); |
| 204 | + |
| 205 | + final ManuallyTriggeredScheduledExecutor manuallyTriggeredScheduledExecutor = |
| 206 | + new ManuallyTriggeredScheduledExecutor(); |
| 207 | + |
| 208 | + final CompletedCheckpointStore completedCheckpointStore = |
| 209 | + new FailingCompletedCheckpointStore( |
| 210 | + (checkpoint, ignoredCleaner, ignoredPostCleanCallback) -> { |
| 211 | + throw failure; |
| 212 | + }); |
| 213 | + |
| 214 | + final AtomicInteger cleanupCallCount = new AtomicInteger(0); |
| 215 | + final CheckpointCoordinator checkpointCoordinator = |
| 216 | + new CheckpointCoordinatorBuilder() |
| 217 | + .setExecutionGraph(graph) |
| 218 | + .setCheckpointIDCounter(checkpointIDCounter) |
| 219 | + .setCheckpointsCleaner( |
| 220 | + new CheckpointsCleaner() { |
| 221 | + |
| 222 | + private static final long serialVersionUID = |
| 223 | + 2029876992397573325L; |
| 224 | + |
| 225 | + @Override |
| 226 | + public void cleanCheckpointOnFailedStoring( |
| 227 | + CompletedCheckpoint completedCheckpoint, |
| 228 | + Executor executor) { |
| 229 | + cleanupCallCount.incrementAndGet(); |
| 230 | + super.cleanCheckpointOnFailedStoring( |
| 231 | + completedCheckpoint, executor); |
| 232 | + } |
| 233 | + }) |
| 234 | + .setCompletedCheckpointStore(completedCheckpointStore) |
| 235 | + .setTimer(manuallyTriggeredScheduledExecutor) |
| 236 | + .build(); |
| 237 | + checkpointCoordinator.triggerSavepoint(tmpFolder.newFolder().getAbsolutePath()); |
| 238 | + manuallyTriggeredScheduledExecutor.triggerAll(); |
| 239 | + |
| 240 | + try { |
| 241 | + checkpointCoordinator.receiveAcknowledgeMessage( |
| 242 | + new AcknowledgeCheckpoint( |
| 243 | + graph.getJobID(), attemptId, checkpointIDCounter.getLast()), |
| 244 | + "unknown location"); |
| 245 | + fail("CheckpointException should have been thrown."); |
| 246 | + } catch (CheckpointException e) { |
| 247 | + assertThat( |
| 248 | + e.getCheckpointFailureReason(), |
| 249 | + is(CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE)); |
| 250 | + } |
| 251 | + |
| 252 | + assertThat(cleanupCallCount.get(), is(expectedCleanupCalls)); |
| 253 | + } |
| 254 | + |
169 | 255 | private static final class FailingCompletedCheckpointStore implements CompletedCheckpointStore { |
170 | 256 |
|
171 | 257 | private final TriConsumerWithException< |
|
0 commit comments