Commit ae26929

HBASE-28638 Fail-fast retry limit for specific errors to recover from remote procedure failure using server crash (#6564) (#6462)
Signed-off-by: Duo Zhang <[email protected]>
Parent: 8d90173

File tree

6 files changed: +380 additions, -8 deletions

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 16 additions & 1 deletion

@@ -159,6 +159,7 @@
 import org.apache.hadoop.hbase.master.procedure.ModifyTableProcedure;
 import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch;
 import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
+import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
 import org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure;
 import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
 import org.apache.hadoop.hbase.master.procedure.TruncateTableProcedure;
@@ -231,6 +232,7 @@
 import org.apache.hadoop.hbase.util.IdLock;
 import org.apache.hadoop.hbase.util.ModifyRegionUtils;
 import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.hbase.util.ReflectionUtils;
 import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.hadoop.hbase.util.RetryCounterFactory;
 import org.apache.hadoop.hbase.util.TableDescriptorChecker;
@@ -446,6 +448,15 @@ public class HMaster extends HRegionServer implements MasterServices {
   public static final String WARMUP_BEFORE_MOVE = "hbase.master.warmup.before.move";
   private static final boolean DEFAULT_WARMUP_BEFORE_MOVE = true;
 
+  /**
+   * Use RSProcedureDispatcher instance to initiate master to rs remote procedure execution. Use
+   * this config to extend RSProcedureDispatcher (mainly for testing purpose).
+   */
+  public static final String HBASE_MASTER_RSPROC_DISPATCHER_CLASS =
+    "hbase.master.rsproc.dispatcher.class";
+  private static final String DEFAULT_HBASE_MASTER_RSPROC_DISPATCHER_CLASS =
+    RSProcedureDispatcher.class.getName();
+
   private TaskGroup startupTaskGroup;
 
   /**
@@ -1736,7 +1747,11 @@ protected void stopServiceThreads() {
   }
 
   private void createProcedureExecutor() throws IOException {
-    MasterProcedureEnv procEnv = new MasterProcedureEnv(this);
+    final String procedureDispatcherClassName =
+      conf.get(HBASE_MASTER_RSPROC_DISPATCHER_CLASS, DEFAULT_HBASE_MASTER_RSPROC_DISPATCHER_CLASS);
+    final RSProcedureDispatcher procedureDispatcher = ReflectionUtils.instantiateWithCustomCtor(
+      procedureDispatcherClassName, new Class[] { MasterServices.class }, new Object[] { this });
+    final MasterProcedureEnv procEnv = new MasterProcedureEnv(this, procedureDispatcher);
     procedureStore = new RegionProcedureStore(this, masterRegion,
       new MasterProcedureEnv.FsUtilsLeaseRecovery(this));
     procedureStore.registerListener(new ProcedureStoreListener() {
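
Any class supplied through hbase.master.rsproc.dispatcher.class must extend RSProcedureDispatcher and keep the single-argument (MasterServices) constructor, because createProcedureExecutor() instantiates it reflectively with exactly that signature. A minimal sketch of such an extension (the package and class name are hypothetical, used only for illustration):

```java
package my.pkg; // hypothetical package, not part of the commit

import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;

// A do-nothing extension that simply delegates to the default dispatcher. The
// (MasterServices) constructor is mandatory: ReflectionUtils.instantiateWithCustomCtor
// in createProcedureExecutor() looks up exactly that constructor.
public class NoopDispatcherSketch extends RSProcedureDispatcher {
  public NoopDispatcherSketch(MasterServices master) {
    super(master);
  }
}
```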

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureEnv.java

Lines changed: 0 additions & 4 deletions

@@ -74,10 +74,6 @@ private boolean isRunning() {
   private final MasterProcedureScheduler procSched;
   private final MasterServices master;
 
-  public MasterProcedureEnv(final MasterServices master) {
-    this(master, new RSProcedureDispatcher(master));
-  }
-
   public MasterProcedureEnv(final MasterServices master,
     final RSProcedureDispatcher remoteDispatcher) {
     this.master = master;
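
With the convenience constructor removed, any caller that previously relied on it now has to supply the dispatcher explicitly. An illustrative sketch (not from the patch) mirroring what the deleted constructor used to do:

```java
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;

// Illustrative helper: the body of the removed single-argument constructor, now spelled
// out by the caller (HMaster builds the dispatcher reflectively instead, as shown above).
public class EnvFactorySketch {
  public static MasterProcedureEnv create(MasterServices master) {
    return new MasterProcedureEnv(master, new RSProcedureDispatcher(master));
  }
}
```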

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java

Lines changed: 70 additions & 3 deletions

@@ -27,14 +27,14 @@
 import org.apache.hadoop.hbase.DoNotRetryIOException;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.exceptions.ConnectionClosedException;
 import org.apache.hadoop.hbase.ipc.RpcConnectionConstants;
 import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
 import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.ServerListener;
 import org.apache.hadoop.hbase.master.ServerManager;
 import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
 import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher;
-import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException;
 import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.ipc.RemoteException;
@@ -249,6 +249,22 @@ protected class ExecuteProceduresRemoteCall implements RemoteProcedureResolver,
       "hbase.regionserver.rpc.retry.interval";
     private static final int DEFAULT_RS_RPC_RETRY_INTERVAL = 100;
 
+    /**
+     * Config to determine the retry limit while executing remote regionserver procedure. This retry
+     * limit applies to only specific errors. These errors could potentially get the remote
+     * procedure stuck for several minutes unless the retry limit is applied.
+     */
+    private static final String RS_REMOTE_PROC_FAIL_FAST_LIMIT =
+      "hbase.master.rs.remote.proc.fail.fast.limit";
+    /**
+     * The default retry limit. Waiting for more than {@value} attempts is not going to help much
+     * for genuine connectivity errors. Therefore, consider fail-fast after {@value} retries. Value
+     * = {@value}
+     */
+    private static final int DEFAULT_RS_REMOTE_PROC_RETRY_LIMIT = 5;
+
+    private final int failFastRetryLimit;
+
     private ExecuteProceduresRequest.Builder request = null;
 
     public ExecuteProceduresRemoteCall(final ServerName serverName,
@@ -257,6 +273,8 @@ public ExecuteProceduresRemoteCall(final ServerName serverName,
       this.remoteProcedures = remoteProcedures;
       this.rsRpcRetryInterval = master.getConfiguration().getLong(RS_RPC_RETRY_INTERVAL_CONF_KEY,
         DEFAULT_RS_RPC_RETRY_INTERVAL);
+      this.failFastRetryLimit = master.getConfiguration().getInt(RS_REMOTE_PROC_FAIL_FAST_LIMIT,
+        DEFAULT_RS_REMOTE_PROC_RETRY_LIMIT);
     }
 
     private AdminService.BlockingInterface getRsAdmin() throws IOException {
@@ -305,13 +323,28 @@ private boolean scheduleForRetry(IOException e) {
       if (numberOfAttemptsSoFar == 0 && unableToConnectToServer(e)) {
         return false;
       }
+
+      // Check if the num of attempts have crossed the retry limit, and if the error type can
+      // fail-fast.
+      if (numberOfAttemptsSoFar >= failFastRetryLimit - 1 && isErrorTypeFailFast(e)) {
+        LOG
+          .warn("Number of retries {} exceeded limit {} for the given error type. Scheduling server"
+            + " crash for {}", numberOfAttemptsSoFar + 1, failFastRetryLimit, serverName, e);
+        // Expiring the server will schedule SCP and also reject the regionserver report from the
+        // regionserver if regionserver is somehow able to send the regionserver report to master.
+        // The master rejects the report by throwing YouAreDeadException, which would eventually
+        // result in the regionserver abort.
+        // This will also remove "serverName" from the ServerManager's onlineServers map.
+        master.getServerManager().expireServer(serverName);
+        return false;
+      }
       // Always retry for other exception types if the region server is not dead yet.
       if (!master.getServerManager().isServerOnline(serverName)) {
         LOG.warn("Request to {} failed due to {}, try={} and the server is not online, give up",
           serverName, e.toString(), numberOfAttemptsSoFar);
         return false;
       }
-      if (e instanceof RegionServerAbortedException || e instanceof RegionServerStoppedException) {
+      if (e instanceof RegionServerStoppedException) {
         // A better way is to return true here to let the upper layer quit, and then schedule a
         // background task to check whether the region server is dead. And if it is dead, call
         // remoteCallFailed to tell the upper layer. Keep retrying here does not lead to incorrect
@@ -329,7 +362,8 @@ private boolean scheduleForRetry(IOException e) {
       // retry^2 on each try
       // up to max of 10 seconds (don't want to back off too much in case of situation change).
       submitTask(this,
-        Math.min(rsRpcRetryInterval * (this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar),
+        Math.min(
+          rsRpcRetryInterval * ((long) this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar),
          10 * 1000),
        TimeUnit.MILLISECONDS);
      return true;
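
The widened long cast above only changes overflow behaviour, not the schedule itself: with the default rsRpcRetryInterval of 100 ms the delays grow quadratically (100, 400, 900, 1600 ms, ...) until the 10 second cap takes over. A standalone sketch, not code from the patch, that just evaluates the same expression:

```java
// Standalone illustration of the retry backoff used in scheduleForRetry() above.
public class BackoffScheduleSketch {
  public static void main(String[] args) {
    long rsRpcRetryInterval = 100; // DEFAULT_RS_RPC_RETRY_INTERVAL, in milliseconds
    for (int attempt = 1; attempt <= 12; attempt++) {
      // Same expression as in the patch: quadratic backoff, capped at 10 seconds.
      long delayMs = Math.min(rsRpcRetryInterval * ((long) attempt * attempt), 10 * 1000);
      System.out.println("attempt " + attempt + " -> next retry in " + delayMs + " ms");
    }
  }
}
```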
@@ -376,6 +410,39 @@ private boolean isSaslError(IOException e) {
       }
     }
 
+    /**
+     * Returns true if the error or its cause is of type ConnectionClosedException.
+     * @param e IOException thrown by the underlying rpc framework.
+     * @return True if the error or its cause is of type ConnectionClosedException.
+     */
+    private boolean isConnectionClosedError(IOException e) {
+      if (e instanceof ConnectionClosedException) {
+        return true;
+      }
+      Throwable cause = e;
+      while (true) {
+        if (cause instanceof IOException) {
+          IOException unwrappedCause = unwrapException((IOException) cause);
+          if (unwrappedCause instanceof ConnectionClosedException) {
+            return true;
+          }
+        }
+        cause = cause.getCause();
+        if (cause == null) {
+          return false;
+        }
+      }
+    }
+
+    /**
+     * Returns true if the error type can allow fail-fast.
+     * @param e IOException thrown by the underlying rpc framework.
+     * @return True if the error type can allow fail-fast.
+     */
+    private boolean isErrorTypeFailFast(IOException e) {
+      return e instanceof CallQueueTooBigException || isSaslError(e) || isConnectionClosedError(e);
+    }
+
     private long getMaxWaitTime() {
       if (this.maxWaitTime < 0) {
         // This is the max attempts, not retries, so it should be at least 1.
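
The new limit only applies to the error types selected by isErrorTypeFailFast (CallQueueTooBigException, SASL errors, ConnectionClosedException and their wrapped causes); everything else keeps the pre-existing retry behaviour. A hedged sketch of tuning the limit before master start-up (the helper class is illustrative, not part of the commit):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

// Illustrative helper, not part of the commit: raising the fail-fast limit from its
// default of 5 attempts before the master is started.
public class FailFastTuningSketch {
  public static Configuration tunedConf() {
    Configuration conf = HBaseConfiguration.create();
    // Allow more attempts before expireServer() is invoked for fail-fast error types.
    conf.setInt("hbase.master.rs.remote.proc.fail.fast.limit", 10);
    // Setting Integer.MAX_VALUE instead effectively disables fail-fast, as the
    // TestAssignmentManagerBase change in the next file does.
    return conf;
  }
}
```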

hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java

Lines changed: 1 addition & 0 deletions

@@ -146,6 +146,7 @@ protected void setupConfiguration(Configuration conf) throws Exception {
     // make retry for TRSP more frequent
     conf.setLong(ProcedureUtil.PROCEDURE_RETRY_SLEEP_INTERVAL_MS, 10);
     conf.setLong(ProcedureUtil.PROCEDURE_RETRY_MAX_SLEEP_TIME_MS, 100);
+    conf.setInt("hbase.master.rs.remote.proc.fail.fast.limit", Integer.MAX_VALUE);
   }
 
   @Before
hbase-server/src/test/java/org/apache/hadoop/hbase/util/RSProcDispatcher.java

Lines changed: 114 additions & 0 deletions

@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.util;
+
+import java.io.IOException;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.exceptions.ConnectionClosedException;
+import org.apache.hadoop.hbase.master.MasterServices;
+import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
+import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
+
+import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos;
+
+/**
+ * Test implementation of RSProcedureDispatcher that throws desired errors for testing purpose.
+ */
+public class RSProcDispatcher extends RSProcedureDispatcher {
+
+  private static final Logger LOG = LoggerFactory.getLogger(RSProcDispatcher.class);
+
+  private static final AtomicInteger i = new AtomicInteger();
+
+  public RSProcDispatcher(MasterServices master) {
+    super(master);
+  }
+
+  @Override
+  protected void remoteDispatch(final ServerName serverName,
+    final Set<RemoteProcedure> remoteProcedures) {
+    if (!master.getServerManager().isServerOnline(serverName)) {
+      // fail fast
+      submitTask(new DeadRSRemoteCall(serverName, remoteProcedures));
+    } else {
+      submitTask(new TestExecuteProceduresRemoteCall(serverName, remoteProcedures));
+    }
+  }
+
+  class TestExecuteProceduresRemoteCall extends ExecuteProceduresRemoteCall {
+
+    public TestExecuteProceduresRemoteCall(ServerName serverName,
+      Set<RemoteProcedure> remoteProcedures) {
+      super(serverName, remoteProcedures);
+    }
+
+    @Override
+    public AdminProtos.ExecuteProceduresResponse sendRequest(final ServerName serverName,
+      final AdminProtos.ExecuteProceduresRequest request) throws IOException {
+      int j = i.addAndGet(1);
+      LOG.info("sendRequest() req: {} , j: {}", request, j);
+      if (j == 12 || j == 22) {
+        // Execute the remote close and open region requests in the last (5th) retry before
+        // throwing ConnectionClosedException. This is to ensure even if the region open/close
+        // is successfully completed by regionserver, master still schedules SCP because
+        // sendRequest() throws error which has retry-limit exhausted.
+        try {
+          getRsAdmin().executeProcedures(null, request);
+        } catch (ServiceException e) {
+          throw new RuntimeException(e);
+        }
+      }
+      // For one of the close region requests and one of the open region requests,
+      // throw ConnectionClosedException until retry limit is exhausted and master
+      // schedules recoveries for the server.
+      // We will have ABNORMALLY_CLOSED regions, and they are expected to recover on their own.
+      if (j >= 10 && j <= 15 || j >= 18 && j <= 23) {
+        throw new ConnectionClosedException("test connection closed error...");
+      }
+      try {
+        return getRsAdmin().executeProcedures(null, request);
+      } catch (ServiceException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    private AdminProtos.AdminService.BlockingInterface getRsAdmin() throws IOException {
+      return master.getServerManager().getRsAdmin(getServerName());
+    }
+  }
+
+  private class DeadRSRemoteCall extends ExecuteProceduresRemoteCall {
+
+    public DeadRSRemoteCall(ServerName serverName, Set<RemoteProcedure> remoteProcedures) {
+      super(serverName, remoteProcedures);
+    }
+
+    @Override
+    public void run() {
+      remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(),
+        new RegionServerStoppedException("Server " + getServerName() + " is not online"));
+    }
+  }
+}
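
A dispatcher like the one above only takes effect when the master is pointed at it through the config key introduced in HMaster. A sketch of that wiring (how the commit's own test sets this up is not shown in this excerpt, so treat the surrounding harness as an assumption):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.util.RSProcDispatcher;

// Sketch: enabling the test dispatcher before the (mini) cluster master starts.
public class TestDispatcherWiringSketch {
  public static Configuration confWithTestDispatcher() {
    Configuration conf = HBaseConfiguration.create();
    conf.set(HMaster.HBASE_MASTER_RSPROC_DISPATCHER_CLASS, RSProcDispatcher.class.getName());
    return conf;
  }
}
```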
