 import org.apache.hadoop.hbase.DoNotRetryIOException;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.exceptions.ConnectionClosedException;
 import org.apache.hadoop.hbase.ipc.RpcConnectionConstants;
 import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
 import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.ServerListener;
 import org.apache.hadoop.hbase.master.ServerManager;
 import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
 import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher;
-import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException;
 import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.ipc.RemoteException;
@@ -249,6 +249,22 @@ protected class ExecuteProceduresRemoteCall implements RemoteProcedureResolver,
       "hbase.regionserver.rpc.retry.interval";
     private static final int DEFAULT_RS_RPC_RETRY_INTERVAL = 100;

+    /**
+     * Config to determine the retry limit while executing remote regionserver procedures. The
+     * limit applies only to specific errors that could otherwise leave the remote procedure
+     * stuck for several minutes.
+     */
+    private static final String RS_REMOTE_PROC_FAIL_FAST_LIMIT =
+      "hbase.master.rs.remote.proc.fail.fast.limit";
+    /**
+     * The default retry limit. Waiting for more than {@value} attempts is unlikely to help with
+     * genuine connectivity errors, so fail fast after {@value} retries.
+     */
+    private static final int DEFAULT_RS_REMOTE_PROC_RETRY_LIMIT = 5;
+
+    private final int failFastRetryLimit;
+
     private ExecuteProceduresRequest.Builder request = null;

     public ExecuteProceduresRemoteCall(final ServerName serverName,
@@ -257,6 +273,8 @@ public ExecuteProceduresRemoteCall(final ServerName serverName,
       this.remoteProcedures = remoteProcedures;
       this.rsRpcRetryInterval = master.getConfiguration().getLong(RS_RPC_RETRY_INTERVAL_CONF_KEY,
         DEFAULT_RS_RPC_RETRY_INTERVAL);
+      this.failFastRetryLimit = master.getConfiguration().getInt(RS_REMOTE_PROC_FAIL_FAST_LIMIT,
+        DEFAULT_RS_REMOTE_PROC_RETRY_LIMIT);
     }

     private AdminService.BlockingInterface getRsAdmin() throws IOException {
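For context, both retry knobs read here come from the master configuration. A minimal sketch of how an operator might tune them programmatically (the key names come from this patch; the class name and chosen values are illustrative):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

public class FailFastLimitExample {
  public static void main(String[] args) {
    Configuration conf = HBaseConfiguration.create();
    // Base interval (ms) for the quadratic backoff between remote procedure retries.
    conf.setLong("hbase.regionserver.rpc.retry.interval", 100L);
    // Fail fast after 3 attempts for the eligible error types instead of the default 5.
    conf.setInt("hbase.master.rs.remote.proc.fail.fast.limit", 3);
    System.out.println("fail-fast limit: "
      + conf.getInt("hbase.master.rs.remote.proc.fail.fast.limit", 5));
  }
}
```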
@@ -305,13 +323,28 @@ private boolean scheduleForRetry(IOException e) {
       if (numberOfAttemptsSoFar == 0 && unableToConnectToServer(e)) {
         return false;
       }
+
+      // Check whether the number of attempts has crossed the retry limit and whether the error
+      // type is eligible for fail-fast.
+      if (numberOfAttemptsSoFar >= failFastRetryLimit - 1 && isErrorTypeFailFast(e)) {
+        LOG
+          .warn("Number of retries {} exceeded limit {} for the given error type. Scheduling server"
+            + " crash for {}", numberOfAttemptsSoFar + 1, failFastRetryLimit, serverName, e);
+        // Expiring the server schedules an SCP and also rejects any regionserver report that the
+        // regionserver somehow still manages to send to the master. The master rejects the report
+        // by throwing YouAreDeadException, which eventually makes the regionserver abort.
+        // This also removes "serverName" from the ServerManager's onlineServers map.
+        master.getServerManager().expireServer(serverName);
+        return false;
+      }
       // Always retry for other exception types if the region server is not dead yet.
       if (!master.getServerManager().isServerOnline(serverName)) {
         LOG.warn("Request to {} failed due to {}, try={} and the server is not online, give up",
           serverName, e.toString(), numberOfAttemptsSoFar);
         return false;
       }
-      if (e instanceof RegionServerAbortedException || e instanceof RegionServerStoppedException) {
+      if (e instanceof RegionServerStoppedException) {
         // A better way is to return true here to let the upper layer quit, and then schedule a
         // background task to check whether the region server is dead. And if it is dead, call
         // remoteCallFailed to tell the upper layer. Keep retrying here does not lead to incorrect
@@ -329,7 +362,8 @@ private boolean scheduleForRetry(IOException e) {
       // retry^2 on each try
       // up to max of 10 seconds (don't want to back off too much in case of situation change).
       submitTask(this,
-        Math.min(rsRpcRetryInterval * (this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar),
+        Math.min(
+          rsRpcRetryInterval * ((long) this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar),
           10 * 1000),
         TimeUnit.MILLISECONDS);
       return true;
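With the default 100 ms interval, this schedule backs off quadratically: 100, 400, 900, 1600, 2500 ms, and so on, capped at 10 seconds; the added `(long)` cast keeps the squared attempt count from overflowing `int`. A standalone sketch of the same computation (names are illustrative, not from the patch):

```java
public class BackoffExample {
  // Quadratic backoff capped at 10 seconds, mirroring the patched submitTask() delay. The
  // (long) cast prevents int overflow of attemptsSoFar * attemptsSoFar for large counts.
  static long backoffMillis(long retryIntervalMs, int attemptsSoFar) {
    return Math.min(retryIntervalMs * ((long) attemptsSoFar * attemptsSoFar), 10L * 1000);
  }

  public static void main(String[] args) {
    for (int attempts : new int[] { 1, 2, 3, 5, 50 }) {
      // With the default 100 ms interval: 100, 400, 900, 2500, then the 10000 ms cap.
      System.out.println(attempts + " attempts -> " + backoffMillis(100L, attempts) + " ms");
    }
  }
}
```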
@@ -376,6 +410,39 @@ private boolean isSaslError(IOException e) {
       }
     }

+    /**
+     * Returns true if the error or any of its causes is a ConnectionClosedException.
+     * @param e IOException thrown by the underlying rpc framework.
+     * @return True if the error or any of its causes is a ConnectionClosedException.
+     */
+    private boolean isConnectionClosedError(IOException e) {
+      if (e instanceof ConnectionClosedException) {
+        return true;
+      }
+      Throwable cause = e;
+      while (true) {
+        if (cause instanceof IOException) {
+          IOException unwrappedCause = unwrapException((IOException) cause);
+          if (unwrappedCause instanceof ConnectionClosedException) {
+            return true;
+          }
+        }
+        cause = cause.getCause();
+        if (cause == null) {
+          return false;
+        }
+      }
+    }
+
+    /**
+     * Returns true if the error type allows fail-fast.
+     * @param e IOException thrown by the underlying rpc framework.
+     * @return True if the error type allows fail-fast.
+     */
+    private boolean isErrorTypeFailFast(IOException e) {
+      return e instanceof CallQueueTooBigException || isSaslError(e) || isConnectionClosedError(e);
+    }
+
     private long getMaxWaitTime() {
       if (this.maxWaitTime < 0) {
         // This is the max attempts, not retries, so it should be at least 1.
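To make the new classification concrete: `isErrorTypeFailFast` treats `CallQueueTooBigException`, SASL errors, and connection-closed errors as non-recoverable within the retry budget, and `isConnectionClosedError` walks the full cause chain because the rpc layer often wraps the root failure. A simplified, hypothetical sketch of that walk (the real code additionally unwraps `RemoteException` via `unwrapException`, which a plain `instanceof` check stands in for here):

```java
import java.io.IOException;
import org.apache.hadoop.hbase.exceptions.ConnectionClosedException;

public class CauseChainExample {
  // Simplified stand-in for the patched isConnectionClosedError: reports whether a
  // ConnectionClosedException appears anywhere in the cause chain.
  static boolean isConnectionClosedError(IOException e) {
    for (Throwable cause = e; cause != null; cause = cause.getCause()) {
      if (cause instanceof ConnectionClosedException) {
        return true;
      }
    }
    return false;
  }

  public static void main(String[] args) {
    IOException wrapped =
      new IOException("rpc failed", new ConnectionClosedException("Connection closed"));
    System.out.println(isConnectionClosedError(wrapped)); // true
    System.out.println(isConnectionClosedError(new IOException("call timeout"))); // false
  }
}
```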