@@ -324,8 +324,24 @@ public void regionServerReport(ServerName sn, ServerMetrics sl) throws YouAreDea
324324 // the ServerName to use. Here we presume a master has already done
325325 // that so we'll press on with whatever it gave us for ServerName.
326326 if (!checkAndRecordNewServer (sn , sl )) {
327- LOG .info ("RegionServerReport ignored, could not record the server: " + sn );
328- return ; // Not recorded, so no need to move on
327+ // Master already registered a server with the same (host + port) and higher startcode.
328+ // This can happen if a regionserver report comes late from the old server
329+ // (possible race condition); by that time the master has already processed the SCP for
330+ // that server and started accepting regionserver reports from the new server, i.e. the
331+ // server with the same (host + port) and higher startcode.
332+ // The exception thrown here is not meant to tell the region server it is dead, because if
333+ // there is a new server on the same host and port, the old server should ideally already
334+ // be dead.
335+ // The exception thrown here is to skip the later steps of the whole regionServerReport
336+ // request processing. Usually, after recording it in ServerManager, we will call the
337+ // related methods in AssignmentManager to record region states. If the region server
338+ // is already dead, we should not do these steps anymore, so here we throw an exception
339+ // to let the upper layer know that they should not continue processing anymore.
340+ final String errorMsg = "RegionServerReport received from " + sn
341+ + ", but another server with the same name and higher startcode is already registered,"
342+ + " ignoring" ;
343+ LOG .warn (errorMsg );
344+ throw new YouAreDeadException (errorMsg );
329345 }
330346 }
331347 updateLastFlushedSequenceIds (sn , sl );
0 commit comments