@@ -890,7 +890,15 @@ void NodeManager::ResourceCreateUpdated(const NodeID &node_id,
890890 const ResourceSet &createUpdatedResources) {
891891 RAY_LOG (DEBUG) << " [ResourceCreateUpdated] received callback from node id " << node_id
892892 << " with created or updated resources: "
893- << createUpdatedResources.ToString () << " . Updating resource map." ;
893+ << createUpdatedResources.ToString () << " . Updating resource map."
894+ << " skip=" << (node_id == self_node_id_);
895+
896+ // Skip updating local node since local node always has the latest information.
897+ // Updating local node could result in a inconsistence view in cluster resource
898+ // scheduler which could make task hang.
899+ if (node_id == self_node_id_) {
900+ return ;
901+ }
894902
895903 // Update local_available_resources_ and SchedulingResources
896904 for (const auto &resource_pair : createUpdatedResources.GetResourceMap ()) {
@@ -900,11 +908,7 @@ void NodeManager::ResourceCreateUpdated(const NodeID &node_id,
900908 new_resource_capacity);
901909 }
902910 RAY_LOG (DEBUG) << " [ResourceCreateUpdated] Updated cluster_resource_map." ;
903-
904- if (node_id == self_node_id_) {
905- // The resource update is on the local node, check if we can reschedule tasks.
906- cluster_task_manager_->ScheduleAndDispatchTasks ();
907- }
911+ cluster_task_manager_->ScheduleAndDispatchTasks ();
908912}
909913
910914void NodeManager::ResourceDeleted (const NodeID &node_id,
@@ -916,7 +920,14 @@ void NodeManager::ResourceDeleted(const NodeID &node_id,
916920 }
917921 RAY_LOG (DEBUG) << " [ResourceDeleted] received callback from node id " << node_id
918922 << " with deleted resources: " << oss.str ()
919- << " . Updating resource map." ;
923+ << " . Updating resource map. skip=" << (node_id == self_node_id_);
924+ }
925+
926+ // Skip updating local node since local node always has the latest information.
927+ // Updating local node could result in a inconsistence view in cluster resource
928+ // scheduler which could make task hang.
929+ if (node_id == self_node_id_) {
930+ return ;
920931 }
921932
922933 // Update local_available_resources_ and SchedulingResources
@@ -1474,39 +1485,44 @@ void NodeManager::HandleUpdateResourceUsage(
14741485 rpc::SendReplyCallback send_reply_callback) {
14751486 rpc::ResourceUsageBroadcastData resource_usage_batch;
14761487 resource_usage_batch.ParseFromString (request.serialized_resource_usage_batch ());
1477-
1478- if (resource_usage_batch.seq_no () != next_resource_seq_no_) {
1488+ // When next_resource_seq_no_ == 0 it means it just started.
1489+ // TODO: Fetch a snapshot from gcs for lightweight resource broadcasting
1490+ if (next_resource_seq_no_ != 0 &&
1491+ resource_usage_batch.seq_no () != next_resource_seq_no_) {
1492+ // TODO (Alex): Ideally we would be really robust, and potentially eagerly
1493+ // pull a full resource "snapshot" from gcs to make sure our state doesn't
1494+ // diverge from GCS.
14791495 RAY_LOG (WARNING)
14801496 << " Raylet may have missed a resource broadcast. This either means that GCS has "
14811497 " restarted, the network is heavily congested and is dropping, reordering, or "
14821498 " duplicating packets. Expected seq#: "
14831499 << next_resource_seq_no_ << " , but got: " << resource_usage_batch.seq_no () << " ." ;
1484- // TODO (Alex): Ideally we would be really robust, and potentially eagerly
1485- // pull a full resource "snapshot" from gcs to make sure our state doesn't
1486- // diverge from GCS.
1500+ if (resource_usage_batch.seq_no () < next_resource_seq_no_) {
1501+ RAY_LOG (WARNING) << " Discard the the resource update since local version is newer" ;
1502+ return ;
1503+ }
14871504 }
14881505 next_resource_seq_no_ = resource_usage_batch.seq_no () + 1 ;
14891506
14901507 for (const auto &resource_change_or_data : resource_usage_batch.batch ()) {
14911508 if (resource_change_or_data.has_data ()) {
14921509 const auto &resource_usage = resource_change_or_data.data ();
1493- const NodeID & node_id = NodeID::FromBinary (resource_usage.node_id ());
1494- if (node_id == self_node_id_) {
1495- // Skip messages from self.
1496- continue ;
1510+ auto node_id = NodeID::FromBinary (resource_usage.node_id ());
1511+ // Skip messages from self.
1512+ if (node_id != self_node_id_) {
1513+ UpdateResourceUsage (node_id, resource_usage) ;
14971514 }
1498- UpdateResourceUsage (node_id, resource_usage);
14991515 } else if (resource_change_or_data.has_change ()) {
15001516 const auto &resource_notification = resource_change_or_data.change ();
1501- auto id = NodeID::FromBinary (resource_notification.node_id ());
1517+ auto node_id = NodeID::FromBinary (resource_notification.node_id ());
15021518 if (resource_notification.updated_resources_size () != 0 ) {
15031519 ResourceSet resource_set (
15041520 MapFromProtobuf (resource_notification.updated_resources ()));
1505- ResourceCreateUpdated (id , resource_set);
1521+ ResourceCreateUpdated (node_id , resource_set);
15061522 }
15071523
15081524 if (resource_notification.deleted_resources_size () != 0 ) {
1509- ResourceDeleted (id ,
1525+ ResourceDeleted (node_id ,
15101526 VectorFromProtobuf (resource_notification.deleted_resources ()));
15111527 }
15121528 }
0 commit comments