Skip to content

Commit 66f93a3

Browse files
authored
Revert "Fix OSX error and re-merge unhandled exceptions handling (ray-project#14138)" (ray-project#14180)
This reverts commit ee584e8.
1 parent 9299462 commit 66f93a3

File tree

10 files changed

+68
-144
lines changed

10 files changed

+68
-144
lines changed

BUILD.bazel

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -702,16 +702,6 @@ cc_test(
702702
],
703703
)
704704

705-
cc_test(
706-
name = "memory_store_test",
707-
srcs = ["src/ray/core_worker/test/memory_store_test.cc"],
708-
copts = COPTS,
709-
deps = [
710-
":core_worker_lib",
711-
"@com_google_googletest//:gtest_main",
712-
],
713-
)
714-
715705
cc_test(
716706
name = "direct_actor_transport_test",
717707
srcs = ["src/ray/core_worker/test/direct_actor_transport_test.cc"],

python/ray/_raylet.pyx

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -734,20 +734,6 @@ cdef void delete_spilled_objects_handler(
734734
job_id=None)
735735

736736

737-
cdef void unhandled_exception_handler(const CRayObject& error) nogil:
738-
with gil:
739-
worker = ray.worker.global_worker
740-
data = None
741-
metadata = None
742-
if error.HasData():
743-
data = Buffer.make(error.GetData())
744-
if error.HasMetadata():
745-
metadata = Buffer.make(error.GetMetadata()).to_pybytes()
746-
# TODO(ekl) why does passing a ObjectRef.nil() lead to shutdown errors?
747-
object_ids = [None]
748-
worker.raise_errors([(data, metadata)], object_ids)
749-
750-
751737
# This function introduces ~2-7us of overhead per call (i.e., it can be called
752738
# up to hundreds of thousands of times per second).
753739
cdef void get_py_stack(c_string* stack_out) nogil:
@@ -857,7 +843,6 @@ cdef class CoreWorker:
857843
options.spill_objects = spill_objects_handler
858844
options.restore_spilled_objects = restore_spilled_objects_handler
859845
options.delete_spilled_objects = delete_spilled_objects_handler
860-
options.unhandled_exception_handler = unhandled_exception_handler
861846
options.get_lang_stack = get_py_stack
862847
options.ref_counting_enabled = True
863848
options.is_local_mode = local_mode
@@ -1468,13 +1453,9 @@ cdef class CoreWorker:
14681453
object_ref.native())
14691454

14701455
def remove_object_ref_reference(self, ObjectRef object_ref):
1471-
cdef:
1472-
CObjectID c_object_id = object_ref.native()
1473-
# We need to release the gil since object destruction may call the
1474-
# unhandled exception handler.
1475-
with nogil:
1476-
CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference(
1477-
c_object_id)
1456+
# Note: faster to not release GIL for short-running op.
1457+
CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference(
1458+
object_ref.native())
14781459

14791460
def serialize_and_promote_object_ref(self, ObjectRef object_ref):
14801461
cdef:

python/ray/includes/libcoreworker.pxd

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
250250
(void(
251251
const c_vector[c_string]&,
252252
CWorkerType) nogil) delete_spilled_objects
253-
(void(const CRayObject&) nogil) unhandled_exception_handler
254253
(void(c_string *stack_out) nogil) get_lang_stack
255254
c_bool ref_counting_enabled
256255
c_bool is_local_mode

python/ray/tests/test_failure.py

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -20,52 +20,6 @@
2020
get_error_message, Semaphore)
2121

2222

23-
def test_unhandled_errors(ray_start_regular):
24-
@ray.remote
25-
def f():
26-
raise ValueError()
27-
28-
@ray.remote
29-
class Actor:
30-
def f(self):
31-
raise ValueError()
32-
33-
a = Actor.remote()
34-
num_exceptions = 0
35-
36-
def interceptor(e):
37-
nonlocal num_exceptions
38-
num_exceptions += 1
39-
40-
# Test we report unhandled exceptions.
41-
ray.worker._unhandled_error_handler = interceptor
42-
x1 = f.remote()
43-
x2 = a.f.remote()
44-
del x1
45-
del x2
46-
wait_for_condition(lambda: num_exceptions == 2)
47-
48-
# Test we don't report handled exceptions.
49-
x1 = f.remote()
50-
x2 = a.f.remote()
51-
with pytest.raises(ray.exceptions.RayError) as err: # noqa
52-
ray.get([x1, x2])
53-
del x1
54-
del x2
55-
time.sleep(1)
56-
assert num_exceptions == 2, num_exceptions
57-
58-
# Test suppression with env var works.
59-
try:
60-
os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"
61-
x1 = f.remote()
62-
del x1
63-
time.sleep(1)
64-
assert num_exceptions == 2, num_exceptions
65-
finally:
66-
del os.environ["RAY_IGNORE_UNHANDLED_ERRORS"]
67-
68-
6923
def test_failed_task(ray_start_regular, error_pubsub):
7024
@ray.remote
7125
def throw_exception_fct1():

python/ray/worker.py

Lines changed: 60 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import logging
1010
import os
1111
import redis
12+
from six.moves import queue
1213
import sys
1314
import threading
1415
import time
@@ -68,12 +69,6 @@
6869
logger = logging.getLogger(__name__)
6970

7071

71-
# Visible for testing.
72-
def _unhandled_error_handler(e: Exception):
73-
logger.error("Unhandled error (suppress with "
74-
"RAY_IGNORE_UNHANDLED_ERRORS=1): {}".format(e))
75-
76-
7772
class Worker:
7873
"""A class used to define the control flow of a worker process.
7974
@@ -282,14 +277,6 @@ def put_object(self, value, object_ref=None):
282277
self.core_worker.put_serialized_object(
283278
serialized_value, object_ref=object_ref))
284279

285-
def raise_errors(self, data_metadata_pairs, object_refs):
286-
context = self.get_serialization_context()
287-
out = context.deserialize_objects(data_metadata_pairs, object_refs)
288-
if "RAY_IGNORE_UNHANDLED_ERRORS" in os.environ:
289-
return
290-
for e in out:
291-
_unhandled_error_handler(e)
292-
293280
def deserialize_objects(self, data_metadata_pairs, object_refs):
294281
context = self.get_serialization_context()
295282
return context.deserialize_objects(data_metadata_pairs, object_refs)
@@ -878,6 +865,13 @@ def custom_excepthook(type, value, tb):
878865

879866
sys.excepthook = custom_excepthook
880867

868+
# The last time we raised a TaskError in this process. We use this value to
869+
# suppress redundant error messages pushed from the workers.
870+
last_task_error_raise_time = 0
871+
872+
# The max amount of seconds to wait before printing out an uncaught error.
873+
UNCAUGHT_ERROR_GRACE_PERIOD = 5
874+
881875

882876
def print_logs(redis_client, threads_stopped, job_id):
883877
"""Prints log messages from workers on all of the nodes.
@@ -1028,14 +1022,51 @@ def color_for(data: Dict[str, str]) -> str:
10281022
file=print_file)
10291023

10301024

1031-
def listen_error_messages_raylet(worker, threads_stopped):
1025+
def print_error_messages_raylet(task_error_queue, threads_stopped):
1026+
"""Prints message received in the given output queue.
1027+
1028+
This checks periodically if any un-raised errors occurred in the
1029+
background.
1030+
1031+
Args:
1032+
task_error_queue (queue.Queue): A queue used to receive errors from the
1033+
thread that listens to Redis.
1034+
threads_stopped (threading.Event): A threading event used to signal to
1035+
the thread that it should exit.
1036+
"""
1037+
1038+
while True:
1039+
# Exit if we received a signal that we should stop.
1040+
if threads_stopped.is_set():
1041+
return
1042+
1043+
try:
1044+
error, t = task_error_queue.get(block=False)
1045+
except queue.Empty:
1046+
threads_stopped.wait(timeout=0.01)
1047+
continue
1048+
# Delay errors a little bit of time to attempt to suppress redundant
1049+
# messages originating from the worker.
1050+
while t + UNCAUGHT_ERROR_GRACE_PERIOD > time.time():
1051+
threads_stopped.wait(timeout=1)
1052+
if threads_stopped.is_set():
1053+
break
1054+
if t < last_task_error_raise_time + UNCAUGHT_ERROR_GRACE_PERIOD:
1055+
logger.debug(f"Suppressing error from worker: {error}")
1056+
else:
1057+
logger.error(f"Possible unhandled error from worker: {error}")
1058+
1059+
1060+
def listen_error_messages_raylet(worker, task_error_queue, threads_stopped):
10321061
"""Listen to error messages in the background on the driver.
10331062
10341063
This runs in a separate thread on the driver and pushes (error, time)
10351064
tuples to the output queue.
10361065
10371066
Args:
10381067
worker: The worker class that this thread belongs to.
1068+
task_error_queue (queue.Queue): A queue used to communicate with the
1069+
thread that prints the errors found by this thread.
10391070
threads_stopped (threading.Event): A threading event used to signal to
10401071
the thread that it should exit.
10411072
"""
@@ -1074,9 +1105,8 @@ def listen_error_messages_raylet(worker, threads_stopped):
10741105

10751106
error_message = error_data.error_message
10761107
if (error_data.type == ray_constants.TASK_PUSH_ERROR):
1077-
# TODO(ekl) remove task push errors entirely now that we have
1078-
# the separate unhandled exception handler.
1079-
pass
1108+
# Delay it a bit to see if we can suppress it
1109+
task_error_queue.put((error_message, time.time()))
10801110
else:
10811111
logger.warning(error_message)
10821112
except (OSError, redis.exceptions.ConnectionError) as e:
@@ -1239,12 +1269,19 @@ def connect(node,
12391269
# temporarily using this implementation which constantly queries the
12401270
# scheduler for new error messages.
12411271
if mode == SCRIPT_MODE:
1272+
q = queue.Queue()
12421273
worker.listener_thread = threading.Thread(
12431274
target=listen_error_messages_raylet,
12441275
name="ray_listen_error_messages",
1245-
args=(worker, worker.threads_stopped))
1276+
args=(worker, q, worker.threads_stopped))
1277+
worker.printer_thread = threading.Thread(
1278+
target=print_error_messages_raylet,
1279+
name="ray_print_error_messages",
1280+
args=(q, worker.threads_stopped))
12461281
worker.listener_thread.daemon = True
12471282
worker.listener_thread.start()
1283+
worker.printer_thread.daemon = True
1284+
worker.printer_thread.start()
12481285
if log_to_driver:
12491286
global_worker_stdstream_dispatcher.add_handler(
12501287
"ray_print_logs", print_to_stdstream)
@@ -1297,6 +1334,8 @@ def disconnect(exiting_interpreter=False):
12971334
worker.import_thread.join_import_thread()
12981335
if hasattr(worker, "listener_thread"):
12991336
worker.listener_thread.join()
1337+
if hasattr(worker, "printer_thread"):
1338+
worker.printer_thread.join()
13001339
if hasattr(worker, "logger_thread"):
13011340
worker.logger_thread.join()
13021341
worker.threads_stopped.clear()
@@ -1408,11 +1447,13 @@ def get(object_refs, *, timeout=None):
14081447
raise ValueError("'object_refs' must either be an object ref "
14091448
"or a list of object refs.")
14101449

1450+
global last_task_error_raise_time
14111451
# TODO(ujvl): Consider how to allow user to retrieve the ready objects.
14121452
values, debugger_breakpoint = worker.get_objects(
14131453
object_refs, timeout=timeout)
14141454
for i, value in enumerate(values):
14151455
if isinstance(value, RayError):
1456+
last_task_error_raise_time = time.time()
14161457
if isinstance(value, ray.exceptions.ObjectLostError):
14171458
worker.core_worker.dump_object_store_memory_usage()
14181459
if isinstance(value, RayTaskError):

src/ray/common/ray_object.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -92,20 +92,12 @@ class RayObject {
9292
/// large to return directly as part of a gRPC response).
9393
bool IsInPlasmaError() const;
9494

95-
/// Mark this object as accessed before.
96-
void SetAccessed() { accessed_ = true; };
97-
98-
/// Check if this object was accessed before.
99-
bool WasAccessed() const { return accessed_; }
100-
10195
private:
10296
std::shared_ptr<Buffer> data_;
10397
std::shared_ptr<Buffer> metadata_;
10498
const std::vector<ObjectID> nested_ids_;
10599
/// Whether this class holds a data copy.
106100
bool has_data_copy_;
107-
/// Whether this object was accessed.
108-
bool accessed_ = false;
109101
};
110102

111103
} // namespace ray

src/ray/core_worker/core_worker.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_
422422
return Status::OK();
423423
},
424424
options_.ref_counting_enabled ? reference_counter_ : nullptr, local_raylet_client_,
425-
options_.check_signals, options_.unhandled_exception_handler));
425+
options_.check_signals));
426426

427427
auto check_node_alive_fn = [this](const NodeID &node_id) {
428428
auto node = gcs_client_->Nodes().Get(node_id);

src/ray/core_worker/core_worker.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ struct CoreWorkerOptions {
8282
spill_objects(nullptr),
8383
restore_spilled_objects(nullptr),
8484
delete_spilled_objects(nullptr),
85-
unhandled_exception_handler(nullptr),
8685
get_lang_stack(nullptr),
8786
kill_main(nullptr),
8887
ref_counting_enabled(false),
@@ -147,8 +146,6 @@ struct CoreWorkerOptions {
147146
/// Application-language callback to delete objects from external storage.
148147
std::function<void(const std::vector<std::string> &, rpc::WorkerType)>
149148
delete_spilled_objects;
150-
/// Function to call on error objects never retrieved.
151-
std::function<void(const RayObject &error)> unhandled_exception_handler;
152149
/// Language worker callback to get the current call stack.
153150
std::function<void(std::string *)> get_lang_stack;
154151
// Function that tries to interrupt the currently running Python thread.

0 commit comments

Comments
 (0)