From 2789b94f55662430e5d2e6d0b828440f151de478 Mon Sep 17 00:00:00 2001 From: Romain Date: Thu, 30 Oct 2025 13:59:05 +0100 Subject: [PATCH 1/6] ignore SIGPIPE in Scheduler.run --- lib_scheduler/scheduler.ml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib_scheduler/scheduler.ml b/lib_scheduler/scheduler.ml index 466aba5..b815fa4 100644 --- a/lib_scheduler/scheduler.ml +++ b/lib_scheduler/scheduler.ml @@ -1262,6 +1262,10 @@ let run ?worker_idle_timeout ?worker_kill_timeout ?(on_worker_kill_timeout = fun () -> ()) ?(on_empty_queue = fun () -> ()) ?(on_message = fun _ -> ()) ?(on_unexpected_worker_exit = fun _ -> ()) ~fork max_worker_count = + (* Make sure we get EPIPE and not SIGPIPE when writing. *) + let old_sigpipe_behavior = Sys.(signal sigpipe) Signal_ignore in + Fun.protect ~finally:(fun () -> Sys.(set_signal sigpipe) old_sigpipe_behavior) + @@ fun () -> (* Set up a pipe that will become readable when a worker terminates. *) let sigchld_pipe_exit, sigchld_pipe_entrance = Unix.pipe () in Fun.protect ~finally:(fun () -> -- GitLab From daa885d8815fd5dd21606b7f54545c096b881a94 Mon Sep 17 00:00:00 2001 From: Romain Date: Thu, 30 Oct 2025 14:00:47 +0100 Subject: [PATCH 2/6] backport changes in tezt.opam --- tezt.opam | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tezt.opam b/tezt.opam index e3ca09b..f20642e 100644 --- a/tezt.opam +++ b/tezt.opam @@ -5,6 +5,7 @@ homepage: "https://gitlab.com/nomadic-labs/tezt/" bug-reports: "https://gitlab.com/nomadic-labs/tezt/issues" dev-repo: "git+https://gitlab.com/nomadic-labs/tezt.git" license: "MIT" +x-maintenance-intent: ["(latest)"] depends: [ "dune" { >= "3.19.1" } "ocaml" { >= "4.13" } @@ -29,6 +30,6 @@ conflicts: [ ] build: [ ["dune" "build" "-p" name "-j" jobs] - ["dune" "runtest" "-p" name "-j" jobs] {with-test & arch = "x86_64" & os = "linux"} + ["dune" "runtest" "-p" name "-j" jobs] {with-test & arch = "x86_64" & os = "linux" & os-distribution = "debian"} ] synopsis: "Test framework for 
unit tests, integration tests, and regression tests" -- GitLab From e685a04effb042ad53628aa87e89d8cdb4de150c Mon Sep 17 00:00:00 2001 From: Romain Date: Thu, 30 Oct 2025 14:25:00 +0100 Subject: [PATCH 3/6] fix the call to close the pipe The previous version closed the wrong descriptor, which should have caused EBADF. --- lib_scheduler/scheduler.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib_scheduler/scheduler.ml b/lib_scheduler/scheduler.ml index b815fa4..230a144 100644 --- a/lib_scheduler/scheduler.ml +++ b/lib_scheduler/scheduler.ml @@ -680,7 +680,7 @@ module Worker = struct try_close pipe_from_worker_exit ; Fun.protect ~finally:(fun () -> try_close pipe_to_worker_exit ; - try_close pipe_to_worker_entrance) + try_close pipe_from_worker_entrance) @@ fun () -> (* Clear global state in case the worker wants to run a scheduler too. And also to free memory maybe. *) -- GitLab From 09e8823ffb40d3d8fda0871abb9d303fbf6f8cd2 Mon Sep 17 00:00:00 2001 From: Romain Date: Thu, 30 Oct 2025 14:30:40 +0100 Subject: [PATCH 4/6] try to fix a case where the scheduler would get stuck --- lib_scheduler/scheduler.ml | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/lib_scheduler/scheduler.ml b/lib_scheduler/scheduler.ml index 230a144..09a0669 100644 --- a/lib_scheduler/scheduler.ml +++ b/lib_scheduler/scheduler.ml @@ -659,6 +659,8 @@ module Worker = struct (* This is the parent process. *) try_close pipe_to_worker_exit ; try_close pipe_from_worker_entrance ; + Unix.set_nonblock pipe_from_worker_exit ; + Unix.set_nonblock pipe_to_worker_entrance ; { status = Alive @@ -1089,6 +1091,12 @@ type select_result = { } let select {read; write; timeout} ~sigchld_pipe_exit = + let timeout = + (* The mechanism to detect [SIGCHLD] is unreliable (see [handle_sigchld]). + We use a maximum timeout so that the scheduler can at least recover. *) + let max_timeout = 10. in + if timeout < 0. 
|| timeout > max_timeout then max_timeout else timeout + in let readable, writeable = match Unix.select (List.map fst read) (List.map fst write) [] timeout with | exception Unix.Unix_error (EINTR, _, _) -> ([], []) @@ -1151,10 +1159,11 @@ let write_to_writeable_worker (worker : Worker.parent_state) = Message.Writer.write_non_blocking pipe_to_worker_entrance let read_from_sigchld_pipe_exit sigchld_pipe_exit = - let (_ : int) = + match Unix.read sigchld_pipe_exit dummy_bytes 0 (Bytes.length dummy_bytes) - in - () + with + | exception Unix.Unix_error ((EAGAIN | EWOULDBLOCK | EINTR), _, _) -> () + | (_ : int) -> () let check_whether_workers_exited scheduler = Fun.flip Array.iter scheduler.workers @@ fun worker_opt -> @@ -1268,15 +1277,20 @@ let run ?worker_idle_timeout ?worker_kill_timeout @@ fun () -> (* Set up a pipe that will become readable when a worker terminates. *) let sigchld_pipe_exit, sigchld_pipe_entrance = Unix.pipe () in + Unix.set_nonblock sigchld_pipe_exit ; + Unix.set_nonblock sigchld_pipe_entrance ; Fun.protect ~finally:(fun () -> try_close sigchld_pipe_exit ; try_close sigchld_pipe_entrance) @@ fun () -> let handle_sigchld (_ : int) = - (* This could in theory block. - But writing 1 byte, that will likely be consumed quickly, should be fine. *) - let (_ : int) = Unix.write sigchld_pipe_entrance dummy_bytes 0 1 in - () + (* This could in theory fail to write. + This is why we have the timeout on [select]. 
*) + match Unix.single_write sigchld_pipe_entrance dummy_bytes 0 1 with + | exception Unix.Unix_error ((EAGAIN | EWOULDBLOCK | EINTR | EPIPE), _, _) + -> + () + | (_ : int) -> () in let old_sigchld_behavior = Sys.(signal sigchld) (Signal_handle handle_sigchld) -- GitLab From de43a1fc79f8fd1240f006925063377ed914b9c9 Mon Sep 17 00:00:00 2001 From: Romain Date: Thu, 30 Oct 2025 14:33:23 +0100 Subject: [PATCH 5/6] update changelog --- CHANGES.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 59d05f7..d2142dc 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -47,6 +47,15 @@ Same for `~worker_count`. The latter only affects custom schedulers. +### Bug Fixes + +- Fixed a case where the scheduler would exit because of `SIGPIPE`. + +- Fixed a case where workers would trigger `EBADF` on close. + +- Fixed a case where the scheduler would get stuck forever. + As a side-effect, the scheduler now wakes up at least every 10 seconds. + ## Version 4.2.0 ### New Features -- GitLab From 2f8c19c2d186d9356a86d181fda7c2066709b44c Mon Sep 17 00:00:00 2001 From: Romain Date: Thu, 30 Oct 2025 16:55:47 +0100 Subject: [PATCH 6/6] set version to 4.3.0 --- lib_core/version.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib_core/version.ml b/lib_core/version.ml index 6eaf79e..0a314c3 100644 --- a/lib_core/version.ml +++ b/lib_core/version.ml @@ -23,4 +23,4 @@ (* *) (*****************************************************************************) -let full = "4.3.0+dev" +let full = "4.3.0" -- GitLab