From 7fee278e1b3b3bc7122f2e9417c8ca927004849d Mon Sep 17 00:00:00 2001 From: Victor Allombert Date: Wed, 15 Mar 2023 09:59:14 +0100 Subject: [PATCH 1/3] External_validation: proper initialization request --- src/lib_shell/block_validator_process.ml | 207 ++++++++++++++------- src/lib_validation/external_validation.ml | 27 +-- src/lib_validation/external_validation.mli | 1 - src/lib_validation/external_validator.ml | 36 ++-- 4 files changed, 168 insertions(+), 103 deletions(-) diff --git a/src/lib_shell/block_validator_process.ml b/src/lib_shell/block_validator_process.ml index e16a40cc4376..ecb443502983 100644 --- a/src/lib_shell/block_validator_process.ml +++ b/src/lib_shell/block_validator_process.ml @@ -553,8 +553,8 @@ module External_validator_process = struct type validator_process = { process : Lwt_process.process_none; - stdin : Lwt_io.output_channel; - stdout : Lwt_io.input_channel; + input : Lwt_io.output_channel; + output : Lwt_io.input_channel; canceler : Lwt_canceler.t; clean_up_callback_id : Lwt_exit.clean_up_callback_id; } @@ -595,6 +595,81 @@ module External_validator_process = struct | Some xdg_runtime_dir when xdg_runtime_dir <> "" -> xdg_runtime_dir | Some _ | None -> Filename.get_temp_dir_name () + (* Ad-hoc request to make the handshake with the external validator. + This is expected to be used each time the external validator + process is (re)started. + The scenario of the handshake is the following: + - the node sends some magic bytes, + - the external validator checks the bytes and sends it's magic + bytes, + - the node checks the received bytes, + - handshake is finished. *) + let process_handshake process_input process_output = + let open Lwt_result_syntax in + let*! () = + External_validation.send + process_input + Data_encoding.Variable.bytes + External_validation.magic + in + let*! magic = + External_validation.recv process_output Data_encoding.Variable.bytes + in + fail_when + (not (Bytes.equal magic External_validation.magic)) + (Block_validator_errors.Validation_process_failed + (Inconsistent_handshake "bad magic")) + + (* Ad-hoc request to pass startup arguments to the external + validator. This is expected to be run after the + [process_handshake]. + This is expected to be used each time the external validator + process is (re)started. + The scenario of the init is the following: + - execute the handshake, + - the node sends some parameters and waits for an ack, + - the external validator initializes it's state thanks to the + given parameters, + - the external validator returns a ack to confirm a successful + initialization, + - the node receives the ack and continues, + - initialization is finished. + *) + let process_init vp process_input process_output = + let open Lwt_result_syntax in + let parameters = + { + External_validation.context_root = vp.context_root; + protocol_root = vp.protocol_root; + sandbox_parameters = vp.sandbox_parameters; + genesis = vp.genesis; + user_activated_upgrades = vp.user_activated_upgrades; + user_activated_protocol_overrides = vp.user_activated_protocol_overrides; + operation_metadata_size_limit = vp.operation_metadata_size_limit; + dal_config = vp.dal_config; + log_config = vp.log_config; + } + in + let*! () = + External_validation.send + process_input + External_validation.parameters_encoding + parameters + in + let* () = + External_validation.recv + process_output + (Error_monad.result_encoding Data_encoding.empty) + in + return_unit + + (* Proceeds to a full initialization of the external validator + process by opening the communication channels, spawning the + external validator and calling the handshake and initialization + functions. + TODO: Add critical section for external validator launch, see + https://gitlab.com/tezos/tezos/-/issues/5175 + *) let start_process vp = let open Lwt_result_syntax in let canceler = Lwt_canceler.create () in @@ -678,82 +753,61 @@ module External_validator_process = struct Lwt_exit.register_clean_up_callback ~loc:__LOC__ (fun _ -> Lwt.return (Stdlib.at_exit (fun () -> process#terminate))) in - let process_stdin = Lwt_io.of_fd ~mode:Output process_socket in - let process_stdout = Lwt_io.of_fd ~mode:Input process_socket in + let process_input = Lwt_io.of_fd ~mode:Output process_socket in + let process_output = Lwt_io.of_fd ~mode:Input process_socket in let*! () = Events.(emit validator_started process#pid) in - let parameters = - { - External_validation.context_root = vp.context_root; - protocol_root = vp.protocol_root; - sandbox_parameters = vp.sandbox_parameters; - genesis = vp.genesis; - user_activated_upgrades = vp.user_activated_upgrades; - user_activated_protocol_overrides = vp.user_activated_protocol_overrides; - operation_metadata_size_limit = vp.operation_metadata_size_limit; - dal_config = vp.dal_config; - log_config = vp.log_config; - } - in vp.validator_process <- Running { process; - stdin = process_stdin; - stdout = process_stdout; + input = process_input; + output = process_output; canceler; clean_up_callback_id; } ; - let*! () = - External_validation.send - process_stdin - Data_encoding.Variable.bytes - External_validation.magic - in - let*! magic = - External_validation.recv process_stdout Data_encoding.Variable.bytes - in - let* () = - fail_when - (not (Bytes.equal magic External_validation.magic)) - (Block_validator_errors.Validation_process_failed - (Inconsistent_handshake "bad magic")) - in - let*! () = - External_validation.send - process_stdin - External_validation.parameters_encoding - parameters - in - return (process, process_stdin, process_stdout) - + let* () = process_handshake process_input process_output in + let* () = process_init vp process_input process_output in + return (process, process_input, process_output) + + (* Inspects the validator's state and return it. If the process is + in an inconsistent state, it will be restarted automatically -- + by running [start_process]. *) + let validator_state vp = + let open Lwt_result_syntax in + match vp.validator_process with + | Running + { + process; + input = process_input; + output = process_output; + canceler; + clean_up_callback_id; + } -> ( + match process#state with + | Running -> return (process, process_input, process_output) + | Exited status -> + (* When the process is in an inconsistent state, we restart + it automatically. *) + let*! () = Error_monad.cancel_with_exceptions canceler in + Lwt_exit.unregister_clean_up_callback clean_up_callback_id ; + vp.validator_process <- Uninitialized ; + let*! () = Events.(emit process_exited_abnormally status) in + start_process vp) + | Uninitialized -> start_process vp + | Exiting -> + let*! () = Events.(emit cannot_start_process ()) in + tzfail Block_validator_errors.Cannot_validate_while_shutting_down + + (* Sends the given request to the external validator. If the request + failed to be fulfilled, the status of the external validator is + set to Uninitialized and the associated error is propagated. *) let send_request vp request result_encoding = let open Lwt_result_syntax in - let* process, process_stdin, process_stdout = - match vp.validator_process with - | Running - { - process; - stdin = process_stdin; - stdout = process_stdout; - canceler; - clean_up_callback_id; - } -> ( - match process#state with - | Running -> return (process, process_stdin, process_stdout) - | Exited status -> - let*! () = Error_monad.cancel_with_exceptions canceler in - Lwt_exit.unregister_clean_up_callback clean_up_callback_id ; - vp.validator_process <- Uninitialized ; - let*! () = Events.(emit process_exited_abnormally status) in - start_process vp) - | Uninitialized -> start_process vp - | Exiting -> - let*! () = Events.(emit cannot_start_process ()) in - tzfail Block_validator_errors.Cannot_validate_while_shutting_down - in + let* process, process_input, process_output = validator_state vp in Lwt.catch (fun () -> - (* Make sure that the promise is not canceled between a send and recv *) + (* Make sure that the promise is not cancelled between a send + and recv *) let* res = Lwt.protected (Lwt_mutex.with_lock vp.lock (fun () -> @@ -761,13 +815,13 @@ module External_validator_process = struct let*! () = Events.(emit request_for request) in let*! () = External_validation.send - process_stdin + process_input External_validation.request_encoding request in let*! res = External_validation.recv_result - process_stdout + process_output result_encoding in let timespan = @@ -794,6 +848,12 @@ module External_validator_process = struct in fail_with_exn exn) + (* The initialization phase aims to configure the external validator + and start it's associated process. This will result in the call + of [process_handshake] and [process_init]. + Note that it is important to have [init] as a blocking promise as + the external validator must initialize the context (in RW) before + the node tries to open it (in RO).*) let init ({ user_activated_upgrades; @@ -823,8 +883,11 @@ module External_validator_process = struct log_config; } in - let* () = - send_request validator External_validation.Init Data_encoding.empty + let* (_ : + Lwt_process.process_none + * Lwt_io.output Lwt_io.channel + * Lwt_io.input Lwt_io.channel) = + start_process validator in return validator @@ -940,7 +1003,7 @@ module External_validator_process = struct let open Lwt_syntax in let* () = Events.(emit close ()) in match vp.validator_process with - | Running {process; stdin = process_stdin; canceler; _} -> + | Running {process; input = process_input; canceler; _} -> let request = External_validation.Terminate in let* () = Events.(emit request_for request) in let* () = @@ -950,7 +1013,7 @@ module External_validator_process = struct (* Try to trigger the clean shutdown of the validation process. *) External_validation.send - process_stdin + process_input External_validation.request_encoding request) (function diff --git a/src/lib_validation/external_validation.ml b/src/lib_validation/external_validation.ml index 3211af2d6b69..98c87d7a211e 100644 --- a/src/lib_validation/external_validation.ml +++ b/src/lib_validation/external_validation.ml @@ -42,7 +42,6 @@ type parameters = { } type request = - | Init | Validate of { chain_id : Chain_id.t; block_header : Block_header.t; @@ -96,7 +95,6 @@ type request = Tezos_base_unix.Internal_event_unix.Configuration.t let request_pp ppf = function - | Init -> Format.fprintf ppf "process handshake" | Validate {block_header; chain_id; _} -> Format.fprintf ppf @@ -432,21 +430,15 @@ let request_encoding = let open Data_encoding in union [ + case_validate (Tag 0); case - (Tag 0) - ~title:"init" - empty - (function Init -> Some () | _ -> None) - (fun () -> Init); - case_validate (Tag 1); - case - (Tag 2) + (Tag 1) ~title:"commit_genesis" (obj1 (req "chain_id" Chain_id.encoding)) (function Commit_genesis {chain_id} -> Some chain_id | _ -> None) (fun chain_id -> Commit_genesis {chain_id}); case - (Tag 3) + (Tag 2) ~title:"fork_test_chain" (obj3 (req "chain_id" Chain_id.encoding) @@ -459,22 +451,21 @@ let request_encoding = (fun (chain_id, context_hash, forked_header) -> Fork_test_chain {chain_id; context_hash; forked_header}); case - (Tag 4) + (Tag 3) ~title:"terminate" unit (function Terminate -> Some () | _ -> None) (fun () -> Terminate); - (* Tag 5 was ["restore_integrity"]. *) case - (Tag 6) + (Tag 4) ~title:"reconfigure_event_logging" Tezos_base_unix.Internal_event_unix.Configuration.encoding (function Reconfigure_event_logging c -> Some c | _ -> None) (fun c -> Reconfigure_event_logging c); - case_preapply (Tag 7); - case_precheck (Tag 8); - case_context_gc (Tag 9); - case_context_split (Tag 10); + case_preapply (Tag 5); + case_precheck (Tag 6); + case_context_gc (Tag 7); + case_context_split (Tag 8); ] let send pin encoding data = diff --git a/src/lib_validation/external_validation.mli b/src/lib_validation/external_validation.mli index 7c6bf36b7a6e..786e7d067cb3 100644 --- a/src/lib_validation/external_validation.mli +++ b/src/lib_validation/external_validation.mli @@ -42,7 +42,6 @@ type parameters = { } type request = - | Init | Validate of { chain_id : Chain_id.t; block_header : Block_header.t; diff --git a/src/lib_validation/external_validator.ml b/src/lib_validation/external_validator.ml index f421143c8185..54ed42f7b4ca 100644 --- a/src/lib_validation/external_validator.ml +++ b/src/lib_validation/external_validator.ml @@ -165,6 +165,9 @@ let inconsistent_handshake msg = Block_validator_errors.( Validation_process_failed (Inconsistent_handshake msg)) +(* Handshake with the node. See + [Block_validator_process.process_handshake] for the handshake + scenario. *) let handshake input output = let open Lwt_syntax in let* () = @@ -178,14 +181,18 @@ let handshake input output = (not (Bytes.equal magic External_validation.magic)) (inconsistent_handshake "bad magic") -let init ~readonly input = +(* Initialization of the external process thanks to the parameters + sent by the node. This is expected to be run after the + [handshake]. See [Block_validator_process.process_init] for + the init scenario. *) +let init ~readonly input output = let open Lwt_result_syntax in let*! () = Events.(emit initialization_request ()) in let*! { context_root; protocol_root; - sandbox_parameters; genesis; + sandbox_parameters; user_activated_upgrades; user_activated_protocol_overrides; operation_metadata_size_limit; @@ -213,6 +220,16 @@ let init ~readonly input = ~readonly context_root in + (* It is necessary to send the ok result, as a blocking promise for + the node (see [Block_validator_process.process_init]), after a + complete initialization as the node relies on the external + validator for the context's initialization. *) + let*! () = + External_validation.send + output + (Error_monad.result_encoding Data_encoding.empty) + (Ok ()) + in return ( context_index, protocol_root, @@ -232,7 +249,7 @@ let run ~readonly ~using_std_channel input output = user_activated_protocol_overrides, operation_metadata_size_limit, log_config ) = - init ~readonly input + init ~readonly input output in let*! () = (* if the external validator is spawned in a standalone way and communicates @@ -244,21 +261,16 @@ let run ~readonly ~using_std_channel input output = ~lwt_log_sink:log_config.lwt_log_sink_unix () in + (* Main loop waiting for request to be processed, forever, until the + [Terminate] request is received. + TODO: https://gitlab.com/tezos/tezos/-/issues/5177 + *) let rec loop (cache : Tezos_protocol_environment.Context.block_cache option) cached_result = let*! recved = External_validation.recv input External_validation.request_encoding in match recved with - | External_validation.Init -> - let init : unit Lwt.t = - External_validation.send - output - (Error_monad.result_encoding Data_encoding.empty) - (Ok ()) - in - let*! () = init in - loop cache None | External_validation.Commit_genesis {chain_id} -> let*! () = Events.(emit commit_genesis_request genesis.block) in let*! commit = -- GitLab From e0d718a0e51ad48a84bdcf027820eec8866aa173 Mon Sep 17 00:00:00 2001 From: Victor Allombert Date: Thu, 16 Mar 2023 10:17:56 +0100 Subject: [PATCH 2/3] Tezt: add external validator test --- tezt/tests/external_validation.ml | 107 ++++++++++++++++++++++++++++++ tezt/tests/main.ml | 1 + 2 files changed, 108 insertions(+) create mode 100644 tezt/tests/external_validation.ml diff --git a/tezt/tests/external_validation.ml b/tezt/tests/external_validation.ml new file mode 100644 index 000000000000..b0c2d858fd42 --- /dev/null +++ b/tezt/tests/external_validation.ml @@ -0,0 +1,107 @@ +(*****************************************************************************) +(* *) +(* Open Source License *) +(* Copyright (c) 2023 Nomadic Labs *) +(* *) +(* Permission is hereby granted, free of charge, to any person obtaining a *) +(* copy of this software and associated documentation files (the "Software"),*) +(* to deal in the Software without restriction, including without limitation *) +(* the rights to use, copy, modify, merge, publish, distribute, sublicense, *) +(* and/or sell copies of the Software, and to permit persons to whom the *) +(* Software is furnished to do so, subject to the following conditions: *) +(* *) +(* The above copyright notice and this permission notice shall be included *) +(* in all copies or substantial portions of the Software. *) +(* *) +(* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR*) +(* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *) +(* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *) +(* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER*) +(* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING *) +(* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER *) +(* DEALINGS IN THE SOFTWARE. *) +(* *) +(*****************************************************************************) + +(* Testing + ------- + Component: Node's external validator + Invocation: dune exec tezt/tests/main.exe -- --file + external_validation.ml + Subject: Tests the resilience of the external validator + failures +*) + +let wait_for_external_validator_pid node = + let filter json = JSON.(json |> as_int_opt) in + Node.wait_for node "proc_validator_started.v0" filter + +(* Typical signals that could be sent. This could be enriched but the + effects are expected to be similar. + Note that the behaviour of SIGSTOP is undefined here, as the node + will hang forever. *) +type signal = SIGABRT | SIGINT | SIGKILL | SIGQUIT | SIGTERM + +let all_signals = [SIGABRT; SIGINT; SIGKILL; SIGQUIT; SIGTERM] + +let signal_to_int = function + | SIGABRT -> Sys.sigabrt + | SIGINT -> Sys.sigint + | SIGKILL -> Sys.sigkill + | SIGQUIT -> Sys.sigquit + | SIGTERM -> Sys.sigterm + +let pp_signal ppf signal = + let str = + match signal with + | SIGABRT -> "sigabrt" + | SIGINT -> "sigint" + | SIGKILL -> "sigkill" + | SIGQUIT -> "sigquit" + | SIGTERM -> "sigterm" + in + Format.fprintf ppf "%s" str + +let wait_for_external_validator_failure node = + let filter json = JSON.(json |> as_int_opt) in + Node.wait_for node "proc_status.v0" filter + +let kill_process ~pid ~signal = Unix.kill pid (signal_to_int signal) + +let test_kill = + Protocol.register_test + ~__FILE__ + ~title:"external validator kill" + ~tags:["node"; "external"; "validator"; "kill"] + @@ fun protocol -> + let node = Node.create [] in + let wait_for_validator_pid = wait_for_external_validator_pid node in + let* () = Node.run node [] in + let* () = Node.wait_for_ready node in + let* client = Client.init ~endpoint:(Node node) () in + let* validator_pid = wait_for_validator_pid in + let* () = Client.activate_protocol_and_wait ~protocol client in + Log.info "Wait for level 1" ; + let* (_ : int) = Node.wait_for_level node 1 in + Log.info + "Kill the external validator (pid %d), and then, bake a block" + validator_pid ; + let kill_loop (level, validator_pid) signal = + (* Starts with a running process. *) + let wait_for_new_validator_pid = wait_for_external_validator_pid node in + let () = kill_process ~pid:validator_pid ~signal in + Log.info "External validator was killed by %a" pp_signal signal ; + Log.info "Baking a block with a dead validator" ; + let wait_for_failure = wait_for_external_validator_failure node in + let* () = Client.bake_for_and_wait client in + let* (_ : int) = wait_for_failure in + let* new_validator_pid = wait_for_new_validator_pid in + let* level = Node.wait_for_level node (level + 1) in + return (level, new_validator_pid) + in + let* (_ : int * int) = + Lwt_list.fold_left_s kill_loop (1, validator_pid) all_signals + in + unit + +let register ~protocols = test_kill protocols diff --git a/tezt/tests/main.ml b/tezt/tests/main.ml index a0f342719ff4..7172774f84c6 100644 --- a/tezt/tests/main.ml +++ b/tezt/tests/main.ml @@ -135,6 +135,7 @@ let register_protocol_tests_that_use_supports_correctly () = Double_bake.register ~protocols ; Encoding.register ~protocols ; Events.register ~protocols ; + External_validation.register ~protocols ; Forge.register ~protocols ; Fork.register ~protocols ; Gas_bound.register ~protocols ; -- GitLab From e9fe3e30369a1af8150081da59aed80e11058271 Mon Sep 17 00:00:00 2001 From: Victor Allombert Date: Thu, 16 Mar 2023 14:07:57 +0100 Subject: [PATCH 3/3] Validation: add cloexec flag when creating validator's socket --- src/lib_validation/external_validation.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib_validation/external_validation.ml b/src/lib_validation/external_validation.ml index 98c87d7a211e..50292ed8e16f 100644 --- a/src/lib_validation/external_validation.ml +++ b/src/lib_validation/external_validation.ml @@ -504,7 +504,7 @@ let make_socket socket_path = Unix.ADDR_UNIX socket_path let create_socket ~canceler = let open Lwt_syntax in - let socket = Lwt_unix.socket PF_UNIX SOCK_STREAM 0o000 in + let socket = Lwt_unix.socket ~cloexec:true PF_UNIX SOCK_STREAM 0o000 in Lwt_unix.set_close_on_exec socket ; Lwt_canceler.on_cancel canceler (fun () -> let* (_ : unit tzresult) = Lwt_utils_unix.safe_close socket in -- GitLab