From c0fb11e2723b2900cfc7b9afc36251ef12e2df14 Mon Sep 17 00:00:00 2001 From: "iguerNL@Functori" Date: Tue, 13 May 2025 13:27:09 +0200 Subject: [PATCH 1/6] DAL/Node: make it possible to get the latest head seen by the L1 crawler --- src/lib_dal_node/crawler.ml | 25 ++++++++++++++++++++----- src/lib_dal_node/crawler.mli | 4 ++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/lib_dal_node/crawler.ml b/src/lib_dal_node/crawler.ml index fbc5fe367243..abb5d0153c4e 100644 --- a/src/lib_dal_node/crawler.ml +++ b/src/lib_dal_node/crawler.ml @@ -15,9 +15,11 @@ type headers_cache = (Block_header.shell_header, tztrace) Blocks_cache.t type error += Cannot_find_block of Block_hash.t +type block_info = Block_hash.t * Block_header.shell_header + type finalized_heads = { - stream : (Block_hash.t * Block_header.shell_header) Lwt_stream.t; - stream_push : (Block_hash.t * Block_header.shell_header) option -> unit; + stream : block_info Lwt_stream.t; + stream_push : block_info option -> unit; iter_heads_promise : unit tzresult Lwt.t; } @@ -26,6 +28,7 @@ type t = { headers_cache : headers_cache; (** Global block headers cache. *) cctxt : Tezos_rpc.Context.generic; finalized_heads : finalized_heads; + last_seen_head : block_info option ref; } let () = @@ -67,8 +70,8 @@ let get_predecessor crawler_lib hash level = (* This function initiates a call to function iter_heads and pushes finalized L1 heads into the stream whose push function is given. 
*) -let finalized_heads_monitor ~name ~last_notified_level crawler_lib cctxt - headers_cache stream_push = +let finalized_heads_monitor ~name ~last_notified_level ~last_seen_head_ref + crawler_lib cctxt headers_cache stream_push = let open Lwt_result_syntax in let last_notified_level = ref last_notified_level in let rec catch_up_if_needed hash (shell_header : Block_header.shell_header) acc @@ -114,6 +117,7 @@ let finalized_heads_monitor ~name ~last_notified_level crawler_lib cctxt ~level:shell_header_level ~fitness:shell_header.fitness in + last_seen_head_ref := Some (hash, shell_header) ; Dal_metrics.new_layer1_head ~head_level:shell_header_level ; cache_shell_header headers_cache hash shell_header ; if shell_header_level <= !last_notified_level then return_unit @@ -165,17 +169,26 @@ let start ~name ~chain ~reconnection_delay ~l1_blocks_cache_size let last_notified_level = Int32.max 0l last_notified_level in let headers_cache = Blocks_cache.create l1_blocks_cache_size in let stream, stream_push = Lwt_stream.create () in + let last_seen_head_ref = ref None in let iter_heads_promise = finalized_heads_monitor ~name ~last_notified_level + ~last_seen_head_ref crawler_lib cctxt headers_cache stream_push in let finalized_heads = {stream; stream_push; iter_heads_promise} in - return {crawler_lib; cctxt; headers_cache; finalized_heads} + return + { + crawler_lib; + cctxt; + headers_cache; + finalized_heads; + last_seen_head = last_seen_head_ref; + } let finalized_heads_stream {finalized_heads; _} = Lwt_stream.clone finalized_heads.stream @@ -183,3 +196,5 @@ let finalized_heads_stream {finalized_heads; _} = let shutdown {finalized_heads = {stream_push; iter_heads_promise; _}; _} = stream_push None ; Lwt.cancel iter_heads_promise + +let last_seen_head {last_seen_head; _} = !last_seen_head diff --git a/src/lib_dal_node/crawler.mli b/src/lib_dal_node/crawler.mli index c9278ab2b87d..26db45cf8eaf 100644 --- a/src/lib_dal_node/crawler.mli +++ b/src/lib_dal_node/crawler.mli @@ 
-41,3 +41,7 @@ val finalized_heads_stream : (** [shutdown t] shuts down the stream of finalized heads and cancels the monad instance of {!iter_heads} that populates the stream. *) val shutdown : t -> unit + +(** [last_seen_head t] returns the information of the last block that the L1 + crawler saw. *) +val last_seen_head : t -> (Block_hash.t * Block_header.shell_header) option -- GitLab From 116c34c9cd03cf226b4d87ffbab6325bc9e45b26 Mon Sep 17 00:00:00 2001 From: "iguerNL@Functori" Date: Tue, 13 May 2025 14:01:49 +0200 Subject: [PATCH 2/6] DAL/Node: introduce a file l1_crawler_status, with some helper functions --- src/lib_dal_node/l1_crawler_status.ml | 31 +++++++++++++++++++++ src/lib_dal_node/l1_crawler_status.mli | 38 ++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 src/lib_dal_node/l1_crawler_status.ml create mode 100644 src/lib_dal_node/l1_crawler_status.mli diff --git a/src/lib_dal_node/l1_crawler_status.ml b/src/lib_dal_node/l1_crawler_status.ml new file mode 100644 index 000000000000..ff65966a9074 --- /dev/null +++ b/src/lib_dal_node/l1_crawler_status.ml @@ -0,0 +1,31 @@ +(*****************************************************************************) +(* *) +(* SPDX-License-Identifier: MIT *) +(* SPDX-FileCopyrightText: 2024 Functori, *) +(* SPDX-FileCopyrightText: 2024 Nomadic Labs, *) +(* *) +(*****************************************************************************) + +type t = + | Catching_up of {levels_to_process : int32} + | Synced + | Lagging of {levels_to_process : int32} + | L1_bootstrapping + | L1_unreachable + | Unknown + +let catching_up_lagging_or_synced_status ~delta_kind ~head_level + ~last_processed_level = + let last_finalized = Int32.sub head_level 2l in + let levels_to_process = Int32.(sub last_finalized last_processed_level) in + if levels_to_process > 0l then + match delta_kind with + | `Catching_up -> Catching_up {levels_to_process} + | `Lagging -> Lagging {levels_to_process} + else Synced + +let 
catching_up_or_synced_status = + catching_up_lagging_or_synced_status ~delta_kind:`Catching_up + +let lagging_or_synced_status = + catching_up_lagging_or_synced_status ~delta_kind:`Lagging diff --git a/src/lib_dal_node/l1_crawler_status.mli b/src/lib_dal_node/l1_crawler_status.mli new file mode 100644 index 000000000000..478d87728667 --- /dev/null +++ b/src/lib_dal_node/l1_crawler_status.mli @@ -0,0 +1,38 @@ +(*****************************************************************************) +(* *) +(* SPDX-License-Identifier: MIT *) +(* SPDX-FileCopyrightText: 2024 Functori, *) +(* SPDX-FileCopyrightText: 2024 Nomadic Labs, *) +(* *) +(*****************************************************************************) + +(** Status of the DAL node's L1 crawler, indicating its sync state with the L1 + chain. *) +type t = + | Catching_up of {levels_to_process : int32} + (** The crawler is processing a large number of levels at startup to catch + up with the last L1 finalized level. *) + | Synced + (** The crawler is fully synchronized with the L1 chain (it processed the + last finalized L1 block). *) + | Lagging of {levels_to_process : int32} + (** The crawler is behind the L1 finalized block and not progressing. + This may indicate issues or temporary delays. *) + | L1_bootstrapping + (** The L1 node is currently bootstrapping and has not yet reached a + stable state. This status is possible when starting the DAL node. *) + | L1_unreachable (** The DAL node is unable to reach the L1 node. *) + | Unknown (** The crawler's status is unknown. *) + +(** [catching_up_or_synced_status ~head_level ~last_processed_level] returns the + appropriate status of the crawler: [Catching_up] if the + [last_processed_level] is more than 2 levels behind [head_level], + considering the Tenderbake finality lag; [Synced] otherwise. 
+*) +val catching_up_or_synced_status : + head_level:int32 -> last_processed_level:int32 -> t + +(** Similar to {!catching_up_or_synced_status} above, but returns [Lagging] if + the crawler is behind instead of [Catching_up]. *) +val lagging_or_synced_status : + head_level:int32 -> last_processed_level:int32 -> t -- GitLab From 84159c7c7c638092f4ecb50b159a6f694eebb93c Mon Sep 17 00:00:00 2001 From: "iguerNL@Functori" Date: Tue, 13 May 2025 15:19:54 +0200 Subject: [PATCH 3/6] DAL/Node: add a field l1_crawler_status in the node's context --- src/lib_dal_node/node_context.ml | 10 ++++++++-- src/lib_dal_node/node_context.mli | 7 +++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/lib_dal_node/node_context.ml b/src/lib_dal_node/node_context.ml index 77229f19b21b..fa41308ce7de 100644 --- a/src/lib_dal_node/node_context.ml +++ b/src/lib_dal_node/node_context.ml @@ -40,8 +40,9 @@ type t = { transport_layer : Gossipsub.Transport_layer.t; mutable profile_ctxt : Profile_manager.t; mutable last_finalized_level : int32; - (* the highest finalized level the DAL node is aware of (except at start-up, where - it is the highest level the node is aware of) *) + (* the highest finalized level the DAL node is aware of (except at start-up, where + it is the highest level the node is aware of) *) + mutable l1_crawler_status : L1_crawler_status.t; } let init config ~network_name profile_ctxt cryptobox @@ -63,10 +64,15 @@ let init config ~network_name profile_ctxt cryptobox transport_layer; profile_ctxt; last_finalized_level; + l1_crawler_status = Unknown; } let get_tezos_node_cctxt ctxt = ctxt.tezos_node_cctxt +let set_l1_crawler_status ctxt status = ctxt.l1_crawler_status <- status + +let get_l1_crawler_status ctxt = ctxt.l1_crawler_status + let may_reconstruct ~reconstruct slot_id t = let open Lwt_result_syntax in let p = diff --git a/src/lib_dal_node/node_context.mli b/src/lib_dal_node/node_context.mli index 0a65cbc8ab2e..76027ef6e3fe 100644 --- 
a/src/lib_dal_node/node_context.mli +++ b/src/lib_dal_node/node_context.mli @@ -98,6 +98,13 @@ val may_reconstruct : t -> (bytes, Errors.other) result Lwt.t +(** Returns the status of the L1 crawler currently stored in the node + context. *) +val get_l1_crawler_status : t -> L1_crawler_status.t + +(** Updates the status of the L1 crawler with the given value. *) +val set_l1_crawler_status : t -> L1_crawler_status.t -> unit + (** [get_profile_ctxt ctxt] returns the profile context. *) val get_profile_ctxt : t -> Profile_manager.t -- GitLab From 679a55f26c0dd9d61e942f76e92621fb7296710e Mon Sep 17 00:00:00 2001 From: "iguerNL@Functori" Date: Tue, 13 May 2025 15:21:28 +0200 Subject: [PATCH 4/6] DAL/Node: update the L1 crawler status depending on situations --- src/lib_dal_node/block_handler.ml | 19 ++++++++++++++++--- src/lib_dal_node/block_handler.mli | 3 ++- src/lib_dal_node/daemon.ml | 6 ++++++ src/lib_dal_node/store_cleanup.ml | 6 ++++++ 4 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/lib_dal_node/block_handler.ml b/src/lib_dal_node/block_handler.ml index 204e8f8b730a..c82db241c6e0 100644 --- a/src/lib_dal_node/block_handler.ml +++ b/src/lib_dal_node/block_handler.ml @@ -437,7 +437,7 @@ let process_block_data ctxt cctxt store proto_parameters block_level cctxt ~attested_level:block_level -let process_block ctxt cctxt proto_parameters finalized_shell_header +let process_block ctxt cctxt l1_crawler proto_parameters finalized_shell_header finalized_block_hash = let open Lwt_result_syntax in let store = Node_context.get_store ctxt in @@ -469,28 +469,40 @@ let process_block ctxt cctxt proto_parameters finalized_shell_header ~level:block_level ~round:block_round in + let () = + match Crawler.last_seen_head l1_crawler with + | None -> assert false (* Not reachable *) + | Some (_hash, head) -> + L1_crawler_status.lagging_or_synced_status + ~head_level:head.Block_header.level + ~last_processed_level:block_level + |> Node_context.set_l1_crawler_status ctxt 
+ in (* This should be done at the end of the function. *) let last_processed_level_store = Store.last_processed_level store in Store.Last_processed_level.save last_processed_level_store block_level -let rec try_process_block ~retries ctxt cctxt proto_parameters +let rec try_process_block ~retries ctxt cctxt l1_crawler proto_parameters finalized_shell_header finalized_block_hash = let open Lwt_syntax in let* res = process_block ctxt cctxt + l1_crawler proto_parameters finalized_shell_header finalized_block_hash in match res with | Error e when Layer_1.is_connection_error e && retries > 0 -> + Node_context.set_l1_crawler_status ctxt L1_crawler_status.L1_unreachable ; let* () = Lwt_unix.sleep Constants.crawler_re_processing_delay in try_process_block ~retries:(retries - 1) ctxt cctxt + l1_crawler proto_parameters finalized_shell_header finalized_block_hash @@ -502,7 +514,7 @@ let rec try_process_block ~retries ctxt cctxt proto_parameters Tenderbake. Note that this means that shard propagation is delayed by two levels with respect to the publication level of the corresponding slot header. 
*) -let new_finalized_head ctxt cctxt cryptobox finalized_block_hash +let new_finalized_head ctxt cctxt l1_crawler cryptobox finalized_block_hash finalized_shell_header ~launch_time = let open Lwt_result_syntax in let level = finalized_shell_header.Block_header.level in @@ -552,6 +564,7 @@ let new_finalized_head ctxt cctxt cryptobox finalized_block_hash ~retries:Constants.crawler_retries_on_disconnection ctxt cctxt + l1_crawler proto_parameters finalized_shell_header finalized_block_hash diff --git a/src/lib_dal_node/block_handler.mli b/src/lib_dal_node/block_handler.mli index 6c1ce43cb09b..fb33fb5eedb7 100644 --- a/src/lib_dal_node/block_handler.mli +++ b/src/lib_dal_node/block_handler.mli @@ -23,7 +23,7 @@ (* *) (*****************************************************************************) -(** [new_finalized_head ctxt cctxt cryptobox block_hash shell_header +(** [new_finalized_head ctxt cctxt l1_crawler cryptobox block_hash shell_header ~launch_time] processes a new finalized L1 block. It performs cleanup of old DAL data, updates the committee cache, re-registers the gossipsub message validation hook, and triggers block-level processing (e.g. slot header @@ -32,6 +32,7 @@ val new_finalized_head : Node_context.t -> Rpc_context.t -> + Crawler.t -> Cryptobox.t -> Block_hash.t -> Block_header.shell_header -> diff --git a/src/lib_dal_node/daemon.ml b/src/lib_dal_node/daemon.ml index fc93b2928503..5d23431aac31 100644 --- a/src/lib_dal_node/daemon.ml +++ b/src/lib_dal_node/daemon.ml @@ -69,6 +69,7 @@ let on_new_finalized_head ctxt cctxt crawler = Block_handler.new_finalized_head ctxt cctxt + crawler cryptobox finalized_block_hash finalized_shell_header @@ -491,6 +492,7 @@ let run ~data_dir ~configuration_override = let _ = RPC_server.install_finalizer rpc_server in let*! () = Event.emit_rpc_server_is_ready ~point:rpc_addr in (* Wait for the L1 node to be bootstrapped. 
*) + Node_context.(set_l1_crawler_status ctxt L1_bootstrapping) ; let* () = L1_helpers.wait_for_l1_bootstrapped cctxt in let* proto_plugins = Proto_plugins.get_proto_plugins @@ -506,6 +508,10 @@ let run ~data_dir ~configuration_override = match last_processed_level with | None -> (* there's nothing to clean up *) return_unit | Some last_processed_level -> + L1_crawler_status.catching_up_or_synced_status + ~head_level + ~last_processed_level + |> Node_context.set_l1_crawler_status ctxt ; Store_cleanup.clean_up_store_and_catch_up ctxt cctxt diff --git a/src/lib_dal_node/store_cleanup.ml b/src/lib_dal_node/store_cleanup.ml index af511fbd87ef..a3e3f9b76de5 100644 --- a/src/lib_dal_node/store_cleanup.ml +++ b/src/lib_dal_node/store_cleanup.ml @@ -118,6 +118,12 @@ let clean_up_store_and_catch_up_for_refutation_support ctxt cctxt Shell_services.Blocks.Header.shell_header cctxt ~block:(`Head 0) () in let new_head_level = header.Block_header.level in + + L1_crawler_status.catching_up_or_synced_status + ~head_level:new_head_level + ~last_processed_level:last_level + |> Node_context.set_l1_crawler_status ctxt ; + if new_head_level > head_level then do_clean_up last_level new_head_level else let*! () = Event.emit_end_catchup () in -- GitLab From e2e4fc78d2535d7de14fd23697435640c482b81d Mon Sep 17 00:00:00 2001 From: "iguerNL@Functori" Date: Mon, 19 May 2025 19:24:27 +0200 Subject: [PATCH 5/6] DAL/Node: add a TODO to simplify warn_if_lagging with L1 crawler's status --- src/lib_dal_node/RPC_server.ml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lib_dal_node/RPC_server.ml b/src/lib_dal_node/RPC_server.ml index b85fb2554ffc..d90caaa1e873 100644 --- a/src/lib_dal_node/RPC_server.ml +++ b/src/lib_dal_node/RPC_server.ml @@ -395,6 +395,10 @@ module Profile_handlers = struct in Lwt.return_unit + (* TODO: https://gitlab.com/tezos/tezos/-/issues/7969 + + We could reuse the internal L1 crawler's newly added status to implement this + function. 
*) let warn_if_lagging ~last_finalized_level ~attestation_level = (* The L1 node's level is at least [last_finalized_level + 2], because the DAL node processes blocks with a delay of two levels, to be sure that -- GitLab From 886bc8e63e1b7bc7ae85739629915863227963d6 Mon Sep 17 00:00:00 2001 From: "iguerNL@Functori" Date: Mon, 19 May 2025 19:30:08 +0200 Subject: [PATCH 6/6] DAL/Node: add warning about L1 crawler's status dep on block processing logic change --- src/lib_dal_node/block_handler.mli | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/lib_dal_node/block_handler.mli b/src/lib_dal_node/block_handler.mli index fb33fb5eedb7..ad25fb66fcea 100644 --- a/src/lib_dal_node/block_handler.mli +++ b/src/lib_dal_node/block_handler.mli @@ -23,6 +23,15 @@ (* *) (*****************************************************************************) +(* + + /!\ WARNING: + + If the daemon's logic regarding block processing changes, make sure that + the L1 crawler's [status] field in [node_context] is updated accordingly. + +*) + (** [new_finalized_head ctxt cctxt l1_crawler cryptobox block_hash shell_header ~launch_time] processes a new finalized L1 block. It performs cleanup of old DAL data, updates the committee cache, re-registers the gossipsub message -- GitLab