diff --git a/tezt/lib_cloud/agent.ml b/tezt/lib_cloud/agent.ml index 94fab887e631dda94c1b6108a09855f9f02c7ae1..c7e73af1042f2724dac796970b00ef524d4d1c93 100644 --- a/tezt/lib_cloud/agent.ml +++ b/tezt/lib_cloud/agent.ml @@ -69,6 +69,8 @@ type t = { configuration : Configuration.t; process_monitor : Process_monitor.t option; service_manager : Service_manager.t option; + daily_logs_dir : string option; + mutable on_shutdown : (unit -> unit Lwt.t) list; } let ssh_id () = Env.ssh_private_key_filename () @@ -87,19 +89,23 @@ let encoding = configuration; process_monitor; service_manager = _; + daily_logs_dir; + on_shutdown = _; } -> ( vm_name, zone, point, next_available_port (), configuration, - process_monitor )) + process_monitor, + daily_logs_dir )) (fun ( vm_name, zone, point, next_available_port, configuration, - process_monitor ) -> + process_monitor, + daily_logs_dir ) -> let next_available_port = let current_port = ref (next_available_port - 1) in fun () -> @@ -139,15 +145,18 @@ let encoding = configuration; process_monitor; service_manager = None; - (* As of now, this encoding is only used when reattaching *) + daily_logs_dir; + on_shutdown = + [] (* As of now, this encoding is only used when reattaching *); }) - (obj6 + (obj7 (req "vm_name" (option string)) (req "zone" (option string)) (req "point" (option (tup2 string int31))) (req "next_available_port" int31) (req "configuration" Configuration.encoding) - (opt "process_monitor" Process_monitor.encoding)) + (opt "process_monitor" Process_monitor.encoding) + (opt "daily_logs_dir" string)) (* Getters *) @@ -163,8 +172,10 @@ let runner {runner; _} = runner let configuration {configuration; _} = configuration +let daily_logs_dir {daily_logs_dir; _} = daily_logs_dir + let make ?zone ?ssh_id ?point ~configuration ~next_available_port ~vm_name - ~process_monitor () = + ~process_monitor ~daily_logs_dir () = let ssh_user = "root" in let runner = match (point, ssh_id) with @@ -190,6 +201,8 @@ let make ?zone ?ssh_id 
?point ~configuration ~next_available_port ~vm_name zone; process_monitor; service_manager = Service_manager.init () |> Option.some; + daily_logs_dir; + on_shutdown = []; } let cmd_wrapper {zone; vm_name; _} = @@ -213,6 +226,14 @@ let process_monitor agent = agent.process_monitor let service_manager t = t.service_manager +let temp_execution_path () = + (* This assumes that Tezt.Temp.file always returns the same result for the + same process. *) + Temp.dir "" + +let register_shutdown_callback t callback = + t.on_shutdown <- callback :: t.on_shutdown + let host_run_command agent cmd args = match cmd_wrapper agent with | None -> Process.spawn cmd args diff --git a/tezt/lib_cloud/agent.mli b/tezt/lib_cloud/agent.mli index c15c3ab8a87ab957b62758364b00c191a898ed9c..2b57937a06af95f1557df4629393b282374327fd 100644 --- a/tezt/lib_cloud/agent.mli +++ b/tezt/lib_cloud/agent.mli @@ -50,11 +50,12 @@ module Configuration : sig t end -(** [make ?zone ?ssh_id ?point ~configuration ~next_available_port ~vm_name ()] - creates an [agent] from the given parameters. [~next_available_port] should - always provide an available port or raise [Not_found] otherwise. - [~vm_name] is the name of the VM. [?ssh_id] and [?point] are used to potentially - create a [runner] for the [agent]. *) +(** [make ?zone ?ssh_id ?point ~configuration ~next_available_port ~vm_name + ~daily_logs_dir ()] creates an [agent] from the given parameters. + [~next_available_port] should always provide an available port or raise + [Not_found] otherwise. [~vm_name] is the name of the VM. [?ssh_id] and + [?point] are used to potentially create a [runner] for the [agent]. + [daily_logs_dir] stands for the path to the agent's daily logs. 
 *) val make : ?zone:string -> ?ssh_id:string -> ?point:string * int -> configuration:Configuration.t -> @@ -63,6 +64,7 @@ val make : next_available_port:(unit -> int) -> vm_name:string option -> process_monitor:Process_monitor.t option -> + daily_logs_dir:string option -> unit -> t @@ -88,6 +90,9 @@ val runner : t -> Runner.t option (** [configuration t] the configuration of the agent. *) val configuration : t -> Configuration.t +(** [daily_logs_dir agent] daily logs directory associated with the agent. *) +val daily_logs_dir : t -> string option + (** A wrapper to run a command on the VM of the agent. *) val cmd_wrapper : t -> Gcloud.cmd_wrapper option @@ -100,6 +105,13 @@ val process_monitor : t -> Process_monitor.t option (** Returns the service manager if any *) val service_manager : t -> Service_manager.t option +(** Returns the path in which the agent stores its data. *) +val temp_execution_path : unit -> string + +(** Register a callback that will be executed as soon as the agent is shutting + down. *) +val register_shutdown_callback : t -> (unit -> unit Lwt.t) -> unit + (** Run a command on the docker image run by the agent. This command should not be used outside of the [tezt-cloud] diff --git a/tezt/lib_cloud/cli.ml b/tezt/lib_cloud/cli.ml index 945de1e91e85e36cae4707ea4dad5aec529b64ac..8a180f5782ec27d6b339a14f955d40c326a4a792 100644 --- a/tezt/lib_cloud/cli.ml +++ b/tezt/lib_cloud/cli.ml @@ -728,6 +728,35 @@ let log_rotation = \ Set to 0 to completely disable log-rotation" (Option.value ~default:300 config.log_rotation) +let daily_logs_typ : string option Clap.typ = + Clap.typ + ~name:"daily_logs" + ~dummy:None + ~parse:(fun s -> + if Sys.file_exists s then ( + Log.error + "The destination folder of --retrieve-daily-logs already exists: %s" + s ; + None) + else if proxy then ( + Log.warn + "The --retrieve-daily-logs option is not available when --proxy is \ used."
; + Some None) + else Some (Some s)) + ~show:(function Some s -> s | None -> "empty") + +let retrieve_daily_logs = + Clap.default + ~section + ~long:"retrieve-daily-logs" + ~description: + "Retrieves the daily logs, usually info logs, that are generated by the \ + daemons, and stores it at the given path. This can represent quite a \ + huge quantity of data. Set to [false] by default." + daily_logs_typ + None + let section = Clap.section ~description:"Define report and alert managing options" diff --git a/tezt/lib_cloud/cli.mli b/tezt/lib_cloud/cli.mli index 55a88094e8e840501f2b5d3b5521c56f50cf8989..fa119d8138af2a4791157a2918b2f87fa47fa9e0 100644 --- a/tezt/lib_cloud/cli.mli +++ b/tezt/lib_cloud/cli.mli @@ -147,6 +147,9 @@ val binaries_path : string Use 0 to disable log-rotation *) val log_rotation : int +(* Daily log path retrieval if set. *) +val retrieve_daily_logs : string option + (** The hostname of the host accessed by ssh on which to deploy *) val ssh_host : string option diff --git a/tezt/lib_cloud/cloud.ml b/tezt/lib_cloud/cloud.ml index 84e1e45de966e7491687b29def74d05a86738fc6..c177d59679ff1c33c3f54efc2ba0a06df895b878 100644 --- a/tezt/lib_cloud/cloud.ml +++ b/tezt/lib_cloud/cloud.ml @@ -53,6 +53,15 @@ let shutdown ?exn t = Lwt.return_unit) else Lwt.return_unit in + (* Shutdown the service managers before alert_manager *) + let* () = + Lwt_list.iter_s + (fun agent -> + match Agent.service_manager agent with + | None -> Lwt.return_unit + | Some sm -> Service_manager.shutdown sm) + t.agents + in Log.info "Shutting down processes..." 
; let* () = Lwt.catch @@ -114,13 +123,6 @@ let shutdown ?exn t = (Printexc.to_string exn) ; Lwt.return_unit) in - (* Shutdown the service managers before alert_manager *) - let () = - List.iter - (fun agent -> - Option.iter Service_manager.shutdown (Agent.service_manager agent)) - t.agents - in let* () = if Option.is_some t.alert_manager then Alert_manager.shutdown () else Lwt.return_unit @@ -628,6 +630,7 @@ let register ?proxy_files ?proxy_args ?vms ~__FILE__ ~title ~tags ?seed ?alerts ~next_available_port ~vm_name:None ~process_monitor + ~daily_logs_dir:Env.retrieve_daily_logs () in f @@ -742,6 +745,7 @@ let agents t = ~next_available_port ~vm_name:(Some (Format.asprintf "%s-orchestrator" Env.tezt_cloud)) ~process_monitor + ~daily_logs_dir:Env.retrieve_daily_logs () in [default_agent] @@ -845,7 +849,7 @@ let agents_by_service_name = Hashtbl.create 10 let service_name agent name = Format.asprintf "%s-%s" (Agent.name agent) name -let service_register ~name ~executable ?on_alive_callback agent = +let service_register ~name ~executable ?on_alive_callback ~on_shutdown agent = match Agent.service_manager agent with | None -> () | Some service_manager -> @@ -855,6 +859,7 @@ let service_register ~name ~executable ?on_alive_callback agent = ~name ~executable ?on_alive_callback + ~on_shutdown service_manager let notify_service_start ~name ~pid = diff --git a/tezt/lib_cloud/cloud.mli b/tezt/lib_cloud/cloud.mli index 2b13e979949c2e24d724bfb688a9b511ddbf5ccf..d2e21378fed679022fa3a1ff5fb9506f5e02f7fc 100644 --- a/tezt/lib_cloud/cloud.mli +++ b/tezt/lib_cloud/cloud.mli @@ -60,6 +60,7 @@ val service_register : name:string -> executable:string -> ?on_alive_callback:(alive:bool -> unit) -> + on_shutdown:(unit -> unit Lwt.t) list -> Agent.t -> unit diff --git a/tezt/lib_cloud/deployement.ml b/tezt/lib_cloud/deployement.ml index 14fc1d1322cee24c8e6953ccaf85bc23bf15b867..5951605a9460c0c0266320a62a28548a983d3d1f 100644 --- a/tezt/lib_cloud/deployement.ml +++ 
b/tezt/lib_cloud/deployement.ml @@ -70,6 +70,7 @@ module Remote = struct let os = vm_configuration.os in let auto_approve = Env.auto_approve in let prometheus_port = Env.prometheus_port in + let daily_logs_dir = Env.retrieve_daily_logs in let* () = Terraform.VM.deploy ~auto_approve @@ -123,6 +124,7 @@ module Remote = struct ~next_available_port ~vm_name:(Some vm_name) ~process_monitor + ~daily_logs_dir () |> Lwt.return in @@ -530,6 +532,7 @@ module Ssh_host = struct ~point:(Runner.address (Some runner), ssh_listening_port) ~ssh_id:(Env.ssh_private_key_filename ()) ~process_monitor:None + ~daily_logs_dir:Env.retrieve_daily_logs () in Lwt.return agent @@ -597,6 +600,7 @@ module Ssh_host = struct ~process_monitor:None ~point:(host, ssh_port) ~ssh_id:(Env.ssh_private_key_filename ()) + ~daily_logs_dir:Env.retrieve_daily_logs () in Lwt.return agent) @@ -741,6 +745,7 @@ module Localhost = struct ~next_available_port:(fun () -> next_port point) ~vm_name:None ~process_monitor + ~daily_logs_dir:Env.retrieve_daily_logs ()) in Lwt.return {number_of_vms; processes; base_port; ports_per_vm; agents} diff --git a/tezt/lib_cloud/env.ml b/tezt/lib_cloud/env.ml index f11c91508523dc6b2fda764102a4d0d926d1b225..8147c3f24b364ee743b52e58327e2ba13862ccd2 100644 --- a/tezt/lib_cloud/env.ml +++ b/tezt/lib_cloud/env.ml @@ -109,6 +109,8 @@ let process_monitoring = Cli.process_monitoring let log_rotation = Cli.log_rotation +let retrieve_daily_logs = Cli.retrieve_daily_logs + let init () = if tezt_cloud = "" then Test.fail diff --git a/tezt/lib_cloud/env.mli b/tezt/lib_cloud/env.mli index c32c9da4482c903cc4068ec7700eacd086727b9f..84f2a06a6683c6510f2c65185e4ac670d64390db 100644 --- a/tezt/lib_cloud/env.mli +++ b/tezt/lib_cloud/env.mli @@ -143,6 +143,9 @@ val binaries_path : string (** Equivalent to [Cli.log_rotation] *) val log_rotation : int +(** Equivalent to [Cli.retrieve_daily_logs] *) +val retrieve_daily_logs : string option + (** Notification backend, slack_channel_id and slack_bot_token 
*) val notifier : Types.notifier diff --git a/tezt/lib_cloud/service_manager.ml b/tezt/lib_cloud/service_manager.ml index b7b28e672b10dd009fc69da72f01fe80cdb9ca19..7730e5175f27aa3b2742f5568e2d0f495c3aceb0 100644 --- a/tezt/lib_cloud/service_manager.ml +++ b/tezt/lib_cloud/service_manager.ml @@ -10,6 +10,7 @@ type service = { mutable executable : string option; on_alive_callback : alive:bool -> unit; mutable pid : int option; + mutable on_shutdown : (unit -> unit Lwt.t) list; } type t = { @@ -93,7 +94,7 @@ let register_service ~name ~executable ?(on_alive_callback = fun ~alive -> ignore alive ; - ()) t = + ()) ~on_shutdown t = (* Start only when needed *) let () = if Hashtbl.length t.services = 0 then start t else () in (* Get the real executable name *) @@ -101,12 +102,14 @@ let register_service ~name ~executable if Sys.file_exists executable then let executable = Unix.realpath executable in let service = - {executable = Some executable; on_alive_callback; pid = None} + {executable = Some executable; on_alive_callback; pid = None; on_shutdown} in let () = Hashtbl.add t.services name service in Log.info "%s: Registering service: %s (%s)" section name executable else - let service = {executable = None; on_alive_callback; pid = None} in + let service = + {executable = None; on_alive_callback; pid = None; on_shutdown} + in let () = Hashtbl.add t.services name service in Log.info "%s: Registering service: %s (%s)" section name executable @@ -135,4 +138,21 @@ let notify_stop_service ~name t = let () = Log.info "%s: Notify stop service %s" section name in service.pid <- None -let shutdown t = Lwt.wakeup t.worker_waker () +let shutdown t = + let on_shutdown_callbacks = + Hashtbl.fold + (fun name service acc -> (name, service.on_shutdown) :: acc) + t.services + [] + in + let* () = + Lwt_list.iter_s + (fun (name, callbacks) -> + Log.info + "Running service manager shutdown callback for service: %s" + name ; + Lwt_list.iter_s (fun callback -> callback ()) callbacks) + 
on_shutdown_callbacks + in + Lwt.wakeup t.worker_waker () ; + Lwt.return_unit diff --git a/tezt/lib_cloud/service_manager.mli b/tezt/lib_cloud/service_manager.mli index c5fc06165186b9de25660b23a33b9e6d4ac34355..fa8ad574ecc58b88d8496787476ac529ce095155 100644 --- a/tezt/lib_cloud/service_manager.mli +++ b/tezt/lib_cloud/service_manager.mli @@ -13,14 +13,19 @@ type t (** [init] creates a new instance of a service manager *) val init : unit -> t -(** [register_service ~executable ~pid ~on_alive_callback t] register a new - service with the manager [t]. The [on_alive_callback] callback is called - regularly with a boolean indicating the daemon state. - Automatically start the loop on the first service. *) +(** [register_service ~executable ~pid ~on_alive_callback on_shutdown t] + register a new service with the manager [t]. + The [on_alive_callback] callback is called regularly with a boolean + indicating the daemon state. Automatically start the loop on the first + service. + The [on_shutdown] callbacks aim to be run as soon as [shutdown] is called on + the service. +*) val register_service : name:string -> executable:string -> ?on_alive_callback:(alive:bool -> unit) -> + on_shutdown:(unit -> unit Lwt.t) list -> t -> unit @@ -30,5 +35,6 @@ val notify_start_service : name:string -> pid:int -> t -> unit (** [notify_stop_service name pid] notifies service [name] was stopped *) val notify_stop_service : name:string -> t -> unit -(** [shutdown t] terminates the service manager [t] *) -val shutdown : t -> unit +(** [shutdown t] terminates the service manager [t]. This will run all the + [on_shutdown] callbacks attached to this service.
 *) +val shutdown : t -> unit Lwt.t diff --git a/tezt/lib_cloud/tezt_cloud.ml b/tezt/lib_cloud/tezt_cloud.ml index b15cdd90a4233526348bab99b2683e0055b63c21..9e693ad953f8d55d059e8730d41847b3a2574385 100644 --- a/tezt/lib_cloud/tezt_cloud.ml +++ b/tezt/lib_cloud/tezt_cloud.ml @@ -303,4 +303,6 @@ module Tezt_cloud_cli = struct let prometheus = Cli.prometheus let scenario_specific_json = Cli.scenario_specific + + let retrieve_daily_logs = Cli.retrieve_daily_logs end diff --git a/tezt/lib_cloud/tezt_cloud.mli b/tezt/lib_cloud/tezt_cloud.mli index b397424aad26115f059a0910a932c180a292e054..4c463fd37f2e808f56957af65f740443d268f607 100644 --- a/tezt/lib_cloud/tezt_cloud.mli +++ b/tezt/lib_cloud/tezt_cloud.mli @@ -214,20 +214,23 @@ module Cloud : sig unit -> unit Lwt.t - (** [service_register: name executable on_alive_callback agent] register a - service, ie, a long running background process, that we want to monitor - for launch and crash. + (** [service_register: name executable on_alive_callback on_shutdown agent] + register a service, ie, a long running background process, that we want to + monitor for launch and crash. [name] is a unique name to identify the service. [on_alive_callback] is a callback whose argument is a boolean which represent the service started if true, or the service was shutdown if false. This callback is called regularly, and expects to be update some metrics. + [on_shutdown] is a list of callbacks that will be called as soon as the + shutdown of a service is triggered.
TODO: change arguments executable and pid to a abstraction for tezt Daemon.t and merge register_binary functionality into register_service *) val service_register : name:string -> executable:string -> ?on_alive_callback:(alive:bool -> unit) -> + on_shutdown:(unit -> unit Lwt.t) list -> Agent.t -> unit @@ -257,6 +260,9 @@ module Tezt_cloud_cli : sig val prometheus : bool val scenario_specific_json : (string * Data_encoding.Json.t) option + + (** Equivalent to [Cli.retrieve_daily_logs] *) + val retrieve_daily_logs : string option end (** [register ~tags] register a set of jobs that can be used for setting diff --git a/tezt/tests/cloud/agent_kind.ml b/tezt/tests/cloud/agent_kind.ml index 35d25cf979978e2810b7868d3b7e01638b76f7a1..9cc6ab224c6da374168ceedc3894f377b0d8a29a 100644 --- a/tezt/tests/cloud/agent_kind.ml +++ b/tezt/tests/cloud/agent_kind.ml @@ -61,3 +61,61 @@ let name_of_daemon = function Format.asprintf "etherlink-%s-rollup-node" name | Etherlink_evm_node name -> Format.asprintf "etherlink-%s-evm-node" name | Etherlink_producer_node name -> Format.asprintf "etherlink-%s-node" name + +module Logs = struct + let scp_logs ~destination_root ~daemon_name agent = + let agent_name = Agent.name agent in + (* This is not compatible with the --proxy mode as the Agent's location of + the proxy might differ from the localhost one. 
*) + let tezt_root_path = Agent.temp_execution_path () in + Log.info "Retrieving logs from %s" daemon_name ; + match Agent.runner agent with + | None -> + Log.warn "Cannot retrieve logs for %s: no runner for agent" agent_name ; + Lwt.return_unit + | Some runner -> + let identity = + Option.fold ~none:[] ~some:(fun i -> ["-i"; i]) runner.Runner.ssh_id + in + let port = + Option.fold + ~none:[] + ~some:(fun p -> ["-P"; Format.sprintf "%d" p]) + runner.Runner.ssh_port + in + let source = + Format.sprintf + "%s%s:%s" + (Option.fold + ~none:"" + ~some:(fun u -> Format.sprintf "%s@" u) + runner.Runner.ssh_user) + runner.address + tezt_root_path + in + let local_path = + let local_path_root = destination_root // agent_name in + if not (Sys.file_exists destination_root) then + Sys.mkdir destination_root 0o755 ; + if not (Sys.file_exists local_path_root) then + Sys.mkdir local_path_root 0o755 ; + let local_path = local_path_root // daemon_name in + let () = Sys.mkdir local_path 0o755 in + local_path + in + Lwt.catch + (fun () -> + Process.run + "scp" + (["-r"] @ ["-O"] + @ ["-o"; "StrictHostKeyChecking=no"] + @ identity @ port + @ [source // daemon_name // "daily_logs"] + @ [local_path // "daily_logs"])) + (fun exn -> + Log.warn + "Cannot retrieve log from %s: %s" + agent_name + (Printexc.to_string exn) ; + Lwt.return_unit) +end diff --git a/tezt/tests/cloud/agent_kind.mli b/tezt/tests/cloud/agent_kind.mli index e82150a3d871c573a189d6c3c9a416bf4b44d4ca..012158aaf468b01ba40caba2be094b2056be8a6d 100644 --- a/tezt/tests/cloud/agent_kind.mli +++ b/tezt/tests/cloud/agent_kind.mli @@ -54,3 +54,15 @@ type daemon = (** [name_of_daemon] returns the standard name associated with a given [daemon]. Used for consistent naming of VMs, logs and artifacts. 
*) val name_of_daemon : daemon -> string + +module Logs : sig + (** [scp_logs ~destination_root ~daemon_name agent] uses scp to copy the + `daily_logs` directory from the VM hosting the [agent]'s actor given by + [~daemon_name] into [~destination_root//~daemon_name/daily_logs]. + + If the agent has no SSH runner or the copying process fails, the function is + a no-op (with a corresponding warning). Any missing directory is automatically + created. *) + val scp_logs : + destination_root:string -> daemon_name:string -> Agent.t -> unit Lwt.t +end diff --git a/tezt/tests/cloud/dal.ml b/tezt/tests/cloud/dal.ml index befb4532c2a7d2eef5549b2fcdb0b75a454f75b5..d841926dc06d1b22999fbd2bdbf0bf5c08ce5450 100644 --- a/tezt/tests/cloud/dal.ml +++ b/tezt/tests/cloud/dal.ml @@ -61,6 +61,7 @@ type configuration = { ppx_profiling : bool; ppx_profiling_backends : string list; network_health_monitoring : bool; + daily_logs_destination : string option; } type bootstrap = { @@ -1335,6 +1336,7 @@ let register (module Cli : Scenarios_cli.Dal) = let ppx_profiling = Cli.ppx_profiling in let ppx_profiling_backends = Cli.ppx_profiling_backends in let network_health_monitoring = Cli.enable_network_health_monitoring in + let daily_logs_destination = Tezt_cloud_cli.retrieve_daily_logs in let t = { with_dal; @@ -1367,6 +1369,7 @@ let register (module Cli : Scenarios_cli.Dal) = ppx_profiling; ppx_profiling_backends; network_health_monitoring; + daily_logs_destination; } in (t, etherlink) diff --git a/tezt/tests/cloud/layer1.ml b/tezt/tests/cloud/layer1.ml index d528a52ed3daee2c533ce3e41ab4749bec254814..41dac25c70b4f3ab98aac17136d9388a2ce2dcc1 100644 --- a/tezt/tests/cloud/layer1.ml +++ b/tezt/tests/cloud/layer1.ml @@ -319,6 +319,8 @@ type stresstest_conf = {pkh : string; pk : string; tps : int; seed : int} upgrade will be performed via a UAU. - [stresstest]: See the description of [stresstest_conf] + + - [daily_logs_destination]: daemons daily logs retrieval folder, if set. 
*) type configuration = { stake : int list; @@ -327,6 +329,7 @@ type configuration = { stresstest : stresstest_conf option; maintenance_delay : int; migration_offset : int option; + daily_logs_destination : string option; } (** A version of the [configuration] partially defined. *) @@ -1094,7 +1097,16 @@ let register (module Cli : Scenarios_cli.Layer1) = if stake = [] then Test.fail "stake parameter can not be empty" ; if snapshot = Snapshot_helpers.No_snapshot then Test.fail "snapshot parameter can not be empty" ; - {stake; network; snapshot; stresstest; maintenance_delay; migration_offset} + let daily_logs_destination = Tezt_cloud_cli.retrieve_daily_logs in + { + stake; + network; + snapshot; + stresstest; + maintenance_delay; + migration_offset; + daily_logs_destination; + } in toplog "Creating the agents" ; let agents = Cloud.agents cloud in diff --git a/tezt/tests/cloud/tezos.ml b/tezt/tests/cloud/tezos.ml index 7c2853a9489d9c5650808b7084d073138b268b6a..e6896f224fc3aaf9fdae39400bc157e110d0d24f 100644 --- a/tezt/tests/cloud/tezos.ml +++ b/tezt/tests/cloud/tezos.ml @@ -290,7 +290,7 @@ module Node = struct module Agent = struct let create ?(group = "L1") ?rpc_external ?(metadata_size_limit = true) ?(arguments = []) ?data_dir ?(path = Uses.path Constant.octez_node) - ?name ?net_addr cloud agent = + ~name ?net_addr cloud agent = let* path = Agent.copy agent ~source:path in let binary_name = Filename.basename path in let* () = @@ -326,7 +326,7 @@ module Node = struct let node = create ?data_dir - ?name + ~name ~path ?runner ?rpc_external @@ -377,12 +377,7 @@ module Node = struct Format.asprintf "%s:prometheus-process-exporter" (Agent.name agent) in let target = Cloud.{agent; port = Node.metrics_port node; app_name} in - let* () = - Cloud.add_prometheus_source - cloud - ~name:(Option.value name ~default:(Node.name node)) - [target] - in + let* () = Cloud.add_prometheus_source cloud ~name [target] in (* Prometheus process-exporter *) Alerts.add_process_exporter_alerts 
~cloud @@ -394,10 +389,23 @@ module Node = struct given in command line. The alerts must match the same groupname *) receiver in + let on_shutdown = + match Agent.daily_logs_dir agent with + | None -> [] + | Some destination_root -> + [ + (fun () -> + Agent_kind.Logs.scp_logs + ~destination_root + ~daemon_name:name + agent); + ] + in Cloud.service_register ~name:node_name ~executable ~on_alive_callback + ~on_shutdown agent ; Lwt.return node @@ -499,7 +507,7 @@ module Dal_node = struct module Agent = struct let create_from_endpoint ?(group = "DAL") ?net_port - ?(path = Uses.path Constant.octez_dal_node) ?name ?rpc_port + ?(path = Uses.path Constant.octez_dal_node) ~name ?rpc_port ?disable_shard_validation ?ignore_pkhs ~l1_node_endpoint cloud agent = let* path = Agent.copy agent ~source:path in let binary_name = Filename.basename path in @@ -527,7 +535,7 @@ module Dal_node = struct let listen_addr = Format.asprintf "0.0.0.0:%d" net_port in let node = create_from_endpoint - ?name + ~name ~path ?runner ~rpc_port @@ -557,11 +565,22 @@ module Dal_node = struct ] (if alive then 1.0 else 0.0) in - Cloud.service_register - ~name:node_name - ~executable - ~on_alive_callback - agent ; + let alert = + Alert.make + ~name:"ServiceManagerProcessDown" + ~description: + {|This alert is raised when a process monitored by the service_manager is detected as being not running. 
This happens typically when the process pid is not found anymore in the process tree, or the pid has been recycled and does not correspond to the executable that was run initially|} + ~summary: + (Format.asprintf + "'[%s.service_manager] the process [%s] is down'" + (Agent.name agent) + executable) + ~route:(Alert.route receiver) + ~severity:Alert.Critical + ~expr:(Format.asprintf {|%s{name="%s"} < 1|} metric_name name) + () + in + let* () = Cloud.add_alert cloud ~alert in let alert = Alerts.service_manager_process_down ~agent:(Agent.name agent) @@ -579,12 +598,7 @@ module Dal_node = struct let target = Cloud.{agent; port = Dal_node.metrics_port node; app_name} in - let* () = - Cloud.add_prometheus_source - cloud - ~name:(Option.value name ~default:(Dal_node.name node)) - [target] - in + let* () = Cloud.add_prometheus_source cloud ~name [target] in Alerts.add_process_exporter_alerts ~cloud ~agent_name:(Agent.name agent) @@ -593,14 +607,32 @@ module Dal_node = struct ~groupname:binary_name receiver in + let on_shutdown = + match Agent.daily_logs_dir agent with + | None -> [] + | Some destination_root -> + [ + (fun () -> + Agent_kind.Logs.scp_logs + ~destination_root + ~daemon_name:name + agent); + ] + in + Cloud.service_register + ~name + ~executable + ~on_alive_callback + ~on_shutdown + agent ; Lwt.return node - let create ?net_port ?path ?name ?disable_shard_validation ?ignore_pkhs + let create ?net_port ?path ~name ?disable_shard_validation ?ignore_pkhs ~node agent = create_from_endpoint ?net_port ?path - ?name + ~name ?disable_shard_validation ?ignore_pkhs ~l1_node_endpoint:(Node.as_rpc_endpoint node)