From 58c3490782f8f5e421f818d392ca446c44860115 Mon Sep 17 00:00:00 2001 From: Hannes Mehnert Date: Fri, 11 Oct 2019 01:10:33 +0200 Subject: [PATCH] restart on failure --- client/albatross_client_bistro.ml | 6 +- client/albatross_client_local.ml | 6 +- command-line/albatross_cli.ml | 9 +- daemon/albatrossd.ml | 248 +++++++++++++---------- packaging/MANIFEST | 13 +- provision/albatross_provision_request.ml | 6 +- src/vmm_asn.ml | 35 +++- src/vmm_core.ml | 15 +- src/vmm_core.mli | 3 + src/vmm_lwt.ml | 6 +- src/vmm_vmmd.ml | 86 ++++---- src/vmm_vmmd.mli | 10 +- 12 files changed, 262 insertions(+), 181 deletions(-) diff --git a/client/albatross_client_bistro.ml b/client/albatross_client_bistro.ml index dda33dd..9e86800 100644 --- a/client/albatross_client_bistro.ml +++ b/client/albatross_client_bistro.ml @@ -110,8 +110,8 @@ let info_ _ endp cert key ca name = let destroy _ endp cert key ca name = jump endp cert key ca name (`Unikernel_cmd `Unikernel_destroy) -let create _ endp cert key ca force name image cpuid memory argv block network compression = - match Albatross_cli.create_vm force image cpuid memory argv block network compression with +let create _ endp cert key ca force name image cpuid memory argv block network compression restart_on_fail = + match Albatross_cli.create_vm force image cpuid memory argv block network compression restart_on_fail with | Ok cmd -> jump endp cert key ca name (`Unikernel_cmd cmd) | Error (`Msg msg) -> Error (`Msg msg) @@ -208,7 +208,7 @@ let create_cmd = [`S "DESCRIPTION"; `P "Creates a virtual machine."] in - Term.(term_result (const create $ setup_log $ destination $ ca_cert $ ca_key $ server_ca $ force $ vm_name $ image $ cpu $ vm_mem $ args $ block $ net $ compress_level 9)), + Term.(term_result (const create $ setup_log $ destination $ ca_cert $ ca_key $ server_ca $ force $ vm_name $ image $ cpu $ vm_mem $ args $ block $ net $ compress_level 9 $ restart_on_fail)), Term.info "create" ~doc ~man let console_cmd = diff --git a/client/albatross_client_local.ml b/client/albatross_client_local.ml index 29218ea..2b98c79 100644 --- a/client/albatross_client_local.ml +++ b/client/albatross_client_local.ml @@ -61,8 +61,8 @@ let info_ _ opt_socket name = let destroy _ opt_socket name = jump opt_socket name (`Unikernel_cmd `Unikernel_destroy) -let create _ opt_socket force name image cpuid memory argv block network compression = - match Albatross_cli.create_vm force image cpuid memory argv block network compression with +let create _ opt_socket force name image cpuid memory argv block network compression restart_on_fail = + match Albatross_cli.create_vm force image cpuid memory argv block network compression restart_on_fail with | Ok cmd -> jump opt_socket name (`Unikernel_cmd cmd) | Error (`Msg msg) -> Error (`Msg msg) @@ -153,7 +153,7 @@ let create_cmd = [`S "DESCRIPTION"; `P "Creates a virtual machine."] in - Term.(term_result (const create $ setup_log $ socket $ force $ vm_name $ image $ cpu $ vm_mem $ args $ block $ net $ compress_level 0)), + Term.(term_result (const create $ setup_log $ socket $ force $ vm_name $ image $ cpu $ vm_mem $ args $ block $ net $ compress_level 0 $ restart_on_fail)), Term.info "create" ~doc ~man let console_cmd = diff --git a/command-line/albatross_cli.ml b/command-line/albatross_cli.ml index 1d40ee3..6b125d8 100644 --- a/command-line/albatross_cli.ml +++ b/command-line/albatross_cli.ml @@ -79,7 +79,7 @@ let setup_log style_renderer level = Logs.set_level level; Logs.set_reporter (Logs_fmt.reporter ~dst:Format.std_formatter ()) -let create_vm force image cpuid memory argv block_devices bridges compression = +let create_vm force image cpuid memory argv block_devices bridges compression restart_on_fail = let open Rresult.R.Infix in Bos.OS.File.read (Fpath.v image) >>| fun image -> let image = match compression with @@ -88,8 +88,9 @@ let create_vm force image cpuid memory argv block_devices bridges compression = let img = Vmm_compress.compress ~level image in `Hvt_amd64_compressed, Cstruct.of_string img and argv = match argv with [] -> None | xs -> Some xs + and fail_behaviour = if restart_on_fail then `Restart else `Quit in - let config = Unikernel.{ cpuid ; memory ; block_devices ; bridges ; argv ; image } in + let config = Unikernel.{ cpuid ; memory ; block_devices ; bridges ; argv ; image ; fail_behaviour } in if force then `Unikernel_force_create config else `Unikernel_create config let policy vms memory cpus block bridges = @@ -236,6 +237,10 @@ let net = let doc = "Network device names" in Arg.(value & opt_all string [] & info [ "net" ] ~doc) +let restart_on_fail = + let doc = "Restart on fail" in + Arg.(value & flag & info [ "restart-on-fail" ] ~doc) + let timestamp_c = let parse s = match Ptime.of_rfc3339 s with | Ok (t, _, _) -> `Ok t diff --git a/daemon/albatrossd.ml b/daemon/albatrossd.ml index 56f5c40..6a97dfd 100644 --- a/daemon/albatrossd.ml +++ b/daemon/albatrossd.ml @@ -10,37 +10,85 @@ let version = `AV3 let state = ref (Vmm_vmmd.init version) -let create stat_out log_out cons_out data_out cons succ_cont fail_cont = - cons_out "create" cons >>= function - | Error () -> - let data = fail_cont () in - data_out data - | Ok () -> match succ_cont !state with - | Error (`Msg msg) -> - Logs.err (fun m -> m "create (exec) failed %s" msg) ; - let data = fail_cont () in - data_out data - | Ok (state', stat, log, data, name, vm) -> - state := state' ; - Lwt.async (fun () -> - Vmm_lwt.wait_and_clear vm.Unikernel.pid >>= fun r -> - let state', stat', log' = Vmm_vmmd.handle_shutdown !state name vm r in - state := state' ; - stat_out "handle shutdown stat" stat' >>= fun () -> - log_out "handle shutdown log" log' >|= fun () -> - let state', waiter_opt = Vmm_vmmd.waiter !state name in - state := state' ; - (match waiter_opt with - | None -> () - | Some wakeme -> Lwt.wakeup wakeme ())) ; - stat_out "setting up stat" stat >>= fun () -> - log_out "setting up log" log >>= fun () -> - data_out data +let stub_hdr = Vmm_commands.{ version ; sequence = 0L ; name = Name.root } +let stub_data_out _ = Lwt.return_unit -let register who header = - match Vmm_vmmd.register !state who Lwt.task with - | None -> Error (header, `Failure "task already registered") - | Some (state', task) -> state := state' ; Ok task +(* + - handle_create only prepares the unikernel (fifo, image file) + -> IO console about fifo + - only the succ_cont later commits this (to resources) + --> there's a brief period + *) + +let create_lock = Lwt_mutex.create () + +let rec create stat_out log_out cons_out data_out hdr name config = + (match Vmm_vmmd.handle_create !state hdr name config with + | Error `Msg msg -> + Logs.err (fun m -> m "failed to create %a: %s" Name.pp name msg) ; + Lwt.return (None, (hdr, `Failure msg)) + | Ok (state', `Create (cons, succ_cont, fail_cont)) -> + state := state'; + cons_out "create" cons >>= function + | Error () -> Lwt.return (None, fail_cont ()) + | Ok () -> match succ_cont !state with + | Error (`Msg msg) -> + Logs.err (fun m -> m "create (exec) failed %s" msg) ; + Lwt.return (None, fail_cont ()) + | Ok (state', stat, log, data, name, vm) -> + state := state'; + (match Unikernel.(vm.config.fail_behaviour) with + | `Quit -> () + | `Restart -> + match Vmm_vmmd.register_restart !state name Lwt.task with + | None -> () + | Some (state', task) -> + state := state'; + Lwt.async (fun () -> + task >>= function + | (`Signal _ | `Stop _) as r -> + Logs.warn (fun m -> m "unikernel %a exited with signal %a" + Name.pp name pp_process_exit r); + Lwt.return_unit + | `Exit i -> + (* results: + normal exit (i.e. teardown) is 0 + solo5-exit allows an arbitrary int + solo5-abort emits 255 + solo5 internal error (bad image, bad manigest) is 1 + ocaml exceptions (out of memory et al) use 2 + -> soon (4.10) they'll abort == 255 + signal 11 is if a kill -TERM was sent (i.e. our destroy) + + --> best: user-provided list of which exit codes to restart on + (and filter 1 specially) + *) + match i with + | 1 -> Logs.warn (fun m -> m "solo5 exit failure"); Lwt.return_unit + | _ -> + Logs.info (fun m -> m "solo5 exited with %d, restarting" i); + Lwt_mutex.with_lock create_lock (fun () -> + create stat_out log_out cons_out stub_data_out + stub_hdr name vm.Unikernel.config))); + stat_out "setting up stat" stat >>= fun () -> + log_out "setting up log" log >|= fun () -> + (Some vm, data)) >>= fun (started, data) -> + (match started with + | None -> () + | Some vm -> + Lwt.async (fun () -> + Vmm_lwt.wait_and_clear vm.Unikernel.pid >>= fun r -> + Lwt_mutex.with_lock create_lock (fun () -> + let state', stat', log' = Vmm_vmmd.handle_shutdown !state name vm r in + state := state'; + stat_out "handle shutdown stat" stat' >>= fun () -> + log_out "handle shutdown log" log' >|= fun () -> + let state', waiter_opt = Vmm_vmmd.waiter !state name in + state := state'; + waiter_opt) >|= function + | None -> () + | Some wakeme -> Lwt.wakeup wakeme r)); + data_out data let handle log_out cons_out stat_out fd addr = Logs.debug (fun m -> m "connection from %a" Vmm_lwt.pp_sockaddr addr) ; @@ -69,76 +117,67 @@ let handle log_out cons_out stat_out fd addr = Lwt.return_unit | Ok wire -> Logs.debug (fun m -> m "read %a" Vmm_commands.pp_wire wire) ; + Lwt_mutex.lock create_lock >>= fun () -> match Vmm_vmmd.handle_command !state wire with - | Error wire -> out wire + | Error wire -> Lwt_mutex.unlock create_lock; out wire | Ok (state', next) -> state := state' ; match next with - | `Loop wire -> out wire >>= loop - | `End wire -> out wire - | `Create (cons, succ, fail) -> - create stat_out log_out cons_out out cons succ fail + | `Loop wire -> Lwt_mutex.unlock create_lock; out wire >>= loop + | `End wire -> Lwt_mutex.unlock create_lock; out wire + | `Create (hdr, id, vm) -> + create stat_out log_out cons_out out hdr id vm >|= fun () -> + Lwt_mutex.unlock create_lock | `Wait (who, data) -> - (match register who (fst wire) with - | Error data' -> out data' - | Ok task -> - task >>= fun () -> - out data) - | `Wait_and_create (who, next) -> - (match register who (fst wire) with - | Error data -> out data - | Ok task -> - task >>= fun () -> - match next !state with - | Error data -> out data - | Ok (state', `Create (cons, succ, fail)) -> - state := state' ; - create stat_out log_out cons_out out cons succ fail) + let state', task = Vmm_vmmd.register !state who Lwt.task in + state := state'; + Lwt_mutex.unlock create_lock; + task >>= fun r -> + out (data r) + | `Wait_and_create (who, (hdr, id, vm)) -> + let state', task = Vmm_vmmd.register !state who Lwt.task in + state := state'; + Lwt_mutex.unlock create_lock; + task >>= fun r -> + Logs.info (fun m -> m "wait returned %a" pp_process_exit r); + Lwt_mutex.with_lock create_lock (fun () -> + create stat_out log_out cons_out out hdr id vm) in loop () >>= fun () -> Vmm_lwt.safe_close fd -let connect_client_socket sock = - let name = socket_path sock in - Vmm_lwt.connect Lwt_unix.PF_UNIX (Lwt_unix.ADDR_UNIX name) >|= function - | None -> None - | Some x -> Some (x, Lwt_mutex.create ()) - -let write_reply name (fd, mut) txt (header, cmd) = - Logs.debug (fun m -> m "locking to write to %s" name) ; - Lwt_mutex.with_lock mut (fun () -> - Vmm_lwt.write_wire fd (header, cmd) >>= function - | Error `Exception -> invalid_arg ("exception during " ^ txt ^ " while writing to " ^ name) - | Ok () -> Vmm_lwt.read_wire fd) >|= fun r -> - Logs.debug (fun m -> m "unlocking, wrote and read %s" name) ; - match r with - | Ok (header', reply) -> - if not Vmm_commands.(version_eq header.version header'.version) then begin - Logs.err (fun m -> m "%s: wrong version (got %a, expected %a) in reply from %s" - txt - Vmm_commands.pp_version header'.Vmm_commands.version - Vmm_commands.pp_version header.Vmm_commands.version - name) ; - invalid_arg "bad version received" - end else if not Vmm_commands.(Int64.equal header.sequence header'.sequence) then begin - Logs.err (fun m -> m "%s: wrong id %Lu (expected %Lu) in reply from %s" - txt header'.Vmm_commands.sequence header.Vmm_commands.sequence name) ; - invalid_arg "wrong sequence number received" - end else begin - Logs.debug (fun m -> m "%s: received valid reply from %s %a" - txt name Vmm_commands.pp_wire (header', reply)) ; - match reply with - | `Success _ -> Ok () - | `Failure msg -> - Logs.err (fun m -> m "%s: received failure %s from %s" txt msg name) ; - Error () - | _ -> - Logs.err (fun m -> m "%s: unexpected data from %s" txt name) ; - invalid_arg "unexpected data" - end - | Error _ -> - Logs.err (fun m -> m "error in read from %s" name) ; - invalid_arg "communication failure" +let write_reply name fd txt (header, cmd) = + Vmm_lwt.write_wire fd (header, cmd) >>= function + | Error `Exception -> invalid_arg ("exception during " ^ txt ^ " while writing to " ^ name) + | Ok () -> + Vmm_lwt.read_wire fd >|= function + | Ok (header', reply) -> + if not Vmm_commands.(version_eq header.version header'.version) then begin + Logs.err (fun m -> m "%s: wrong version (got %a, expected %a) in reply from %s" + txt + Vmm_commands.pp_version header'.Vmm_commands.version + Vmm_commands.pp_version header.Vmm_commands.version + name) ; + invalid_arg "bad version received" + end else if not Vmm_commands.(Int64.equal header.sequence header'.sequence) then begin + Logs.err (fun m -> m "%s: wrong id %Lu (expected %Lu) in reply from %s" + txt header'.Vmm_commands.sequence header.Vmm_commands.sequence name) ; + invalid_arg "wrong sequence number received" + end else begin + Logs.debug (fun m -> m "%s: received valid reply from %s %a (request %a)" + txt name Vmm_commands.pp_wire (header', reply) Vmm_commands.pp_wire (header,cmd)) ; + match reply with + | `Success _ -> Ok () + | `Failure msg -> + Logs.err (fun m -> m "%s: received failure %s from %s" txt msg name) ; + Error () + | _ -> + Logs.err (fun m -> m "%s: unexpected data from %s" txt name) ; + invalid_arg "unexpected data" + end + | Error _ -> + Logs.err (fun m -> m "error in read from %s" name) ; + invalid_arg "communication failure" let m = conn_metrics "unix" @@ -148,9 +187,12 @@ let jump _ influx = | Error (`Msg msg) -> Logs.err (fun m -> m "bailing out: %s" msg) | Ok old_unikernels -> Lwt_main.run - (init_influx "albatross" influx; + (let unix_connect s = + Vmm_lwt.connect Lwt_unix.PF_UNIX (Lwt_unix.ADDR_UNIX (socket_path s)) + in + init_influx "albatross" influx; Vmm_lwt.server_socket `Vmmd >>= fun ss -> - (connect_client_socket `Log >|= function + (unix_connect `Log >|= function | None -> invalid_arg "cannot connect to log socket" | Some l -> l) >>= fun l -> let self_destruct_mutex = Lwt_mutex.create () in @@ -166,10 +208,10 @@ let jump _ influx = Vmm_lwt.safe_close ss) in Sys.(set_signal sigterm (Signal_handle (fun _ -> Lwt.async self_destruct))); - (connect_client_socket `Console >|= function + (unix_connect `Console >|= function | None -> invalid_arg "cannot connect to console socket" | Some c -> c) >>= fun c -> - connect_client_socket `Stats >>= fun s -> + unix_connect `Stats >>= fun s -> let log_out txt wire = write_reply "log" l txt wire >|= fun _ -> () and cons_out = write_reply "cons" c @@ -178,19 +220,9 @@ let jump _ influx = | Some s -> write_reply "stat" s txt wire >|= fun _ -> () in - let start_unikernel (name, config) = - let hdr = Vmm_commands.{ version ; sequence = 0L ; name = Name.root } - and data_out _ = Lwt.return_unit - in - match Vmm_vmmd.handle_create !state hdr name config with - | Error (`Msg msg) -> - Logs.err (fun m -> m "failed to restart %a: %s" Name.pp name msg) ; - Lwt.return_unit - | Ok (state', `Create (cons, succ, fail)) -> - state := state' ; - create stat_out log_out cons_out data_out cons succ fail - in - Lwt_list.iter_p start_unikernel (Vmm_trie.all old_unikernels) >>= fun () -> + Lwt_list.iter_p (fun (name, config) -> + create stat_out log_out cons_out stub_data_out stub_hdr name config) + (Vmm_trie.all old_unikernels) >>= fun () -> Lwt.catch (fun () -> let rec loop () = diff --git a/packaging/MANIFEST b/packaging/MANIFEST index 9ff5205..a62821b 100644 --- a/packaging/MANIFEST +++ b/packaging/MANIFEST @@ -73,13 +73,16 @@ messages [ add path 'vmm/solo5*' mode 0660 group albatross - * start TLS endpoint via inetd on port 1025, add to /etc/inetd.conf: -blackjack stream tcp nowait albatross /usr/local/libexec/albatross/alabtross_tls_inetd albatross_tls_inetd /usr/local/etc/albatross/cacert.pem /usr/local/etc/albatross/server.pem /usr/local/etc/albatross/server.key + * start TLS endpoint on port 1025 (blackjack), put cacert.pem, server.pem + and server.key into /usr/local/etc/albatross, add this to /etc/inetd.conf: - and add cacert.pem server.pem and server.key to /usr/local/etc/albatross + blackjack stream tcp nowait albatross \ + /usr/local/libexec/albatross/alabtross_tls_inetd albatross_tls_inetd \ + /usr/local/etc/albatross/cacert.pem \ + /usr/local/etc/albatross/server.pem \ + /usr/local/etc/albatross/server.key - * install solo5-hvt.net solo5-hvt.block solo5-hvt.block-net solo5-hvt.none - in /var/db/albatross + * install solo5-hvt in /var/db/albatross =================================================================== EOD; diff --git a/provision/albatross_provision_request.ml b/provision/albatross_provision_request.ml index 2c69ed4..2b2f33b 100644 --- a/provision/albatross_provision_request.ml +++ b/provision/albatross_provision_request.ml @@ -40,8 +40,8 @@ let info_ _ name = jump name (`Unikernel_cmd `Unikernel_info) let destroy _ name = jump name (`Unikernel_cmd `Unikernel_destroy) -let create _ force name image cpuid memory argv block network compression = - match Albatross_cli.create_vm force image cpuid memory argv block network compression with +let create _ force name image cpuid memory argv block network compression restart_on_fail = + match Albatross_cli.create_vm force image cpuid memory argv block network compression restart_on_fail with | Ok cmd -> jump name (`Unikernel_cmd cmd) | Error (`Msg msg) -> Error (`Msg msg) @@ -122,7 +122,7 @@ let create_cmd = [`S "DESCRIPTION"; `P "Creates a virtual machine."] in - Term.(term_result (const create $ setup_log $ force $ vm_name $ image $ cpu $ vm_mem $ args $ block $ net $ compress_level 9)), + Term.(term_result (const create $ setup_log $ force $ vm_name $ image $ cpu $ vm_mem $ args $ block $ net $ compress_level 9 $ restart_on_fail)), Term.info "create" ~doc ~man let console_cmd = diff --git a/src/vmm_asn.ml b/src/vmm_asn.ml index fd5f391..69f0aea 100644 --- a/src/vmm_asn.ml +++ b/src/vmm_asn.ml @@ -282,27 +282,40 @@ let log_cmd = Asn.S.map f g @@ Asn.S.(sequence (single (optional ~label:"since" utc_time))) +let fail_behaviour = + let f = function + | `C1 () -> `Quit + | `C2 () -> `Restart + and g = function + | `Quit -> `C1 () + | `Restart -> `C2 () + in + Asn.S.map f g @@ + Asn.S.(choice2 + (explicit 0 null) + (explicit 1 null)) + let unikernel_config = let open Unikernel in - let f (image, cpuid, memory, blocks, bridges, argv) = + let f (fail_behaviour, (image, (cpuid, (memory, (blocks, (bridges, argv)))))) = let bridges = match bridges with None -> [] | Some xs -> xs and block_devices = match blocks with None -> [] | Some xs -> xs in - { cpuid ; memory ; block_devices ; bridges ; image ; argv } + { cpuid ; memory ; block_devices ; bridges ; image ; argv ; fail_behaviour } and g vm = let bridges = match vm.bridges with [] -> None | xs -> Some xs and blocks = match vm.block_devices with [] -> None | xs -> Some xs in - (vm.image, vm.cpuid, vm.memory, blocks, bridges, vm.argv) + (vm.fail_behaviour, (vm.image, (vm.cpuid, (vm.memory, (blocks, (bridges, vm.argv)))))) in - Asn.S.map f g @@ - Asn.S.(sequence6 - (required ~label:"image" image) - (required ~label:"cpu" int) - (required ~label:"memory" int) - (optional ~label:"blocks" (explicit 0 (sequence_of utf8_string))) - (optional ~label:"bridges" (explicit 1 (sequence_of utf8_string))) - (optional ~label:"arguments"(explicit 2 (sequence_of utf8_string)))) + Asn.S.(map f g @@ sequence @@ + (required ~label:"fail behaviour" (explicit 3 fail_behaviour)) + @ (required ~label:"image" image) + @ (required ~label:"cpu" int) + @ (required ~label:"memory" int) + @ (optional ~label:"blocks" (explicit 0 (sequence_of utf8_string))) + @ (optional ~label:"bridges" (explicit 1 (sequence_of utf8_string))) + -@ (optional ~label:"arguments"(explicit 2 (sequence_of utf8_string)))) let unikernel_cmd = let f = function diff --git a/src/vmm_core.ml b/src/vmm_core.ml index fe8c0d8..e3a8911 100644 --- a/src/vmm_core.ml +++ b/src/vmm_core.ml @@ -158,6 +158,11 @@ module Unikernel = struct | `Hvt_amd64_compressed -> Fmt.pf ppf "hvt-amd64-compressed" | `Hvt_arm64 -> Fmt.pf ppf "hvt-arm64" + type fail_behaviour = [ `Quit | `Restart ] + + let pp_fail_behaviour ppf f = + Fmt.string ppf (match f with `Quit -> "quit" | `Restart -> "restart") + type config = { cpuid : int ; memory : int ; @@ -165,6 +170,7 @@ module Unikernel = struct bridges : string list ; image : typ * Cstruct.t ; argv : string list option ; + fail_behaviour : fail_behaviour; } let pp_image ppf (typ, blob) = @@ -172,7 +178,8 @@ module Unikernel = struct Fmt.pf ppf "%a: %d bytes" pp_typ typ l let pp_config ppf (vm : config) = - Fmt.pf ppf "cpu %d, %d MB memory, block devices %a@ bridge %a, image %a, argv %a" + Fmt.pf ppf "fail behaviour %a, cpu %d, %d MB memory, block devices %a@ bridge %a, image %a, argv %a" + pp_fail_behaviour vm.fail_behaviour vm.cpuid vm.memory Fmt.(list ~sep:(unit ", ") string) vm.block_devices Fmt.(list ~sep:(unit ", ") string) vm.bridges @@ -280,9 +287,9 @@ end type process_exit = [ `Exit of int | `Signal of int | `Stop of int ] let pp_process_exit ppf = function - | `Exit n -> Fmt.pf ppf "exit %a (%d)" Fmt.Dump.signal n n - | `Signal n -> Fmt.pf ppf "signal %a (%d)" Fmt.Dump.signal n n - | `Stop n -> Fmt.pf ppf "stop %a (%d)" Fmt.Dump.signal n n + | `Exit n -> Fmt.pf ppf "exit %d" n + | `Signal n -> Fmt.pf ppf "signal %a (numeric %d)" Fmt.Dump.signal n n + | `Stop n -> Fmt.pf ppf "stop %a (numeric %d)" Fmt.Dump.signal n n module Log = struct type log_event = [ diff --git a/src/vmm_core.mli b/src/vmm_core.mli index fe14bd8..37049f5 100644 --- a/src/vmm_core.mli +++ b/src/vmm_core.mli @@ -58,6 +58,8 @@ module Unikernel : sig type typ = [ `Hvt_amd64 | `Hvt_amd64_compressed | `Hvt_arm64 ] val pp_typ : typ Fmt.t + type fail_behaviour = [ `Quit | `Restart ] + type config = { cpuid : int; memory : int; @@ -65,6 +67,7 @@ module Unikernel : sig bridges : string list; image : typ * Cstruct.t; argv : string list option; + fail_behaviour : fail_behaviour; } val pp_image : (typ * Cstruct.t) Fmt.t diff --git a/src/vmm_lwt.ml b/src/vmm_lwt.ml index 049db75..a091dd1 100644 --- a/src/vmm_lwt.ml +++ b/src/vmm_lwt.ml @@ -88,7 +88,8 @@ let read_wire s = (fun e -> let err = Printexc.to_string e in Logs.err (fun m -> m "exception %s while reading" err) ; - Lwt.return (Error `Exception)) + safe_close s >|= fun () -> + Error `Exception) in r buf 0 4 >>= function | Error e -> Lwt.return (Error e) @@ -121,7 +122,8 @@ let write_raw s buf = w (off + n) (l - n)) (fun e -> Logs.err (fun m -> m "exception %s while writing" (Printexc.to_string e)) ; - Lwt.return (Error `Exception)) + safe_close s >|= fun () -> + Error `Exception) in (* Logs.debug (fun m -> m "writing %a" Cstruct.hexdump_pp (Cstruct.of_bytes buf)) ; *) w 0 (Bytes.length buf) diff --git a/src/vmm_vmmd.ml b/src/vmm_vmmd.ml index 74a7c19..1c2ae28 100644 --- a/src/vmm_vmmd.ml +++ b/src/vmm_vmmd.ml @@ -23,8 +23,30 @@ let killall t = | [] -> false | vms -> in_shutdown := true ; List.iter Vmm_unix.destroy vms ; true +let remove_resources t name = + let resources = match Vmm_resources.remove_vm t.resources name with + | Error (`Msg e) -> + Logs.warn (fun m -> m "%s while removing vm %a from resources" e Name.pp name) ; + t.resources + | Ok resources -> resources + in + { t with resources } + +let dump_unikernels t = + let unikernels = Vmm_trie.all t.resources.Vmm_resources.unikernels in + let trie = List.fold_left (fun t (name, unik) -> + fst @@ Vmm_trie.insert name unik.Unikernel.config t) + Vmm_trie.empty unikernels + in + let data = Vmm_asn.unikernels_to_cstruct trie in + match Vmm_unix.dump data with + | Error (`Msg msg) -> Logs.err (fun m -> m "failed to dump unikernels: %s" msg) + | Ok () -> Logs.info (fun m -> m "dumped current state") + let waiter t id = + let t = remove_resources t id in let name = Name.to_string id in + if not !in_shutdown then dump_unikernels t ; match String.Map.find name t.waiters with | None -> t, None | Some waiter -> @@ -32,12 +54,15 @@ let waiter t id = { t with waiters }, Some waiter let register t id create = + let name = Name.to_string id in + let task, waiter = create () in + { t with waiters = String.Map.add name waiter t.waiters }, task + +let register_restart t id create = let name = Name.to_string id in match String.Map.find name t.waiters with - | None -> - let task, waiter = create () in - Some ({ t with waiters = String.Map.add name waiter t.waiters }, task) - | Some _ -> None + | Some _ -> Logs.err (fun m -> m "restart attempted to overwrite waiter"); None + | _ -> Some (register t id create) let init wire_version = let t = { @@ -89,17 +114,6 @@ let restore_unikernels () = Logs.info (fun m -> m "restored %d unikernels" (List.length (Vmm_trie.all unikernels))) ; Ok unikernels -let dump_unikernels t = - let unikernels = Vmm_trie.all t.resources.Vmm_resources.unikernels in - let trie = List.fold_left (fun t (name, unik) -> - fst @@ Vmm_trie.insert name unik.Unikernel.config t) - Vmm_trie.empty unikernels - in - let data = Vmm_asn.unikernels_to_cstruct trie in - match Vmm_unix.dump data with - | Error (`Msg msg) -> Logs.err (fun m -> m "failed to dump unikernels: %s" msg) - | Ok () -> Logs.info (fun m -> m "dumped current state") - let setup_stats t name vm = let stat_out = let name = match Vmm_unix.vm_device vm with @@ -171,14 +185,6 @@ let handle_shutdown t name vm r = (match Vmm_unix.free_system_resources name vm.Unikernel.taps with | Ok () -> () | Error (`Msg e) -> Logs.warn (fun m -> m "%s while shutdown vm %a" e Unikernel.pp vm)) ; - let resources = match Vmm_resources.remove_vm t.resources name with - | Error (`Msg e) -> - Logs.warn (fun m -> m "%s while removing vm %a from resources" e Unikernel.pp vm) ; - t.resources - | Ok resources -> resources - in - let t = { t with resources } in - if not !in_shutdown then dump_unikernels t ; let t, log_out = log t name (`Unikernel_stop (name, vm.Unikernel.pid, r)) in let t, stat_out = remove_stats t name in (t, stat_out, log_out) @@ -213,7 +219,7 @@ let handle_policy_cmd t reply id = function | _ -> Ok (t, `End (reply (`Policies policies))) -let handle_unikernel_cmd t reply header id msg_to_err = function +let handle_unikernel_cmd t reply header id = function | `Unikernel_info -> Logs.debug (fun m -> m "info %a" Name.pp id) ; let vms = @@ -228,29 +234,37 @@ let handle_unikernel_cmd t reply header id msg_to_err = function | _ -> Ok (t, `End (reply (`Unikernels vms))) end - | `Unikernel_create vm_config -> handle_create t header id vm_config + | `Unikernel_create vm_config -> Ok (t, `Create (header, id, vm_config)) | `Unikernel_force_create vm_config -> begin let resources = match Vmm_resources.remove_vm t.resources id with - | Error _ -> t.resources - | Ok r -> r + | Error _ -> t.resources | Ok r -> r in Vmm_resources.check_vm resources id vm_config >>= fun () -> match Vmm_resources.find_vm t.resources id with - | None -> handle_create t header id vm_config + | None -> Ok (t, `Create (header, id, vm_config)) | Some vm -> - Vmm_unix.destroy vm ; - Ok (t, `Wait_and_create - (id, fun t -> msg_to_err @@ handle_create t header id vm_config)) + (match Vmm_unix.destroy vm with + | exception Unix.Unix_error _ -> () + | () -> ()); + Ok (t, `Wait_and_create (id, (header, id, vm_config))) end | `Unikernel_destroy -> match Vmm_resources.find_vm t.resources id with - | Some vm -> - Vmm_unix.destroy vm ; - let s = reply (`String "destroyed unikernel") in - Ok (t, `Wait (id, s)) | None -> Error (`Msg "destroy: not found") + | Some vm -> + let answer = + try + Vmm_unix.destroy vm ; "destroyed unikernel" + with + Unix.Unix_error _ -> "kill failed" + in + let s ex = + let data = Fmt.strf "%a %s %a" Name.pp id answer pp_process_exit ex in + reply (`String data) + in + Ok (t, `Wait (id, s)) let handle_block_cmd t reply id = function | `Block_remove -> @@ -300,7 +314,7 @@ let handle_command t (header, payload) = msg_to_err ( match payload with | `Command (`Policy_cmd pc) -> handle_policy_cmd t reply id pc - | `Command (`Unikernel_cmd vc) -> handle_unikernel_cmd t reply header id msg_to_err vc + | `Command (`Unikernel_cmd vc) -> handle_unikernel_cmd t reply header id vc | `Command (`Block_cmd bc) -> handle_block_cmd t reply id bc | _ -> Logs.err (fun m -> m "ignoring %a" Vmm_commands.pp_wire (header, payload)) ; diff --git a/src/vmm_vmmd.mli b/src/vmm_vmmd.mli index cdfdcc5..f63b5d3 100644 --- a/src/vmm_vmmd.mli +++ b/src/vmm_vmmd.mli @@ -8,7 +8,9 @@ val init : Vmm_commands.version -> 'a t val waiter : 'a t -> Name.t -> 'a t * 'a option -val register : 'a t -> Name.t -> (unit -> 'b * 'a) -> ('a t * 'b) option +val register : 'a t -> Name.t -> (unit -> 'b * 'a) -> ('a t * 'b) + +val register_restart : 'a t -> Name.t -> (unit -> 'b * 'a) -> ('a t * 'b) option type 'a create = Vmm_commands.wire * @@ -24,11 +26,11 @@ val handle_create : 'a t -> Vmm_commands.header -> val handle_command : 'a t -> Vmm_commands.wire -> ('a t * - [ `Create of 'a create + [ `Create of Vmm_commands.header * Name.t * Unikernel.config | `Loop of Vmm_commands.wire | `End of Vmm_commands.wire - | `Wait of Name.t * Vmm_commands.wire - | `Wait_and_create of Name.t * ('a t -> ('a t * [ `Create of 'a create ], Vmm_commands.wire) result) ], + | `Wait of Name.t * (process_exit -> Vmm_commands.wire) + | `Wait_and_create of Name.t * (Vmm_commands.header * Name.t * Unikernel.config) ], Vmm_commands.wire) result val killall : 'a t -> bool