restart on failure: add an optional set of integer exit codes to restart on

This commit is contained in:
Hannes Mehnert 2019-10-12 00:01:36 +02:00
parent 6be9ebbc8b
commit 0808c20583
8 changed files with 92 additions and 48 deletions

View file

@ -110,8 +110,8 @@ let info_ _ endp cert key ca name =
(* Issue a [`Unikernel_destroy] command for [name] through [jump], using the
   given endpoint and credentials. The first argument is ignored. *)
let destroy _ endp cert key ca name =
  let cmd = `Unikernel_cmd `Unikernel_destroy in
  jump endp cert key ca name cmd
let create _ endp cert key ca force name image cpuid memory argv block network compression restart_on_fail =
match Albatross_cli.create_vm force image cpuid memory argv block network compression restart_on_fail with
(* Build the unikernel creation command from the command-line values with
   [Albatross_cli.create_vm] and, when that succeeds, issue it for [name]
   through [jump]. A [`Msg] error from [create_vm] is returned as is.
   The first argument is ignored. *)
let create _ endp cert key ca force name image cpuid memory argv block network compression restart_on_fail exit_code =
  let vm_cmd =
    Albatross_cli.create_vm force image cpuid memory argv block network compression restart_on_fail exit_code
  in
  match vm_cmd with
  | Error (`Msg msg) -> Error (`Msg msg)
  | Ok cmd -> jump endp cert key ca name (`Unikernel_cmd cmd)
@ -208,7 +208,7 @@ let create_cmd =
[`S "DESCRIPTION";
`P "Creates a virtual machine."]
in
Term.(term_result (const create $ setup_log $ destination $ ca_cert $ ca_key $ server_ca $ force $ vm_name $ image $ cpu $ vm_mem $ args $ block $ net $ compress_level 9 $ restart_on_fail)),
Term.(term_result (const create $ setup_log $ destination $ ca_cert $ ca_key $ server_ca $ force $ vm_name $ image $ cpu $ vm_mem $ args $ block $ net $ compress_level 9 $ restart_on_fail $ exit_code)),
Term.info "create" ~doc ~man
let console_cmd =

View file

@ -61,8 +61,8 @@ let info_ _ opt_socket name =
(* Issue a [`Unikernel_destroy] command for [name] through [jump] on the
   given (optional) socket. The first argument is ignored. *)
let destroy _ opt_socket name =
  `Unikernel_cmd `Unikernel_destroy |> jump opt_socket name
let create _ opt_socket force name image cpuid memory argv block network compression restart_on_fail =
match Albatross_cli.create_vm force image cpuid memory argv block network compression restart_on_fail with
(* Turn the command-line values into a unikernel creation command via
   [Albatross_cli.create_vm]; on success issue it for [name] through [jump],
   otherwise return the [`Msg] error. The first argument is ignored. *)
let create _ opt_socket force name image cpuid memory argv block network compression restart_on_fail exit_code =
  Albatross_cli.create_vm force image cpuid memory argv block network compression restart_on_fail exit_code
  |> function
  | Ok cmd -> jump opt_socket name (`Unikernel_cmd cmd)
  | Error (`Msg msg) -> Error (`Msg msg)
@ -153,7 +153,7 @@ let create_cmd =
[`S "DESCRIPTION";
`P "Creates a virtual machine."]
in
Term.(term_result (const create $ setup_log $ socket $ force $ vm_name $ image $ cpu $ vm_mem $ args $ block $ net $ compress_level 0 $ restart_on_fail)),
Term.(term_result (const create $ setup_log $ socket $ force $ vm_name $ image $ cpu $ vm_mem $ args $ block $ net $ compress_level 0 $ restart_on_fail $ exit_code)),
Term.info "create" ~doc ~man
let console_cmd =

View file

@ -79,7 +79,7 @@ let setup_log style_renderer level =
Logs.set_level level;
Logs.set_reporter (Logs_fmt.reporter ~dst:Format.std_formatter ())
let create_vm force image cpuid memory argv block_devices bridges compression restart_on_fail =
let create_vm force image cpuid memory argv block_devices bridges compression restart_on_fail exit_codes =
let open Rresult.R.Infix in
Bos.OS.File.read (Fpath.v image) >>| fun image ->
let image, compressed = match compression with
@ -88,7 +88,9 @@ let create_vm force image cpuid memory argv block_devices bridges compression re
let img = Vmm_compress.compress ~level image in
Cstruct.of_string img, true
and argv = match argv with [] -> None | xs -> Some xs
and fail_behaviour = if restart_on_fail then `Restart else `Quit
and fail_behaviour =
let exits = match exit_codes with [] -> None | xs -> Some (IS.of_list xs) in
if restart_on_fail then `Restart exits else `Quit
in
let config = Unikernel.{ typ = `Solo5 ; compressed ; image ; fail_behaviour ; cpuid ; memory ; block_devices ; bridges ; argv } in
if force then `Unikernel_force_create config else `Unikernel_create config
@ -241,6 +243,10 @@ let restart_on_fail =
let doc = "Restart on fail" in
Arg.(value & flag & info [ "restart-on-fail" ] ~doc)
(* Command-line option [--exit-code] carrying an integer. It is declared with
   [opt_all], so every occurrence on the command line is collected into a
   list; the default is the empty list. *)
let exit_code =
  let doc = "Exit code to restart on" in
  let option_info = Arg.info [ "exit-code" ] ~doc in
  Arg.(value (opt_all int [] option_info))
let timestamp_c =
let parse s = match Ptime.of_rfc3339 s with
| Ok (t, _, _) -> `Ok t

View file

@ -37,39 +37,19 @@ let rec create stat_out log_out cons_out data_out hdr name config =
Lwt.return (None, fail_cont ())
| Ok (state', stat, log, data, name, vm) ->
state := state';
(match Unikernel.(vm.config.fail_behaviour) with
| `Quit -> ()
| `Restart ->
(if Unikernel.restart_handler config then
match Vmm_vmmd.register_restart !state name Lwt.task with
| None -> ()
| Some (state', task) ->
state := state';
Lwt.async (fun () ->
task >>= function
| (`Signal _ | `Stop _) as r ->
Logs.warn (fun m -> m "unikernel %a exited with signal %a"
Name.pp name pp_process_exit r);
Lwt.return_unit
| `Exit i ->
(* results:
normal exit (i.e. teardown) is 0
solo5-exit allows an arbitrary int
solo5-abort emits 255
solo5 internal error (bad image, bad manifest) is 1
ocaml exceptions (out of memory et al) use 2
-> soon (4.10) they'll abort == 255
signal 11 is if a kill -TERM was sent (i.e. our destroy)
--> best: user-provided list of which exit codes to restart on
(and filter 1 specially)
*)
match i with
| 1 -> Logs.warn (fun m -> m "solo5 exit failure"); Lwt.return_unit
| _ ->
Logs.info (fun m -> m "solo5 exited with %d, restarting" i);
task >>= fun r ->
if should_restart config name r then
Lwt_mutex.with_lock create_lock (fun () ->
create stat_out log_out cons_out stub_data_out
stub_hdr name vm.Unikernel.config)));
stub_hdr name vm.Unikernel.config)
else
Lwt.return_unit));
stat_out "setting up stat" stat >>= fun () ->
log_out "setting up log" log >|= fun () ->
(Some vm, data)) >>= fun (started, data) ->

View file

@ -40,8 +40,8 @@ let info_ _ name = jump name (`Unikernel_cmd `Unikernel_info)
(* Issue a [`Unikernel_destroy] command for [name] through [jump].
   The first argument is ignored. *)
let destroy _ name =
  let request = `Unikernel_cmd `Unikernel_destroy in
  jump name request
let create _ force name image cpuid memory argv block network compression restart_on_fail =
match Albatross_cli.create_vm force image cpuid memory argv block network compression restart_on_fail with
(* Build the unikernel creation command with [Albatross_cli.create_vm] and,
   if that succeeds, issue it for [name] through [jump]; a [`Msg] error is
   returned unchanged. The first argument is ignored. *)
let create _ force name image cpuid memory argv block network compression restart_on_fail exit_code =
  let submit cmd = jump name (`Unikernel_cmd cmd) in
  match Albatross_cli.create_vm force image cpuid memory argv block network compression restart_on_fail exit_code with
  | Error (`Msg msg) -> Error (`Msg msg)
  | Ok cmd -> submit cmd
@ -122,7 +122,7 @@ let create_cmd =
[`S "DESCRIPTION";
`P "Creates a virtual machine."]
in
Term.(term_result (const create $ setup_log $ force $ vm_name $ image $ cpu $ vm_mem $ args $ block $ net $ compress_level 9 $ restart_on_fail)),
Term.(term_result (const create $ setup_log $ force $ vm_name $ image $ cpu $ vm_mem $ args $ block $ net $ compress_level 9 $ restart_on_fail $ exit_code)),
Term.info "create" ~doc ~man
let console_cmd =

View file

@ -279,15 +279,25 @@ let typ =
(* ASN.1 codec for the unikernel fail behaviour: a two-way choice that is
   mapped to/from the [`Quit] / [`Restart] polymorphic variant by [f]
   (decode direction) and [g] (encode direction).
   In this diff view the superseded payload-less [`Restart] / [explicit 1 null]
   lines appear next to their replacements, which carry a set of integers. *)
let fail_behaviour =
let f = function
| `C1 () -> `Quit
| `C2 () -> `Restart
| `C2 xs ->
(* an empty list of exit codes is turned into [None], a non-empty one into
   an integer set *)
let exit_codes = match xs with
| [] -> None
| xs -> Some (IS.of_list xs)
in
`Restart exit_codes
and g = function
| `Quit -> `C1 ()
| `Restart -> `C2 ()
| `Restart xs ->
(* [None] becomes the empty list. Note that [Some] of an empty set also
   yields the empty list, which [f] above maps back to [None]. *)
let exit_codes = match xs with
| None -> []
| Some i -> IS.elements i
in
`C2 exit_codes
in
Asn.S.map f g @@
Asn.S.(choice2
(explicit 0 null)
(explicit 1 null))
(explicit 1 (set_of int)))
let unikernel_config =
let open Unikernel in
@ -309,8 +319,8 @@ let unikernel_config =
@ (required ~label:"fail behaviour" fail_behaviour)
@ (required ~label:"cpuid" int)
@ (required ~label:"memory" int)
@ (optional ~label:"blocks" (explicit 0 (sequence_of utf8_string)))
@ (optional ~label:"bridges" (explicit 1 (sequence_of utf8_string)))
@ (optional ~label:"blocks" (explicit 0 (set_of utf8_string)))
@ (optional ~label:"bridges" (explicit 1 (set_of utf8_string)))
-@ (optional ~label:"arguments"(explicit 2 (sequence_of utf8_string))))
let unikernel_cmd =

View file

@ -156,10 +156,14 @@ module Unikernel = struct
(* Pretty-print a unikernel type; the only case handled here is [`Solo5]. *)
let pp_typ ppf typ =
  match typ with
  | `Solo5 -> Fmt.pf ppf "solo5"
type fail_behaviour = [ `Quit | `Restart ]
(** What to do when a unikernel terminates: [`Quit], or [`Restart] with an
    optional set of integer exit codes ([None] when no codes were given). *)
type fail_behaviour = [ `Quit | `Restart of IS.t option ]
let pp_fail_behaviour ppf f =
Fmt.string ppf (match f with `Quit -> "quit" | `Restart -> "restart")
(** Pretty-printer for a fail behaviour.

    [`Quit] prints as ["quit"]. [`Restart (Some codes)] prints as
    ["restart "] followed by the comma-separated exit codes. [`Restart None]
    prints the default policy; its wording names both exclusions applied by
    [should_restart] (exit code 1 and the range 64..70), not just code 1. *)
let pp_fail_behaviour ppf = function
  | `Quit -> Fmt.string ppf "quit"
  | `Restart codes ->
    let pp_codes =
      Fmt.(option ~none:(unit "all except 1 and 64..70")
             (list ~sep:(unit ", ") int))
    in
    (* the printer works on a plain list, so flatten the set first *)
    let listed = match codes with None -> None | Some x -> Some (IS.elements x) in
    Fmt.pf ppf "restart %a" pp_codes listed
type config = {
typ : typ ;
@ -184,6 +188,9 @@ module Unikernel = struct
Fmt.(list ~sep:(unit ", ") string) vm.bridges
Fmt.(option ~none:(unit "no") (list ~sep:(unit " ") string)) vm.argv
(* [restart_handler config] is [true] exactly when the configured fail
   behaviour is some [`Restart _], and [false] for [`Quit]. *)
let restart_handler config =
  match config.fail_behaviour with
  | `Restart _ -> true
  | `Quit -> false
type t = {
config : config ;
cmd : Bos.Cmd.t ;
@ -289,6 +296,43 @@ let pp_process_exit ppf = function
| `Signal n -> Fmt.pf ppf "signal %a (numeric %d)" Fmt.Dump.signal n n
| `Stop n -> Fmt.pf ppf "stop %a (numeric %d)" Fmt.Dump.signal n n
(** [should_restart config name r] decides whether the unikernel [name],
    whose process ended with [r], should be started again. Every decision is
    logged.

    - [`Signal _] and [`Stop _]: never restart.
    - [`Exit 1] and [`Exit 64] .. [`Exit 70]: never restart, even if the code
      is listed explicitly in the configuration.
    - any other [`Exit i]: restart if the fail behaviour is [`Restart None],
      or [`Restart (Some codes)] with [i] in [codes].

    A [`Quit] fail behaviour always yields [false]. *)
let should_restart config name = function
  | (`Signal _ | `Stop _) as r ->
    (* signal 11 is if a kill -TERM was sent (i.e. our destroy)
       NOTE(review): presumably this refers to OCaml's own signal numbering,
       where Sys.sigterm is -11 -- confirm *)
    Logs.warn (fun m -> m "unikernel %a exited with signal %a"
                  Name.pp name pp_process_exit r);
    false
  | `Exit i ->
    (* results (and default behaviour) -- solo5-exit allows an arbitrary int
       0 normal exit (i.e. teardown) -> restart
       1 solo5 internal error (bad image, bad manifest) -> no restart, never
       2 ocaml exceptions (out of memory et al) -> restart
       64..70 -> no restart (soon to be used by unikernel command line parsing)
       255 solo5-abort -> soon (OCaml 4.10) fatal error (out of memory) -> restart *)
    let requested =
      match config.Unikernel.fail_behaviour with
      (* presumably callers only get here for [`Restart] configurations;
         answer "do not restart" instead of raising, so this exported
         function is total *)
      | `Quit -> false
      | `Restart None -> true
      | `Restart (Some codes) -> IS.mem i codes
    in
    match i with
    | 1 ->
      Logs.warn (fun m -> m "unikernel %a solo5 exit failure (1)"
                    Name.pp name);
      false
    | 64 | 65 | 66 | 67 | 68 | 69 | 70 ->
      Logs.warn (fun m -> m "unikernel %a exited %d, not restarting"
                    Name.pp name i);
      false
    | _ ->
      if requested then begin
        Logs.info (fun m -> m "unikernel %a exited %d, restarting"
                      Name.pp name i);
        true
      end else begin
        Logs.info (fun m -> m "unikernel %a exited %d, not restarting %a"
                      Name.pp name i
                      Unikernel.pp_fail_behaviour config.Unikernel.fail_behaviour);
        false
      end
module Log = struct
type log_event = [
| `Login of Name.t * Ipaddr.V4.t * int

View file

@ -58,7 +58,7 @@ module Unikernel : sig
(** The type of a unikernel; [`Solo5] is the only variant. *)
type typ = [ `Solo5 ]
(** Pretty-printer for {!typ}. *)
val pp_typ : typ Fmt.t
type fail_behaviour = [ `Quit | `Restart ]
(** What to do when a unikernel terminates: [`Quit], or [`Restart] with an
    optional set of integer exit codes ([None] when no codes were given). *)
type fail_behaviour = [ `Quit | `Restart of IS.t option ]
type config = {
typ : typ ;
@ -74,6 +74,8 @@ module Unikernel : sig
(** Pretty-printer for {!config}. *)
val pp_config : config Fmt.t
(** [restart_handler config] is [true] if the fail behaviour of [config] is
    [`Restart _], and [false] if it is [`Quit]. *)
val restart_handler : config -> bool
type t = {
config : config;
cmd : Bos.Cmd.t;
@ -153,6 +155,8 @@ type process_exit = [ `Exit of int | `Signal of int | `Stop of int ]
(** Pretty-printer for {!process_exit}. *)
val pp_process_exit : process_exit Fmt.t
(** [should_restart config name r] decides whether the unikernel [name],
    whose process ended with [r], should be restarted; the decision is
    logged. Signals and stops never lead to a restart, nor do exit codes 1
    and 64..70. Any other exit code leads to a restart if the fail behaviour
    of [config] is [`Restart None], or [`Restart (Some codes)] with the exit
    code contained in [codes]. Callers are expected to pass a [config] whose
    fail behaviour is [`Restart _]. *)
val should_restart : Unikernel.config -> Name.t -> process_exit -> bool
module Log : sig
type log_event = [
| `Login of Name.t * Ipaddr.V4.t * int