albatross/daemon/albatrossd.ml

225 lines
8.9 KiB
OCaml
Raw Permalink Normal View History

2017-05-26 14:30:34 +00:00
(* (c) 2017 Hannes Mehnert, all rights reserved *)
2019-03-27 23:11:43 +00:00
open Albatross_cli
2018-11-13 00:02:05 +00:00
open Vmm_core
2017-05-26 14:30:34 +00:00
open Lwt.Infix
(* Global mutable daemon state, initialised from [Vmm_vmmd.init ()]. It is
   read and replaced (via [state := ...]) by [create], [handle] and the
   shutdown path in [jump]. *)
let state = ref (Vmm_vmmd.init ())
2019-10-10 23:10:33 +00:00
(* A [data_out] callback that ignores its argument and returns immediately.
   It is passed to [create] where no client connection is waiting for the
   reply (restarting a unikernel, restoring old unikernels at startup). *)
let stub_data_out _ = Lwt.return_unit
(* Mutex serialising [create] and the state-changing command handling. *)
let create_lock = Lwt_mutex.create ()
2019-10-12 00:30:21 +00:00
(* The global lock, held during execution of [create] -- and also while
   Vmm_vmmd.handle_command is being called, and while communicating over
   the log / console / stat sockets. *)
2019-10-10 23:10:33 +00:00
(** [create stat_out log_out cons_out data_out name config] attempts to start
    the unikernel [name] with [config].

    The steps are: [Vmm_vmmd.handle_create], a message to the console daemon
    via [cons_out], then the success continuation returned by
    [handle_create]. On success the stat and log daemons are informed, a
    background task is installed that waits for the process to exit, and
    (when [Unikernel.restart_handler config] holds) a restart task is
    registered. In every case the resulting reply is handed to [data_out].

    Every call site visible in this file runs [create] with [create_lock]
    held. *)
let rec create stat_out log_out cons_out data_out name config =
  (* Outcome of an attempt that did not start a process: no vm, only the
     reply to send back. *)
  let not_started reply = Lwt.return (None, reply) in
  (* If the restart handler is enabled for [config], register a restart task
     for [name]; once that task resolves and [should_restart] agrees, the
     unikernel is created again (under [create_lock], without a client to
     reply to). *)
  let arm_restart name vm =
    if Unikernel.restart_handler config then begin
      match Vmm_vmmd.register_restart !state name Lwt.task with
      | None -> ()
      | Some (state', task) ->
        state := state';
        Lwt.async (fun () ->
            task >>= fun r ->
            if should_restart config name r then
              Lwt_mutex.with_lock create_lock (fun () ->
                  create stat_out log_out cons_out stub_data_out
                    name vm.Unikernel.config)
            else
              Lwt.return_unit)
    end
  in
  (* Run the creation steps; yields [(Some vm, reply)] when a process was
     started and [(None, reply)] otherwise. *)
  let launch () =
    match Vmm_vmmd.handle_create !state name config with
    | Error (`Msg msg) ->
      Logs.err (fun m -> m "failed to create %a: %s" Name.pp name msg) ;
      not_started (`Failure msg)
    | Ok (state', (cons, succ_cont, fail_cont)) ->
      state := state';
      cons_out "create" cons >>= fun cons_result ->
      begin match cons_result with
        | Error () -> not_started (fail_cont ())
        | Ok () ->
          begin match succ_cont !state with
            | Error (`Msg msg) ->
              Logs.err (fun m -> m "create (exec) failed %s" msg) ;
              not_started (fail_cont ())
            | Ok (state', stat, log, data, name, vm) ->
              (* [name] is rebound here to the name returned by the
                 continuation; it is the one used for the restart task. *)
              state := state';
              arm_restart name vm;
              stat_out "setting up stat" stat >>= fun () ->
              log_out "setting up log" log >|= fun () ->
              (Some vm, data)
          end
      end
  in
  (* Wait in the background for [vm] to exit, then (under [create_lock])
     record the shutdown, inform stat and log, and wake a registered waiter
     with the exit status. *)
  let watch_exit vm =
    Lwt.async (fun () ->
        Vmm_lwt.wait_and_clear vm.Unikernel.pid >>= fun r ->
        Lwt_mutex.with_lock create_lock (fun () ->
            let state', stat', log' =
              Vmm_vmmd.handle_shutdown !state name vm r
            in
            state := state';
            stat_out "handle shutdown stat" stat' >>= fun () ->
            log_out "handle shutdown log" log' >|= fun () ->
            let state', waiter_opt = Vmm_vmmd.waiter !state name in
            state := state';
            waiter_opt) >|= fun waiter ->
        match waiter with
        | None -> ()
        | Some wakeme -> Lwt.wakeup wakeme r)
  in
  launch () >>= fun (started, data) ->
  (match started with
   | None -> ()
   | Some vm -> watch_exit vm);
  data_out data
(** [handle log_out cons_out stat_out fd addr] serves one client connection
    on [fd] ([addr] is only used for logging).

    Each request read from [fd] is passed to [Vmm_vmmd.handle_command] while
    [create_lock] is held. Depending on the result the reply is written back
    and the loop continues ([`Loop]) or ends ([`End]), a unikernel is created
    ([`Create]), or the handler first waits for a registered task to resolve
    ([`Wait], [`Wait_and_create]). A read error ends the loop.

    [fd] is closed when the loop finishes, also when it fails with an
    exception. *)
let handle log_out cons_out stat_out fd addr =
  Logs.debug (fun m -> m "connection from %a" Vmm_lwt.pp_sockaddr addr) ;
  let rec loop () =
    Logs.debug (fun m -> m "now reading") ;
    Vmm_lwt.read_wire fd >>= function
    | Error _ ->
      Logs.err (fun m -> m "error while reading") ;
      Lwt.return_unit
    | Ok (hdr, wire) ->
      (* Replies reuse the header of the request they answer. *)
      let out wire' =
        (* TODO should we terminate the connection on write failure? *)
        Vmm_lwt.write_wire fd (hdr, wire') >|= fun _ -> ()
      in
      Logs.debug (fun m -> m "read %a" Vmm_commands.pp_wire (hdr, wire));
      Lwt_mutex.lock create_lock >>= fun () ->
      match Vmm_vmmd.handle_command !state (hdr, wire) with
      | Error wire' -> Lwt_mutex.unlock create_lock; out wire'
      | Ok (state', next) ->
        state := state' ;
        match next with
        | `Loop wire -> Lwt_mutex.unlock create_lock; out wire >>= loop
        | `End wire -> Lwt_mutex.unlock create_lock; out wire
        | `Create (id, vm) ->
          (* The lock stays held for the whole of [create]. Release it with
             [Lwt.finalize] so that a failing [create] (e.g. an exception
             from one of the *_out callbacks) cannot leave it locked. *)
          Lwt.finalize
            (fun () -> create stat_out log_out cons_out out id vm)
            (fun () -> Lwt_mutex.unlock create_lock; Lwt.return_unit)
        | `Wait (who, data) ->
          let state', task = Vmm_vmmd.register !state who Lwt.task in
          state := state';
          (* Do not hold the lock while waiting for the task. *)
          Lwt_mutex.unlock create_lock;
          task >>= fun r ->
          out (data r)
        | `Wait_and_create (who, (id, vm)) ->
          let state', task = Vmm_vmmd.register !state who Lwt.task in
          state := state';
          (* Do not hold the lock while waiting; re-take it for [create]. *)
          Lwt_mutex.unlock create_lock;
          task >>= fun r ->
          Logs.info (fun m -> m "wait returned %a" pp_process_exit r);
          Lwt_mutex.with_lock create_lock (fun () ->
              create stat_out log_out cons_out out id vm)
  in
  (* Close the connection whether the loop ended normally or with an
     exception, so the file descriptor is not leaked. *)
  Lwt.finalize loop (fun () -> Vmm_lwt.safe_close fd)
2017-05-26 14:30:34 +00:00
(** [write_reply name fd txt (hdr, cmd)] writes the request [(hdr, cmd)] to
    [fd] and reads one reply from it. [name] and [txt] are only used in log
    and error messages.

    The result is [Ok ()] for a [`Success] reply and [Error ()] for a
    [`Failure] reply.

    @raise Invalid_argument (from within the Lwt callbacks) if writing
    fails, reading fails, the reply carries a different sequence number than
    the request, or the reply is neither [`Success] nor [`Failure]. *)
let write_reply name fd txt (hdr, cmd) =
  (* Validate a received reply against the request that was sent. *)
  let check_reply (hdr', reply) =
    let expected = hdr.Vmm_commands.sequence
    and received = hdr'.Vmm_commands.sequence in
    if Int64.equal expected received then begin
      Logs.debug (fun m -> m "%s: received valid reply from %s %a (request %a)"
                     txt name Vmm_commands.pp_wire (hdr', reply) Vmm_commands.pp_wire (hdr, cmd)) ;
      match reply with
      | `Success _ -> Ok ()
      | `Failure msg ->
        Logs.err (fun m -> m "%s: received failure %s from %s" txt msg name) ;
        Error ()
      | _ ->
        Logs.err (fun m -> m "%s: unexpected data from %s" txt name) ;
        invalid_arg "unexpected data"
    end else begin
      Logs.err (fun m -> m "%s: wrong id %Lu (expected %Lu) in reply from %s"
                   txt received expected name) ;
      invalid_arg "wrong sequence number received"
    end
  in
  Vmm_lwt.write_wire fd (hdr, cmd) >>= fun written ->
  match written with
  | Error `Exception ->
    invalid_arg ("exception during " ^ txt ^ " while writing to " ^ name)
  | Ok () ->
    Vmm_lwt.read_wire fd >|= fun answer ->
    match answer with
    | Error _ ->
      Logs.err (fun m -> m "error in read from %s" name) ;
      invalid_arg "communication failure"
    | Ok received -> check_reply received
2019-10-10 20:26:36 +00:00
(* Connection metrics reporter labelled "unix"; [jump] calls it with [`Open]
   when a client connection is accepted and [`Close] when its handler
   finishes. *)
let m = conn_metrics "unix"
(** [jump _ systemd influx tmpdir dbdir retries enable_stats] is the daemon
    main function used by the command-line term.

    It ignores SIGPIPE, sets the temporary and database directories, checks
    the required commands and restores the persisted unikernels; if the
    latter fails it logs "bailing out" and returns. Otherwise it runs the Lwt
    main loop, which:
    - connects to the log and console sockets, and to the stats socket when
      [enable_stats] is set;
    - creates the [`Vmmd] server socket ([systemd] is passed on to
      [Vmm_lwt.server_socket]);
    - installs a SIGTERM handler that runs [self_destruct];
    - re-creates the restored unikernels one after the other;
    - accepts client connections and serves each with [handle].

    [retries] bounds the connection attempts per socket: another attempt is
    made (after 5 seconds) as long as the counter is non-zero, and it is
    decremented per attempt, so a negative value never reaches zero.

    @raise Invalid_argument if the command check fails or the server socket
    cannot be created. *)
let jump _ systemd influx tmpdir dbdir retries enable_stats =
  Sys.(set_signal sigpipe Signal_ignore);
  Albatross_cli.set_tmpdir tmpdir;
  Albatross_cli.set_dbdir dbdir;
  Rresult.R.error_msg_to_invalid_arg
    (Vmm_unix.check_commands ());
  match Vmm_vmmd.restore_unikernels () with
  | Error (`Msg msg) -> Logs.err (fun m -> m "bailing out: %s" msg)
  | Ok old_unikernels ->
    Lwt_main.run
      (let rec unix_connect ~retries s =
         let path = socket_path s in
         Vmm_lwt.connect Lwt_unix.PF_UNIX (Lwt_unix.ADDR_UNIX path) >>= function
         | Some x -> Lwt.return x
         | None when (retries <> 0) ->
           Logs.err (fun m -> m "unable to connect to %a, retrying in 5 seconds"
                        pp_socket s);
           Lwt_unix.sleep 5.0 >>= fun () ->
           unix_connect ~retries:(retries - 1) s
         | None -> Lwt.fail_with (Fmt.strf "cannot connect to %a" pp_socket s)
       in
       init_influx "albatross" influx;
       unix_connect ~retries `Log >>= fun l ->
       unix_connect ~retries `Console >>= fun c ->
       (if enable_stats then
          unix_connect ~retries `Stats >|= fun s ->
          Some s
        else
          Lwt.return_none) >>= fun s ->
       Lwt.catch
         (fun () -> Vmm_lwt.server_socket ~systemd `Vmmd)
         (fun e ->
            let str =
              Fmt.strf "unable to create server socket %a: %s"
                pp_socket `Vmmd (Printexc.to_string e)
            in
            invalid_arg str) >>= fun ss ->
       (* [self_destruct] may be triggered both by SIGTERM and by a failure
          of the accept loop; the mutex keeps the two runs from overlapping. *)
       let self_destruct_mutex = Lwt_mutex.create () in
       let self_destruct () =
         Lwt_mutex.with_lock self_destruct_mutex (fun () ->
             (if Vmm_vmmd.killall !state then
                (* not too happy about the sleep here, but cleaning up resources
                   is really important (fifos, vm images, tap devices) - which
                   is done asynchronously (in the task waitpid() on the pid) *)
                Lwt_unix.sleep 1.
              else
                Lwt.return_unit) >>= fun () ->
             Vmm_lwt.safe_close ss)
       in
       (* Install the SIGTERM handler through Lwt instead of
          [Sys.set_signal]: a plain OCaml signal handler can run while Lwt is
          in the middle of an operation, whereas [Lwt_unix.on_signal] invokes
          the callback from the main loop, where starting [self_destruct] is
          safe. The handler stays installed for the lifetime of the process,
          so its identifier is not kept. *)
       ignore
         (Lwt_unix.on_signal Sys.sigterm (fun _ -> Lwt.async self_destruct)
          : Lwt_unix.signal_handler_id);
       (* Results of log and stat replies are ignored; the console reply is
          passed on as is. *)
       let log_out txt wire = write_reply "log" l txt wire >|= fun _ -> ()
       and cons_out = write_reply "cons" c
       and stat_out txt wire = match s with
         | None ->
           Logs.info (fun m -> m "ignoring stat %s %a" txt
                         Vmm_commands.pp_wire wire);
           Lwt.return_unit
         | Some s -> write_reply "stat" s txt wire >|= fun _ -> ()
       in
       Lwt_list.iter_s (fun (name, config) ->
           Lwt_mutex.with_lock create_lock (fun () ->
               create stat_out log_out cons_out stub_data_out name config))
         (Vmm_trie.all old_unikernels) >>= fun () ->
       Lwt.catch (fun () ->
           let rec loop () =
             Lwt_unix.accept ss >>= fun (fd, addr) ->
             Lwt_unix.set_close_on_exec fd ;
             m `Open;
             (* Serve each client concurrently with the accept loop. *)
             Lwt.async (fun () ->
                 handle log_out cons_out stat_out fd addr >|= fun () ->
                 m `Close) ;
             loop ()
           in
           loop ())
         (fun e ->
            Logs.err (fun m -> m "exception %s, shutting down"
                         (Printexc.to_string e));
            self_destruct ()))
2017-05-26 14:30:34 +00:00
open Cmdliner
(* The command-line definition of the daemon: the term applying [jump] to
   its arguments, paired with the program's name and version. *)
let cmd =
  let term =
    Term.(const jump
          $ setup_log
          $ systemd_socket_activation
          $ influx
          $ tmpdir
          $ dbdir
          $ retry_connections
          $ enable_stats)
  in
  let info = Term.info "albatrossd" ~version:Albatross_cli.version in
  term, info
2017-05-26 14:30:34 +00:00
(* Program entry point: evaluate the command line and exit with a status
   reflecting the outcome. A help or version request is a successful run, so
   it exits with 0 like a normal [`Ok ()]; only evaluation errors exit with
   1. All cases are listed explicitly so that the compiler checks the match
   for exhaustiveness. *)
let () =
  match Term.eval cmd with
  | `Ok () | `Help | `Version -> exit 0
  | `Error _ -> exit 1