vmm_influxdb: improve connection handling (next attempt to not leak fds and reconnect on demand)

This commit is contained in:
Hannes Mehnert 2018-07-07 12:38:29 +02:00
parent 43ee0cf4e0
commit 9e4cb94884
2 changed files with 131 additions and 96 deletions

View file

@ -6,4 +6,5 @@ S provision
B _build/** B _build/**
PKG topkg logs ipaddr x509 tls rresult bos lwt cmdliner hex cstruct.ppx duration PKG topkg logs ipaddr x509 tls rresult bos lwt cmdliner hex cstruct.ppx duration
PKG ptime ptime.clock.os ipaddr.unix decompress PKG ptime ptime.clock.os ipaddr.unix decompress
PKG lwt.unix

View file

@ -160,94 +160,114 @@ let safe_close s =
Logs.err (fun m -> m "exception %s while closing" (Printexc.to_string e)) ; Logs.err (fun m -> m "exception %s while closing" (Printexc.to_string e)) ;
Lwt.return_unit) Lwt.return_unit)
let rec read_sock_write_tcp db c ?fd addr addrtype = let rec read_sock_write_tcp closing db c ?fd addr addrtype =
match fd with match fd with
| None -> | None ->
Logs.debug (fun m -> m "new connection to TCP") ; if !closing then
let fd = Lwt_unix.socket addrtype Lwt_unix.SOCK_STREAM 0 in Lwt.return_unit
Lwt_unix.setsockopt fd Lwt_unix.SO_KEEPALIVE true ; else begin
Lwt.catch Logs.debug (fun m -> m "new connection to TCP") ;
(fun () -> let fd = Lwt_unix.socket addrtype Lwt_unix.SOCK_STREAM 0 in
Lwt_unix.connect fd addr >|= fun () -> Lwt_unix.setsockopt fd Lwt_unix.SO_KEEPALIVE true ;
Logs.debug (fun m -> m "connected to TCP")) Lwt.catch
(fun e -> (fun () ->
let addr', port = match addr with Lwt_unix.connect fd addr >|= fun () ->
| Lwt_unix.ADDR_INET (ip, port) -> Unix.string_of_inet_addr ip, port Logs.debug (fun m -> m "connected to TCP") ;
| Lwt_unix.ADDR_UNIX addr -> addr, 0 Some fd)
in (fun e ->
Logs.warn (fun m -> m "error %s connecting to influxd %s:%d, retrying in 5s" let addr', port = match addr with
(Printexc.to_string e) addr' port) ; | Lwt_unix.ADDR_INET (ip, port) -> Unix.string_of_inet_addr ip, port
safe_close fd >>= fun () -> | Lwt_unix.ADDR_UNIX addr -> addr, 0
Lwt_unix.sleep 5.0 >>= fun () -> in
read_sock_write_tcp db c addr addrtype) >>= fun () -> Logs.warn (fun m -> m "error %s connecting to influxd %s:%d, retrying in 5s"
read_sock_write_tcp db c ~fd addr addrtype (Printexc.to_string e) addr' port) ;
safe_close fd >>= fun () ->
Lwt_unix.sleep 5.0 >|= fun () ->
None) >>= fun fd ->
read_sock_write_tcp closing db c ?fd addr addrtype
end
| Some fd -> | Some fd ->
let open Vmm_wire in if !closing then
Logs.debug (fun m -> m "reading from unix socket") ;
Vmm_lwt.read_exactly c >>= function
| Error e ->
Logs.err (fun m -> m "error %s while reading vmm socket (return)"
(str_of_e e)) ;
safe_close fd safe_close fd
| Ok (hdr, data) -> else begin
if not (version_eq hdr.version my_version) then begin let open Vmm_wire in
Logs.err (fun m -> m "unknown wire protocol version") ; Logs.debug (fun m -> m "reading from unix socket") ;
safe_close fd Vmm_lwt.read_exactly c >>= function
end else
let name = IM.find hdr.id !req in
req := IM.remove hdr.id !req ;
match Stats.int_to_op hdr.tag with
| Some Stats.Stat_reply ->
begin match Vmm_wire.Stats.decode_stats (Cstruct.of_string data) with
| Error (`Msg msg) ->
Logs.warn (fun m -> m "error %s while decoding stats %s, ignoring"
msg name) ;
read_sock_write_tcp db c ~fd addr addrtype
| Ok (ru, vmm, ifs) ->
let ru = P.encode_ru name ru in
let vmm = P.encode_vmm name vmm in
let taps = List.map (P.encode_if name) ifs in
let out = (String.concat ~sep:"\n" (ru :: vmm :: taps)) ^ "\n" in
Logs.debug (fun m -> m "writing %d via tcp" (String.length out)) ;
Vmm_lwt.write_raw fd out >>= function
| Ok () ->
Logs.debug (fun m -> m "wrote successfully") ;
read_sock_write_tcp db c ~fd addr addrtype
| Error e ->
Logs.err (fun m -> m "error %s while writing to tcp (%s)"
(str_of_e e) name) ;
safe_close fd >>= fun () ->
read_sock_write_tcp db c addr addrtype
end
| _ when hdr.tag = fail_tag ->
Logs.err (fun m -> m "failed to retrieve statistics for %s" name) ;
read_sock_write_tcp db c ~fd addr addrtype
| _ ->
Logs.err (fun m -> m "unhandled tag %d for %s" hdr.tag name) ;
read_sock_write_tcp db c ~fd addr addrtype
let rec query_sock prefix db c interval =
(* query c for everyone in db *)
Lwt_list.iter_s (fun (id, name) ->
let id = identifier id in
let id = match prefix with None -> id | Some p -> p ^ "." ^ id in
let request = Vmm_wire.Stats.stat !command my_version id in
req := IM.add !command name !req ;
incr command ;
Logs.debug (fun m -> m "%d requesting %s via socket" !command id) ;
Vmm_lwt.write_raw c request >>= function
| Ok () ->
Logs.debug (fun m -> m "%d done" !command) ;
Lwt.return_unit
| Error e -> | Error e ->
Logs.err (fun m -> m "error while writing to vmm socket %s: %s" Logs.err (fun m -> m "error %s while reading vmm socket (return)"
id (str_of_e e)) ; (str_of_e e)) ;
Lwt.fail_with "exception while writing") closing := true ;
db >>= fun () -> safe_close fd
Lwt_unix.sleep (float_of_int interval) >>= fun () -> | Ok (hdr, data) ->
query_sock prefix db c interval if not (version_eq hdr.version my_version) then begin
Logs.err (fun m -> m "unknown wire protocol version") ;
closing := true ;
safe_close fd
end else
let name =
try IM.find hdr.id !req
with Not_found -> "not found"
in
req := IM.remove hdr.id !req ;
begin match Stats.int_to_op hdr.tag with
| Some Stats.Stat_reply ->
begin match Vmm_wire.Stats.decode_stats (Cstruct.of_string data) with
| Error (`Msg msg) ->
Logs.warn (fun m -> m "error %s while decoding stats %s, ignoring"
msg name) ;
Lwt.return (Some fd)
| Ok (ru, vmm, ifs) ->
let ru = P.encode_ru name ru in
let vmm = P.encode_vmm name vmm in
let taps = List.map (P.encode_if name) ifs in
let out = (String.concat ~sep:"\n" (ru :: vmm :: taps)) ^ "\n" in
Logs.debug (fun m -> m "writing %d via tcp" (String.length out)) ;
Vmm_lwt.write_raw fd out >>= function
| Ok () ->
Logs.debug (fun m -> m "wrote successfully") ;
Lwt.return (Some fd)
| Error e ->
Logs.err (fun m -> m "error %s while writing to tcp (%s)"
(str_of_e e) name) ;
safe_close fd >|= fun () ->
None
end
| _ when hdr.tag = fail_tag ->
Logs.err (fun m -> m "failed to retrieve statistics for %s" name) ;
Lwt.return (Some fd)
| _ ->
Logs.err (fun m -> m "unhandled tag %d for %s" hdr.tag name) ;
Lwt.return (Some fd)
end >>= fun fd ->
read_sock_write_tcp closing db c ?fd addr addrtype
end
let maybe_connect stat_socket = let rec query_sock closing prefix db c interval =
(* query c for everyone in db *)
if !closing then
Lwt.return_unit
else
Lwt_list.fold_left_s (fun r (id, name) ->
match r with
| Error e -> Lwt.return (Error e)
| Ok () ->
let id = identifier id in
let id = match prefix with None -> id | Some p -> p ^ "." ^ id in
let request = Vmm_wire.Stats.stat !command my_version id in
req := IM.add !command name !req ;
incr command ;
Logs.debug (fun m -> m "%d requesting %s via socket" !command id) ;
Vmm_lwt.write_raw c request)
(Ok ()) db >>= function
| Error e ->
Logs.err (fun m -> m "error %s while writing to vmm socket" (str_of_e e)) ;
closing := true ;
Lwt.return_unit
| Ok () ->
Lwt_unix.sleep (float_of_int interval) >>= fun () ->
query_sock closing prefix db c interval
let rec maybe_connect stat_socket =
let c = Lwt_unix.(socket PF_UNIX SOCK_STREAM 0) in let c = Lwt_unix.(socket PF_UNIX SOCK_STREAM 0) in
Lwt.catch Lwt.catch
(fun () -> (fun () ->
@ -258,7 +278,9 @@ let maybe_connect stat_socket =
(fun e -> (fun e ->
Logs.warn (fun m -> m "error %s connecting to socket %s" Logs.warn (fun m -> m "error %s connecting to socket %s"
(Printexc.to_string e) stat_socket) ; (Printexc.to_string e) stat_socket) ;
Lwt.fail_with "cannot connect to stat socket") safe_close c >>= fun () ->
Lwt_unix.sleep (float_of_int 5) >>= fun () ->
maybe_connect stat_socket)
let client stat_socket influxhost influxport db prefix interval = let client stat_socket influxhost influxport db prefix interval =
(* start a socket connection to vmm_stats *) (* start a socket connection to vmm_stats *)
@ -272,16 +294,27 @@ let client stat_socket influxhost influxport db prefix interval =
in in
(* loop *) (* loop *)
(* the query task queries the stat_socket at each interval
- if this fails, closing is set to true (and unit is returned)
the read_sock reads the stat_socket, and forwards to a TCP socket
- if closing is true, the TCP socket is closed and unit is returned
- if read on the unix domain socket fails, closing is set to true
(and unit is returned) *)
(* connection to the unix domain socket is managed in this loop only:
- maybe_connect attempts to establish a connection to it
- query_sock/read_sock_write_tcp write and read from it
- on failure in read or write, the TCP connection is closed, and loop
takes control: safe_close, maybe_connect, rinse, repeat *)
let rec loop c = let rec loop c =
Lwt.catch (fun () -> let closing = ref false in
Lwt.pick [ query_sock prefix db c interval ; read_sock_write_tcp db c addr addrtype ] >>= fun () -> Lwt.join [
safe_close c >>= fun () -> query_sock closing prefix db c interval ;
maybe_connect stat_socket >>= fun c -> read_sock_write_tcp closing db c addr addrtype
loop c) ] >>= fun () ->
(fun _ -> safe_close c >>= fun () ->
safe_close c >>= fun () -> maybe_connect stat_socket >>= fun c ->
maybe_connect stat_socket >>= fun c -> loop c
loop c)
in in
loop c loop c
@ -322,15 +355,16 @@ let host_port : (string * int) Arg.converter =
let socket = let socket =
let doc = "Stat socket to connect onto" in let doc = "Stat socket to connect onto" in
Arg.(required & pos 0 (some string) None & info [] ~doc) let sock = Fpath.(to_string (Vmm_core.tmpdir / "stat" + "sock")) in
Arg.(value & opt string sock & info [ "s" ; "socket" ] ~doc)
let influx = let influx =
Arg.(required & pos 1 (some host_port) None & info [] ~docv:"influx" Arg.(required & pos 0 (some host_port) None & info [] ~docv:"influx"
~doc:"the influx hostname:port to connect to") ~doc:"the influx hostname:port to connect to")
let db = let db =
let doc = "VMID database" in let doc = "VMID database" in
Arg.(required & pos 2 (some file) None & info [] ~doc) Arg.(required & pos 1 (some file) None & info [] ~doc)
let prefix = let prefix =
let doc = "prefix" in let doc = "prefix" in