From 64a017dbaea124e3f3593ec0de975ff73903a327 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sun, 11 Sep 2022 13:28:45 +0930 Subject: [PATCH] lightningd: more graceful shutdown. Be more graceful in shutting down: this should fix the issue where bookkeeper gets upset that its commands are rejected during shutdown, and generally make things more graceful. 1. Stop any new RPC connections. 2. Stop any per-peer daemons (channeld, etc). 3. Shut down plugins. 4. Stop all existing RPC connections. 5. Stop global daemons. 6. Free up peer, chanen HTLC datastructures. 7. Close database. Signed-off-by: Rusty Russell Changelog-Changed: Plugins: RPC operations are now still available during shutdown. --- lightningd/gossip_control.c | 6 ++-- lightningd/jsonrpc.c | 22 ++++++++---- lightningd/jsonrpc.h | 14 +++++++- lightningd/lightningd.c | 51 +++++++++++++++++---------- lightningd/plugin.c | 8 ----- lightningd/plugin_hook.c | 3 +- lightningd/subd.c | 12 +++---- lightningd/subd.h | 9 ++--- lightningd/test/run-find_my_abspath.c | 6 ++++ tests/test_plugin.py | 2 +- 10 files changed, 80 insertions(+), 53 deletions(-) diff --git a/lightningd/gossip_control.c b/lightningd/gossip_control.c index a240e11accdc..5fcd4f798d79 100644 --- a/lightningd/gossip_control.c +++ b/lightningd/gossip_control.c @@ -125,9 +125,9 @@ static void handle_local_channel_update(struct lightningd *ld, const u8 *msg) * us. */ channel = any_channel_by_scid(ld, &scid, true); if (!channel) { - log_broken(ld->log, "Local update for bad scid %s", - type_to_string(tmpctx, struct short_channel_id, - &scid)); + log_unusual(ld->log, "Local update for bad scid %s", + type_to_string(tmpctx, struct short_channel_id, + &scid)); return; } diff --git a/lightningd/jsonrpc.c b/lightningd/jsonrpc.c index 357297f2f3a4..1904184a9693 100644 --- a/lightningd/jsonrpc.c +++ b/lightningd/jsonrpc.c @@ -933,11 +933,6 @@ parse_request(struct json_connection *jcon, const jsmntok_t tok[]) json_tok_full(jcon->buffer, method)); } - if (jcon->ld->state == LD_STATE_SHUTDOWN) { - return command_fail(c, LIGHTNINGD_SHUTDOWN, - "lightningd is shutting down"); - } - rpc_hook = tal(c, struct rpc_command_hook_payload); rpc_hook->cmd = c; /* Duplicate since we might outlive the connection */ @@ -1268,8 +1263,21 @@ void jsonrpc_listen(struct jsonrpc *jsonrpc, struct lightningd *ld) if (listen(fd, 128) != 0) err(1, "Listening on '%s'", rpc_filename); - jsonrpc->rpc_listener = io_new_listener( - ld->rpc_filename, fd, incoming_jcon_connected, ld); + + /* All conns will be tal children of jsonrpc: good for freeing later! */ + jsonrpc->rpc_listener + = io_new_listener(jsonrpc, fd, incoming_jcon_connected, ld); +} + +void jsonrpc_stop_listening(struct jsonrpc *jsonrpc) +{ + jsonrpc->rpc_listener = tal_free(jsonrpc->rpc_listener); +} + +void jsonrpc_stop_all(struct lightningd *ld) +{ + /* Closes all conns. */ + ld->jsonrpc = tal_free(ld->jsonrpc); } static struct command_result *param_command(struct command *cmd, diff --git a/lightningd/jsonrpc.h b/lightningd/jsonrpc.h index 8f090801e464..43ed29b30f9c 100644 --- a/lightningd/jsonrpc.h +++ b/lightningd/jsonrpc.h @@ -173,12 +173,24 @@ void jsonrpc_setup(struct lightningd *ld); /** - * Start listeing on ld->rpc_filename. + * Start listening on ld->rpc_filename. * * Sets up the listener effectively starting the RPC interface. */ void jsonrpc_listen(struct jsonrpc *rpc, struct lightningd *ld); +/** + * Stop listening on ld->rpc_filename. + * + * No new connections from here in. + */ +void jsonrpc_stop_listening(struct jsonrpc *jsonrpc); + +/** + * Kill any remaining JSON-RPC connections. + */ +void jsonrpc_stop_all(struct lightningd *ld); + /** * Add a new command/method to the JSON-RPC interface. * diff --git a/lightningd/lightningd.c b/lightningd/lightningd.c index 39e561e02ddf..288b589e2bcb 100644 --- a/lightningd/lightningd.c +++ b/lightningd/lightningd.c @@ -515,7 +515,7 @@ static const char *find_daemon_dir(struct lightningd *ld, const char *argv0) * is an awesome runtime memory usage detector for C and C++ programs). In * some ways it would be neater not to do this, but it turns out some * transient objects still need cleaning. */ -static void shutdown_subdaemons(struct lightningd *ld) +static void free_all_channels(struct lightningd *ld) { struct peer *p; @@ -529,19 +529,6 @@ static void shutdown_subdaemons(struct lightningd *ld) * writes, which must be inside a transaction. */ db_begin_transaction(ld->wallet->db); - /* Let everyone shutdown cleanly. */ - close(ld->hsm_fd); - /*~ The three "global" daemons, which we shutdown explicitly: we - * give them 10 seconds to exit gracefully before killing them. */ - ld->connectd = subd_shutdown(ld->connectd, 10); - ld->gossip = subd_shutdown(ld->gossip, 10); - ld->hsm = subd_shutdown(ld->hsm, 10); - - /*~ Closing the hsmd means all other subdaemons should be exiting; - * deal with that cleanly before we start freeing internal - * structures. */ - subd_shutdown_remaining(ld); - /* Now we free all the HTLCs */ free_htlcs(ld, NULL); @@ -579,6 +566,18 @@ static void shutdown_subdaemons(struct lightningd *ld) db_commit_transaction(ld->wallet->db); } +static void shutdown_global_subdaemons(struct lightningd *ld) +{ + /* Let everyone shutdown cleanly. */ + close(ld->hsm_fd); + + /*~ The three "global" daemons, which we shutdown explicitly: we + * give them 10 seconds to exit gracefully before killing them. */ + ld->connectd = subd_shutdown(ld->connectd, 10); + ld->gossip = subd_shutdown(ld->gossip, 10); + ld->hsm = subd_shutdown(ld->hsm, 10); +} + /*~ Our wallet logic needs to know what outputs we might be interested in. We * use BIP32 (a.k.a. "HD wallet") to generate keys from a single seed, so we * keep the maximum-ever-used key index in the db, and add them all to the @@ -1200,7 +1199,10 @@ int main(int argc, char *argv[]) assert(io_loop_ret == ld); log_debug(ld->log, "io_loop_with_timers: %s", __func__); - /* Fail JSON RPC requests and ignore plugin's responses */ + /* Stop *new* JSON RPC requests. */ + jsonrpc_stop_listening(ld->jsonrpc); + + /* Give permission for things to get destroyed without getting upset. */ ld->state = LD_STATE_SHUTDOWN; stop_fd = -1; @@ -1221,13 +1223,24 @@ int main(int argc, char *argv[]) /* We're not going to collect our children. */ remove_sigchild_handler(sigchld_conn); - shutdown_subdaemons(ld); - /* Tell plugins we're shutting down, closes the db. */ + /* Get rid of per-channel subdaemons. */ + subd_shutdown_nonglobals(ld); + + /* Tell plugins we're shutting down, use force if necessary. */ shutdown_plugins(ld); - /* Cleanup JSON RPC separately: destructors assume some list_head * in ld */ - tal_free(ld->jsonrpc); + /* Now kill any remaining connections */ + jsonrpc_stop_all(ld); + + /* Get rid of major subdaemons. */ + shutdown_global_subdaemons(ld); + + /* Clean up internal peer/channel/htlc structures. */ + free_all_channels(ld); + + /* Now close database */ + ld->wallet->db = tal_free(ld->wallet->db); /* Clean our our HTLC maps, since they use malloc. */ htlc_in_map_clear(&ld->htlcs_in); diff --git a/lightningd/plugin.c b/lightningd/plugin.c index ea9fb5c75872..df7aa545a0ec 100644 --- a/lightningd/plugin.c +++ b/lightningd/plugin.c @@ -581,11 +581,6 @@ static const char *plugin_response_handle(struct plugin *plugin, "Received a JSON-RPC response for non-existent request"); } - /* Ignore responses when shutting down */ - if (plugin->plugins->ld->state == LD_STATE_SHUTDOWN) { - return NULL; - } - /* We expect the request->cb to copy if needed */ pd = plugin_detect_destruction(plugin); request->response_cb(plugin->buffer, toks, idtok, request->response_cb_arg); @@ -2124,9 +2119,6 @@ void shutdown_plugins(struct lightningd *ld) { struct plugin *p, *next; - /* The next io_loop does not need db access, close it. */ - ld->wallet->db = tal_free(ld->wallet->db); - /* Tell them all to shutdown; if they care. */ list_for_each_safe(&ld->plugins->plugins, p, next, list) { /* Kill immediately, deletes self from list. */ diff --git a/lightningd/plugin_hook.c b/lightningd/plugin_hook.c index afab5785f59b..b8d7322f4094 100644 --- a/lightningd/plugin_hook.c +++ b/lightningd/plugin_hook.c @@ -167,7 +167,8 @@ static void plugin_hook_callback(const char *buffer, const jsmntok_t *toks, tal_del_destructor(last, plugin_hook_killed); tal_free(last); - if (r->ld->state == LD_STATE_SHUTDOWN) { + /* Actually, if it dies during shutdown, *don't* process result! */ + if (!buffer && r->ld->state == LD_STATE_SHUTDOWN) { log_debug(r->ld->log, "Abandoning plugin hook call due to shutdown"); return; diff --git a/lightningd/subd.c b/lightningd/subd.c index 4ee11c1c257e..ac151d184737 100644 --- a/lightningd/subd.c +++ b/lightningd/subd.c @@ -902,15 +902,13 @@ struct subd *subd_shutdown(struct subd *sd, unsigned int seconds) return tal_free(sd); } -void subd_shutdown_remaining(struct lightningd *ld) +void subd_shutdown_nonglobals(struct lightningd *ld) { - struct subd *subd; + struct subd *subd, *next; - /* We give them a second to finish exiting, before we kill - * them in destroy_subd() */ - sleep(1); - - while ((subd = list_top(&ld->subds, struct subd, list)) != NULL) { + list_for_each_safe(&ld->subds, subd, next, list) { + if (!subd->channel) + continue; /* Destructor removes from list */ io_close(subd->conn); } diff --git a/lightningd/subd.h b/lightningd/subd.h index db0fd6afa219..f43436b2b5d6 100644 --- a/lightningd/subd.h +++ b/lightningd/subd.h @@ -211,7 +211,7 @@ struct subd_req *subd_req_(const tal_t *ctx, void subd_release_channel(struct subd *owner, const void *channel); /** - * subd_shutdown - try to politely shut down a subdaemon. + * subd_shutdown - try to politely shut down a (global) subdaemon. * @subd: subd to shutdown. * @seconds: maximum seconds to wait for it to exit. * @@ -225,13 +225,10 @@ void subd_release_channel(struct subd *owner, const void *channel); struct subd *subd_shutdown(struct subd *subd, unsigned int seconds); /** - * subd_shutdown_remaining - kill all remaining (per-peer) subds + * subd_shutdown_nonglobals - kill all per-peer subds * @ld: lightningd - * - * They should already be exiting (since we shutdown hsmd), but - * make sure they have. */ -void subd_shutdown_remaining(struct lightningd *ld); +void subd_shutdown_nonglobals(struct lightningd *ld); /* Ugly helper to get full pathname of the current binary. */ const char *find_my_abspath(const tal_t *ctx, const char *argv0); diff --git a/lightningd/test/run-find_my_abspath.c b/lightningd/test/run-find_my_abspath.c index 585eb5328dc4..85fd5fcf9029 100644 --- a/lightningd/test/run-find_my_abspath.c +++ b/lightningd/test/run-find_my_abspath.c @@ -117,6 +117,12 @@ void jsonrpc_listen(struct jsonrpc *rpc UNNEEDED, struct lightningd *ld UNNEEDED /* Generated stub for jsonrpc_setup */ void jsonrpc_setup(struct lightningd *ld UNNEEDED) { fprintf(stderr, "jsonrpc_setup called!\n"); abort(); } +/* Generated stub for jsonrpc_stop_all */ +void jsonrpc_stop_all(struct lightningd *ld UNNEEDED) +{ fprintf(stderr, "jsonrpc_stop_all called!\n"); abort(); } +/* Generated stub for jsonrpc_stop_listening */ +void jsonrpc_stop_listening(struct jsonrpc *jsonrpc UNNEEDED) +{ fprintf(stderr, "jsonrpc_stop_listening called!\n"); abort(); } /* Generated stub for load_channels_from_wallet */ struct htlc_in_map *load_channels_from_wallet(struct lightningd *ld UNNEEDED) { fprintf(stderr, "load_channels_from_wallet called!\n"); abort(); } diff --git a/tests/test_plugin.py b/tests/test_plugin.py index 55f39f9c6312..d9f7ad9ac3dc 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -2542,7 +2542,7 @@ def test_plugin_shutdown(node_factory): l1.rpc.plugin_start(p, dont_shutdown=True) l1.rpc.stop() l1.daemon.wait_for_logs(['test_libplugin: shutdown called', - 'misc_notifications.py: via lightningd shutdown, datastore failed', + 'misc_notifications.py: .* Connection refused', 'test_libplugin: failed to self-terminate in time, killing.'])