Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lightningd: make shutdown safe again #5932

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions db/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ struct db {
/* List of statements that have been created but not executed yet. */
struct list_head pending_statements;
char *error;
bool *shutdown;

/* Were there any modifying statements in the current transaction?
* Used to bump the data_version in the DB.*/
Expand Down
6 changes: 4 additions & 2 deletions db/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -217,14 +217,16 @@ void db_report_changes(struct db *db, const char *final, size_t min)
{
assert(db->changes);
assert(tal_count(db->changes) >= min);

/* Having changes implies that we have a dirty TX. The opposite is
* currently not true, e.g., the postgres driver doesn't record
* changes yet. */
assert(!tal_count(db->changes) || db->dirty);

if (tal_count(db->changes) > min && db->report_changes_fn)
if (tal_count(db->changes) > min && db->report_changes_fn) {
if (*db->shutdown)
db_fatal("sivr db_write during shutdown");
db->report_changes_fn(db);
}
db->changes = tal_free(db->changes);
}

Expand Down
21 changes: 21 additions & 0 deletions lightningd/jsonrpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,15 @@ parse_request(struct json_connection *jcon, const jsmntok_t tok[])
json_tok_full(jcon->buffer, method));
}

if (c->ld->state == LD_STATE_SHUTDOWN) {
log_debug(jcon->ld->log, "sivr method=%s id=%s\n",
method ? json_strdup(tmpctx, jcon->buffer, method) : "",
id ? json_strdup(tmpctx, jcon->buffer, id) : "");
return command_fail(c, LIGHTNINGD_SHUTDOWN,
"lightningd is shutting down");
}


rpc_hook = tal(c, struct rpc_command_hook_payload);
rpc_hook->cmd = c;
/* Duplicate since we might outlive the connection */
Expand Down Expand Up @@ -1227,6 +1236,18 @@ bool jsonrpc_command_add(struct jsonrpc *rpc, struct json_command *command,
return true;
}

bool jsonrpc_command_del(struct jsonrpc *rpc, const char *name)
{
size_t count = tal_count(rpc->commands);
for (size_t j = 0; j < count; j++) {
if (streq(name, rpc->commands[j]->name)) {
tal_free(rpc->commands[j]);
return true;
}
}
return false;
}

static bool jsonrpc_command_add_perm(struct lightningd *ld,
struct jsonrpc *rpc,
struct json_command *command)
Expand Down
3 changes: 3 additions & 0 deletions lightningd/jsonrpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,9 @@ void jsonrpc_stop_all(struct lightningd *ld);
bool jsonrpc_command_add(struct jsonrpc *rpc, struct json_command *command,
const char *usage TAKES);

/* Remove a command from the JSON-RPC interface */
bool jsonrpc_command_del(struct jsonrpc *rpc, const char *name);

/**
* Begin a JSON-RPC notification with the specified topic.
*
Expand Down
24 changes: 17 additions & 7 deletions lightningd/lightningd.c
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,9 @@ int main(int argc, char *argv[])

/*~ Every daemon calls this in some form: the hooks are for dumping
* backtraces when we crash (if supported on this platform). */
#if DEVELOPER
daemon_maybe_debug(argv);
#endif
daemon_setup(argv[0], log_backtrace_print, log_backtrace_exit);

/*~ There's always a battle between what a constructor like this
Expand All @@ -974,6 +977,7 @@ int main(int argc, char *argv[])
* variables. */
ld = new_lightningd(NULL);
ld->state = LD_STATE_RUNNING;
ld->shutdown = false;

/*~ We store an copy of our arguments before parsing mangles them, so
* we can re-exec if versions of subdaemons change. Note the use of
Expand Down Expand Up @@ -1236,7 +1240,9 @@ int main(int argc, char *argv[])
/* Stop *new* JSON RPC requests. */
jsonrpc_stop_listening(ld->jsonrpc);

/* Give permission for things to get destroyed without getting upset. */
/* Give permission for things to get destroyed without getting upset.
* Also ignore responses to JSON-RPC requests in upcoming main io_loop.*/
/* Fail JSON RPC requests and ignore plugin's responses */
ld->state = LD_STATE_SHUTDOWN;

stop_fd = -1;
Expand All @@ -1261,21 +1267,25 @@ int main(int argc, char *argv[])
/* Get rid of per-channel subdaemons. */
subd_shutdown_nonglobals(ld);

/* Tell plugins we're shutting down, use force if necessary. */
shutdown_plugins(ld);

/* Now kill any remaining connections */
jsonrpc_stop_all(ld);

/* Get rid of major subdaemons. */
shutdown_global_subdaemons(ld);

/* Tell normal plugins we're shutting down, use force if necessary. */
ld->shutdown = true;
shutdown_plugins(ld, true);

/* Clean up internal peer/channel/htlc structures. */
free_all_channels(ld);

/* Now kill any remaining connections */
jsonrpc_stop_all(ld);

/* Now close database */
ld->wallet->db = tal_free(ld->wallet->db);

/* As database is closed we can safely shutdown db_write plugins */
shutdown_plugins(ld, false);

remove(ld->pidfile);

/* FIXME: pay can have children off tmpctx which unlink from
Expand Down
1 change: 1 addition & 0 deletions lightningd/lightningd.h
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ struct lightningd {
alt_subdaemon_map alt_subdaemons;

enum lightningd_state state;
bool shutdown;

/* Total number of coin moves we've seen, since
* coin move tracking was cool */
Expand Down
2 changes: 1 addition & 1 deletion lightningd/opening_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ static void destroy_uncommitted_channel(struct uncommitted_channel *uc)
subd_release_channel(open_daemon, uc);
}

/* This is how shutdown_subdaemons tells us not to delete from db! */
/* This is how free_all_channels tells us not to delete from db! */
if (!uc->peer->uncommitted_channel)
return;

Expand Down
12 changes: 12 additions & 0 deletions lightningd/options.c
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,11 @@ static char *opt_force_featureset(const char *optarg,
return NULL;
}

static char *opt_null(void *unused UNNEEDED)
{
return NULL;
}

static void dev_register_opts(struct lightningd *ld)
{
/* We might want to debug plugins, which are started before normal
Expand All @@ -689,6 +694,8 @@ static void dev_register_opts(struct lightningd *ld)
&ld->dev_no_plugin_checksum,
"Don't checksum plugins to detect changes");

opt_register_noarg("--debugger", opt_null, NULL,
"Invoke gdb to lightningd");
opt_register_noarg("--dev-no-reconnect", opt_set_invbool,
&ld->reconnect,
"Disable automatic reconnect-attempts by this node, but accept incoming");
Expand Down Expand Up @@ -1581,6 +1588,11 @@ static void add_config(struct lightningd *ld,
} else if (opt->cb == (void *)plugin_opt_flag_set) {
/* Noop, they will get added below along with the
* OPT_HASARG options. */
#if DEVELOPER
/* instead of this, we could maybe change -dev-debugger=lightningd? */
} else if (streq(name, "debugger")) {
/* Handled early in lightningd's main(), ignored here */
#endif
} else {
/* Insert more decodes here! */
assert(!"A noarg option was added but was not handled");
Expand Down
99 changes: 86 additions & 13 deletions lightningd/plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ static const char *state_desc(const struct plugin *plugin)
return "before replying to init";
case INIT_COMPLETE:
return "during normal operation";
case SHUTDOWN:
return "in state shutdown";
}
fatal("Invalid plugin state %i for %s",
plugin->plugin_state, plugin->cmd);
Expand Down Expand Up @@ -216,8 +218,10 @@ static void destroy_plugin(struct plugin *p)

/* Daemon shutdown overrules plugin's importance; aborts init checks */
if (p->plugins->ld->state == LD_STATE_SHUTDOWN) {
/* But return if this was the last plugin! */
if (list_empty(&p->plugins->plugins)) {

/* But break io_loop if this was the last plugin we waited for */
if (p->plugin_state == SHUTDOWN &&
!plugins_any_in_state(p->plugins, SHUTDOWN)) {
log_debug(p->plugins->ld->log, "io_break: %s", __func__);
io_break(destroy_plugin);
}
Expand Down Expand Up @@ -585,6 +589,21 @@ static const char *plugin_response_handle(struct plugin *plugin,
json_tok_full(plugin->buffer, idtok));
}

/* In this state we're shutting down and in the process of unwinding the
* custom behavior injected by plugins via hooks etc.. We want to stay dead
* now, because otherwise (other) plugins that are already destroyed would
* miss the action, which wouldn't be consistent.
* An exception is a db_write plugin. Ideally db_write's never happen in
* this state but that is hard to proof, so we keep db_write plugins alive
* longer and still allow db_write hook responses. */
if (plugin->plugins->ld->state == LD_STATE_SHUTDOWN &&
strcmp(request->method, "db_write") != 0) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have a tiny helper: streq

log_debug(plugin->plugins->ld->log,
"Ignoring response of method %s from plugin %s due to shutdown",
request->method, plugin->shortname);
return NULL;
}

/* We expect the request->cb to copy if needed */
pd = plugin_detect_destruction(plugin);
request->response_cb(plugin->buffer, toks, idtok, request->response_cb_arg);
Expand Down Expand Up @@ -2140,37 +2159,91 @@ void plugins_set_builtin_plugins_dir(struct plugins *plugins,
NULL, NULL);
}

void shutdown_plugins(struct lightningd *ld)
void shutdown_plugins(struct lightningd *ld, bool keep_db_write_plugins)
{
struct plugin *p, *next;
bool need_io = false;

/* Subdaemon should be dead to *stay* dead when restarting the io_loop */
assert(list_empty(&ld->subds));

/* Tell them all to shutdown; if they care. */
list_for_each_safe(&ld->plugins->plugins, p, next, list) {
/* Kill immediately, deletes self from list. */
if (p->plugin_state != INIT_COMPLETE || !notify_plugin_shutdown(ld, p))

/* Simply kill uninitialized plugin, freeing removes it from list */
if (p->plugin_state < INIT_COMPLETE) {
tal_free(p);
continue;
}

/* First call: */
if (keep_db_write_plugins) {

/* Send shutdown notifications while jsonrpc still exists */
assert(ld->jsonrpc);
bool notify = notify_plugin_shutdown(ld, p);
need_io = need_io || notify;

if (plugin_registered_db_write_hook(p))
continue;

/* Mark the plugins we shall wait for, others are killed */
if (notify)
p->plugin_state = SHUTDOWN;
else
tal_free(p);

/* Second call: */
} else {
assert(!ld->wallet->db);

/* io_close sends EOF to plugin's stdin, which eventually breaks its
* main io-loop (safely) after subroutines have returned. */
io_close(p->stdin_conn);
need_io = true;

/* But we still wait for them to self-terminate. */
p->plugin_state = SHUTDOWN;
}
}

/* If anyone was interested in shutdown, give them time. */
if (!list_empty(&ld->plugins->plugins)) {
/* io_loop sends the notifications, waits 30 seconds for marked plugins
* to self-terminate, see also destroy_plugin */
if (need_io) {
struct timers *timer;
struct timer *expired;

/* 30 seconds should do it, use a clean timers struct */
/* But don't wait full 30s to just notify, also use a clean timers
* struct to ignore any existing timers. */
int t = plugins_any_in_state(ld->plugins, SHUTDOWN) ? 30000 : 1;
timer = tal(NULL, struct timers);
timers_init(timer, time_mono());
new_reltimer(timer, timer, time_from_sec(30), NULL, NULL);
new_reltimer(timer, timer, time_from_msec(t), NULL, NULL);

void *ret = io_loop(timer, &expired);
assert(ret == NULL || ret == destroy_plugin);

/* Report and free remaining plugins. */
while (!list_empty(&ld->plugins->plugins)) {
p = list_pop(&ld->plugins->plugins, struct plugin, list);
/* Report and free plugins that didn't self-terminate in time */
list_for_each_safe(&ld->plugins->plugins, p, next, list) {
if (p->plugin_state != SHUTDOWN) {
continue;
}

list_del_from(&ld->plugins->plugins, &p->list);
log_debug(ld->log,
"%s: failed to self-terminate in time, killing.",
p->shortname);
tal_free(p);
}
}

if (keep_db_write_plugins)
/* Remove JSON-RPC methods of remaining db_write plugins, so we can
* free jsonrpc. */
list_for_each(&ld->plugins->plugins, p, list) {
for (size_t i=0; i<tal_count(p->methods); i++) {
jsonrpc_command_del(ld->jsonrpc, p->methods[i]);
}
}
else
assert(list_empty(&ld->plugins->plugins));
}
15 changes: 12 additions & 3 deletions lightningd/plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ enum plugin_state {
/* We have to get `init` response */
AWAITING_INIT_RESPONSE,
/* We have `init` response. */
INIT_COMPLETE
INIT_COMPLETE,
/* Wait for it to self-terminate */
SHUTDOWN,
};

/**
Expand Down Expand Up @@ -238,9 +240,16 @@ void plugin_kill(struct plugin *plugin, enum log_level loglevel,
const char *fmt, ...);

/**
* Tell all the plugins we're shutting down, and free them.
* Tell subscribed plugins we're shutting down, wait for them to self-terminate
* and free them.
*
* @keep_db_write_plugins=true: plugins that registered the db_write hook are
* kept alive, but still send them a "shutdown" notification when subscribed.
*
* @keep_db_write_plugins=false: shutdown remaining plugins: send EOF to their
* stdin, then wait for them to self-terminate.
*/
void shutdown_plugins(struct lightningd *ld);
void shutdown_plugins(struct lightningd *ld, bool keep_db_write_plugins);

/**
* Returns the plugin which registers the command with name {cmd_name}
Expand Down
15 changes: 14 additions & 1 deletion lightningd/plugin_hook.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,17 +162,20 @@ static void plugin_hook_callback(const char *buffer, const jsmntok_t *toks,
struct db *db = r->db;
struct plugin_hook_call_link *last, *it;
bool in_transaction = false;
struct plugin *from_plugin;

/* Pop the head off the call chain and continue with the next */
last = list_pop(&r->call_chain, struct plugin_hook_call_link, list);
from_plugin = last->plugin;
assert(last != NULL);
tal_del_destructor(last, plugin_hook_killed);
tal_free(last);

/* Actually, if it dies during shutdown, *don't* process result! */
if (!buffer && r->ld->state == LD_STATE_SHUTDOWN) {
log_debug(r->ld->log,
"Abandoning plugin hook call due to shutdown");
"Abandoning hook %s because its plugin %s was killed in shutdown",
r->hook->name, from_plugin->shortname);
return;
}

Expand Down Expand Up @@ -304,6 +307,16 @@ struct db_write_hook_req {
size_t *num_hooks;
};

bool plugin_registered_db_write_hook(struct plugin *plugin) {
const struct plugin_hook *hook = &db_write_hook;

for (size_t i = 0; i < tal_count(hook->hooks); i++)
if (hook->hooks[i]->plugin == plugin)
return true;

return false;
}

static void db_hook_response(const char *buffer, const jsmntok_t *toks,
const jsmntok_t *idtok,
struct db_write_hook_req *dwh_req)
Expand Down
Loading