connectd: fix accidental handling of old reconnections. #5256

Merged
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0)
 TODO: Insert version codename, and username of the contributor that named the release.
 -->
 
+## [0.11.1] - 2022-05-13: Simon's Carefully Chosen Release Name II
+
+Single change, which fixed a bug introduced in 0.11.0 that could cause
+unwanted unilateral closes (`bad reestablish revocation_number: 0 vs 3`).
+
+### Fixed
+
+- connectd: make sure we don't keep stale reconnections around. ([#5256])
+- connectd: fix assert which we could trigger. ([#5256])
Contributor
Suggested change
-- connectd: fix assert which we could trigger. ([#5256])
+- connectd: fix assert which we could trigger. ([#5254])

This should be #5254?

Contributor (author)

No, these refer to the PR, not the bug itself...

Contributor

Ah damn! Sorry, just assumed the wrong thing.


+[#5256]: https://github.com/ElementsProject/lightning/pull/5256
+
 ## [0.11.0.1] - 2022-04-04: Simon's Carefully Chosen Release Name
 
 This release would have been named by Simon Vrouwe, had he responded to my emails!
49 changes: 24 additions & 25 deletions connectd/connectd.c
@@ -211,36 +211,29 @@ static void peer_connected_in(struct daemon *daemon,
 	tal_free(connect);
 }
 
-/*~ This is an ad-hoc marshalling structure where we store arguments so we
- * can call peer_connected again. */
-struct peer_reconnected {
-	struct daemon *daemon;
-	struct node_id id;
-	struct wireaddr_internal addr;
-	const struct wireaddr *remote_addr;
-	struct crypto_state cs;
-	const u8 *their_features;
-	bool incoming;
-};
-
 /*~ For simplicity, lightningd only ever deals with a single connection per
  * peer. So if we already know about a peer, we tell lightningd to disconnect
  * the old one and retry once it does. */
 static struct io_plan *retry_peer_connected(struct io_conn *conn,
 					    struct peer_reconnected *pr)
 {
-	struct io_plan *plan;
-
 	/*~ As you can see, we've had issues with this code before :( */
 	status_peer_debug(&pr->id, "processing now old peer gone");
 
-	/*~ Usually the pattern is to return this directly, but we have to free
-	 * our temporary structure. */
-	plan = peer_connected(conn, pr->daemon, &pr->id, &pr->addr,
+	/* If this fails (still waiting), pr will be freed, so reparent onto
+	 * tmpctx so it gets freed either way. */
+	tal_steal(tmpctx, pr);
+
+	/*~ Usually the pattern is to return this directly. */
+	return peer_connected(conn, pr->daemon, &pr->id, &pr->addr,
 			      pr->remote_addr,
 			      &pr->cs, take(pr->their_features), pr->incoming);
-	tal_free(pr);
-	return plan;
 }

+/*~ A common use for destructors is to remove themselves from a data structure */
+static void destroy_peer_reconnected(struct peer_reconnected *pr)
+{
+	peer_reconnected_htable_del(&pr->daemon->reconnected, pr);
+}

 /*~ If we already know about this peer, we tell lightningd and it disconnects
@@ -259,6 +252,13 @@ static struct io_plan *peer_reconnected(struct io_conn *conn,

 	status_peer_debug(id, "reconnect");
 
+	/* If we have a previous reconnection, we replace it. */
+	pr = peer_reconnected_htable_get(&daemon->reconnected, id);
+	if (pr) {
+		peer_reconnected_htable_del(&daemon->reconnected, pr);
Contributor

If the previous reconnection already had the destructor installed, wouldn't this delete it twice? Or is deleting from the htable idempotent?
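For reference: in ccan/htable, deleting an element that is no longer in the table is a harmless no-op (htable_del() simply returns false), so the explicit del here followed by the destructor's del via tal_free() should be safe. A minimal sketch, assuming the standard HTABLE_DEFINE_TYPE wrappers:

```c
/* Sketch (not from this PR): the double-delete the reviewer asks about. */
static void replace_reconnection_sketch(struct peer_reconnected_htable *ht,
					struct peer_reconnected *old)
{
	/* First del finds the element and removes it. */
	peer_reconnected_htable_del(ht, old);

	/* tal_free() fires destroy_peer_reconnected(); its del no longer
	 * finds the element, returns false, and changes nothing. */
	tal_free(old);
}
```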

+		tal_free(pr);
+	}
+
 	/* Tell master to kill it: will send peer_disconnect */
 	msg = towire_connectd_reconnected(NULL, id);
 	daemon_conn_send(daemon->master, take(msg));
@@ -271,6 +271,8 @@ static struct io_plan *peer_reconnected(struct io_conn *conn,
 	pr->addr = *addr;
 	pr->remote_addr = tal_dup_or_null(pr, struct wireaddr, remote_addr);
 	pr->incoming = incoming;
+	peer_reconnected_htable_add(&daemon->reconnected, pr);
+	tal_add_destructor(pr, destroy_peer_reconnected);
 
 	/*~ Note that tal_dup_talarr() will handle the take() of features
 	 * (turning it into a simple tal_steal() in those cases). */
@@ -280,11 +282,7 @@ static struct io_plan *peer_reconnected(struct io_conn *conn,
 	 * the peer set. When someone calls `io_wake()` on that address, it
 	 * will call retry_peer_connected above. */
 	return io_wait(conn, peer_htable_get(&daemon->peers, id),
-		       /*~ The notleak() wrapper is a DEVELOPER-mode hack so
-			* that our memory leak detection doesn't consider 'pr'
-			* (which is not referenced from our code) to be a
-			* memory leak. */
-		       retry_peer_connected, notleak(pr));
+		       retry_peer_connected, pr);
 }
 
 /*~ When we free a peer, we remove it from the daemon's hashtable */
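For readers unfamiliar with ccan/io, a minimal sketch of the wait/wake rendezvous used above; io_wait(), io_wake() and io_close() are the real ccan/io primitives, while wait_on(), resume() and wait_obj are illustrative names:

```c
#include <ccan/io/io.h>

/* Runs only after someone calls io_wake() on the address we waited on. */
static struct io_plan *resume(struct io_conn *conn, void *wait_obj)
{
	return io_close(conn);
}

/* Suspend this connection until wait_obj is woken. */
static struct io_plan *wait_on(struct io_conn *conn, void *wait_obj)
{
	return io_wait(conn, wait_obj, resume, wait_obj);
}

/* Elsewhere, once the old peer is finally freed:
 *	io_wake(wait_obj);
 * wakes every connection waiting on that address, calling resume() here. */
```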
@@ -1902,7 +1900,6 @@ void peer_conn_closed(struct peer *peer)
 	struct connecting *connect = find_connecting(peer->daemon, &peer->id);
 
 	/* These should be closed already! */
-	assert(tal_count(peer->subds) == 0);
 	assert(!peer->to_peer);
 	assert(peer->ready_to_die || !peer->active);
 
@@ -1981,6 +1978,7 @@ static void dev_connect_memleak(struct daemon *daemon, const u8 *msg)
 	/* Now delete daemon and those which it has pointers to. */
 	memleak_remove_region(memtable, daemon, sizeof(daemon));
 	memleak_remove_htable(memtable, &daemon->peers.raw);
+	memleak_remove_htable(memtable, &daemon->reconnected.raw);
 
 	found_leak = dump_memleak(memtable, memleak_status_broken);
 	daemon_conn_send(daemon->master,
@@ -2127,6 +2125,7 @@ int main(int argc, char *argv[])
 	/* Allocate and set up our simple top-level structure. */
 	daemon = tal(NULL, struct daemon);
 	peer_htable_init(&daemon->peers);
+	peer_reconnected_htable_init(&daemon->reconnected);
 	memleak_add_helper(daemon, memleak_daemon_cb);
 	list_head_init(&daemon->connecting);
 	timers_init(&daemon->timers, time_mono());
34 changes: 34 additions & 0 deletions connectd/connectd.h
@@ -126,6 +126,37 @@ HTABLE_DEFINE_TYPE(struct peer,
 		   peer_eq_node_id,
 		   peer_htable);
 
+/*~ This is an ad-hoc marshalling structure where we store arguments so we
+ * can call peer_connected again. */
+struct peer_reconnected {
+	struct daemon *daemon;
+	struct node_id id;
+	struct wireaddr_internal addr;
+	const struct wireaddr *remote_addr;
+	struct crypto_state cs;
+	const u8 *their_features;
+	bool incoming;
+};
+
+static const struct node_id *
+peer_reconnected_keyof(const struct peer_reconnected *pr)
+{
+	return &pr->id;
+}
+
+static bool peer_reconnected_eq_node_id(const struct peer_reconnected *pr,
+					const struct node_id *id)
+{
+	return node_id_eq(&pr->id, id);
+}
+
+/*~ This defines 'struct peer_reconnected_htable'. */
+HTABLE_DEFINE_TYPE(struct peer_reconnected,
+		   peer_reconnected_keyof,
+		   node_id_hash,
+		   peer_reconnected_eq_node_id,
+		   peer_reconnected_htable);
+
 /*~ This is the global state, like `struct lightningd *ld` in lightningd. */
 struct daemon {
 	/* Who am I? */
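The HTABLE_DEFINE_TYPE line above generates the peer_reconnected_htable_* functions used in connectd.c; roughly the following signatures, assuming the usual ccan/htable_type.h expansion:

```c
/* Approximate prototypes of the generated wrappers (sketch only). */
void peer_reconnected_htable_init(struct peer_reconnected_htable *ht);
bool peer_reconnected_htable_add(struct peer_reconnected_htable *ht,
				 const struct peer_reconnected *pr);
bool peer_reconnected_htable_del(struct peer_reconnected_htable *ht,
				 const struct peer_reconnected *pr);
struct peer_reconnected *
peer_reconnected_htable_get(const struct peer_reconnected_htable *ht,
			    const struct node_id *id);
```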
@@ -142,6 +173,9 @@ struct daemon {
 	 * have disconnected. */
 	struct peer_htable peers;
 
+	/* Peers which have reconnected, waiting for us to kill existing conns */
+	struct peer_reconnected_htable reconnected;
+
 	/* Peers we are trying to reach */
 	struct list_head connecting;
 