Skip to content
This repository has been archived by the owner on Oct 28, 2021. It is now read-only.

Garbage collect incompatible peers in Host::run() #5624

Merged
merged 7 commits into from
Jun 18, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- Added: [#5591](https://github.com/ethereum/aleth/pull/5591) Network logging bugfixes and improvements and add p2pcap log channel.
- Added: [#5588](https://github.com/ethereum/aleth/pull/5588) Testeth prints similar test suite name suggestions, when the name passed in `-t` argument is not found.
- Added: [#5593](https://github.com/ethereum/aleth/pull/5593) Dynamically updating host ENR.
- Added: [#5624](https://github.com/ethereum/aleth/pull/5624) Remove useless peers from peer list.
- Changed: [#5532](https://github.com/ethereum/aleth/pull/5532) The leveldb is upgraded to 1.22. This is a breaking change on Windows and the old databases are not compatible.
- Changed: [#5559](https://github.com/ethereum/aleth/pull/5559) Update peer validation error messages.
- Changed: [#5568](https://github.com/ethereum/aleth/pull/5568) Improve rlpx handshake log messages and create new rlpx log channel.
Expand Down
11 changes: 11 additions & 0 deletions libp2p/Common.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,17 @@ enum DisconnectReason
/// @returns the string form of the given disconnection reason.
std::string reasonOf(DisconnectReason _r);

/// Reason why an RLPx handshake did not complete, as recorded by RLPXHandshake and stored on
/// the Peer as its most recent handshake failure.
enum HandshakeFailureReason
{
    /// No handshake failure recorded.
    NoFailure = 0,
    /// An exception was thrown while the handshake was being processed.
    UnknownFailure = 1,
    /// The handshake idle timer expired.
    Timeout = 2,
    /// An asynchronous socket read or write completed with an error code.
    TcpError = 3,
    /// Authenticating/decrypting the frame header or frame body failed.
    FrameDecryptionFailure = 4,
    /// The frame coder was unexpectedly missing during the handshake.
    InternalError = 5,
    /// The remote sent an oversized frame or a packet other than Hello.
    ProtocolError = 6
};

using CapDesc = std::pair<std::string, unsigned>;
using CapDescSet = std::set<CapDesc>;
using CapDescs = std::vector<CapDesc>;
Expand Down
42 changes: 34 additions & 8 deletions libp2p/Host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,25 @@ void Host::stopCapabilities()
}
}

/// Looks up an entry of m_peers by node ID while holding x_sessions.
/// @returns the stored peer, or nullptr (after logging a message) if the ID is not present.
std::shared_ptr<Peer> Host::peer(NodeID const& _n) const
{
    RecursiveGuard guard(x_sessions);
    auto const found = m_peers.find(_n);
    if (found != m_peers.end())
        return found->second;

    LOG(m_logger) << "Peer " << _n << " not found";
    return nullptr;
}

/// Records the reason for a failed handshake on the peer identified by @a _n.
/// If the peer is not in the peer list (peer() returns nullptr, e.g. because Host::run()
/// erased it in the meantime) there is nothing to update and the call is a no-op.
void Host::handshakeFailed(NodeID const& _n, HandshakeFailureReason _r)
{
    // peer() already logs when the node is unknown. An assert alone is compiled out under
    // NDEBUG, which would leave a null pointer dereference here.
    if (std::shared_ptr<Peer> p = peer(_n))
        p->m_lastHandshakeFailure = _r;
}

void Host::doneWorking()
{
// Return early if we have no capabilities since there's nothing to do. We've already stopped
Expand Down Expand Up @@ -284,6 +303,7 @@ void Host::startPeerSession(Public const& _id, RLP const& _hello,
m_peers[_id] = peer;
}
}
peer->m_lastHandshakeFailure = NoFailure;
halfalicious marked this conversation as resolved.
Show resolved Hide resolved
if (peer->isOffline())
peer->m_lastConnected = chrono::system_clock::now();
peer->endpoint.setAddress(_s->remoteEndpoint().address());
Expand Down Expand Up @@ -783,15 +803,21 @@ void Host::run(boost::system::error_code const& _ec)
unsigned reqConn = 0;
{
RecursiveGuard l(x_sessions);
for (auto const& p : m_peers)
{
halfalicious marked this conversation as resolved.
Show resolved Hide resolved
bool haveSession = havePeerSession(p.second->id);
bool required = p.second->peerType == PeerType::Required;
if (haveSession && required)
reqConn++;
else if (!haveSession && p.second->shouldReconnect() &&
(!m_netConfig.pin || required))
toConnect.push_back(p.second);
for (auto p = m_peers.cbegin(); p != m_peers.cend(); p++)
halfalicious marked this conversation as resolved.
Show resolved Hide resolved
{
bool haveSession = havePeerSession(p->second->id);
bool required = p->second->peerType == PeerType::Required;
if (haveSession && required)
reqConn++;
else if (!haveSession)
{
if (p->second->fallbackSeconds() == numeric_limits<unsigned>::max())
halfalicious marked this conversation as resolved.
Show resolved Hide resolved
p = m_peers.erase(p);
else if (p->second->shouldReconnect() && (!m_netConfig.pin || required))
toConnect.push_back(p->second);
}
}
}
}

Expand Down
7 changes: 6 additions & 1 deletion libp2p/Host.h
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,11 @@ class Host: public Worker
/// Stop registered capabilities, typically done when the network is being shut down.
void stopCapabilities();

std::shared_ptr<Peer> peer(NodeID const& _n) const;

/// Set a handshake failure reason for a peer
void handshakeFailed(NodeID const& _n, HandshakeFailureReason _r);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe better onHandshakeFailed

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also I would make it public and declare close to startPeerSession

Copy link
Contributor Author

@halfalicious halfalicious Jun 15, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gumb0 : Why make this public, does it make sense to expose the concept of a handshake to consumers of Host?

Copy link
Member

@gumb0 gumb0 Jun 17, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well it looks to me like a callback similar to startPeerSession, the callback called by RLPXHandshake when handshake is finished. One is for success another one is for failure.

It works as private, because RLPXHandshake is a friend of Host, but ideally we should get rid of these friend declarations at some point.

Exposing it to the clients of Host is of course not great, but the proper way to deal with it could be to create a separate interface with these callbacks only, don't expose it to Host clients, but pass it only to RLPXHandshake. That's a bit complicated change, at least it's not for this PR.

(In other words, we won't make it much worse, because startPeerSession is already public)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well it looks to me like a callback similar to startPeerSession, the callback called by RLPXHandshake when handshake is finished. One is for success another one is for failure.

It works as private, because RLPXHandshake is a friend of Host, but ideally we should get rid of these friend declarations at some point.

Exposing it to the clients of Host is of course not great, but the proper way to deal with it could be to create a separate interface with these callbacks only, don't expose it to Host clients, but pass it only to RLPXHandshake. That's a bit complicated change, at least it's not for this PR.

(In other words, we won't make it much worse, because startPeerSession is already public)

Ah that makes sense, thank you for clarifying! 😄 I'll make the change before merging.


bytes m_restoreNetwork; ///< Set by constructor and used to set Host key and restore network peers & nodes.

std::atomic<bool> m_run{false}; ///< Whether network is running.
Expand Down Expand Up @@ -408,7 +413,7 @@ class Host: public Worker
/// logging to once every c_logActivePeersInterval seconds
std::chrono::steady_clock::time_point m_lastPeerLogMessage;

Logger m_logger{createLogger(VerbosityDebug, "net")};
mutable Logger m_logger{createLogger(VerbosityDebug, "net")};
Logger m_detailsLogger{createLogger(VerbosityTrace, "net")};
Logger m_infoLogger{createLogger(VerbosityInfo, "net")};
};
Expand Down
25 changes: 20 additions & 5 deletions libp2p/Peer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ namespace dev

namespace p2p
{

Peer::Peer(Peer const& _original):
Node(_original),
Peer::Peer(Peer const& _original)
: Node(_original),
m_lastConnected(_original.m_lastConnected),
m_lastAttempted(_original.m_lastAttempted),
m_lastDisconnect(_original.m_lastDisconnect),
m_lastHandshakeFailure(_original.m_lastHandshakeFailure),
m_session(_original.m_session)
{
m_score = _original.m_score.load();
Expand All @@ -45,18 +45,33 @@ Peer::Peer(Peer const& _original):

bool Peer::shouldReconnect() const
{
return id && endpoint && chrono::system_clock::now() > m_lastAttempted + chrono::seconds(fallbackSeconds());
return id && endpoint &&
fallbackSeconds() != numeric_limits<unsigned>::max() &&
halfalicious marked this conversation as resolved.
Show resolved Hide resolved
chrono::system_clock::now() > m_lastAttempted + chrono::seconds(fallbackSeconds());
}

unsigned Peer::fallbackSeconds() const
{
if (peerType == PeerType::Required)
return 5;

switch (m_lastHandshakeFailure)
{
case FrameDecryptionFailure:
case ProtocolError:
return numeric_limits<unsigned>::max();
default:
break;
}

switch (m_lastDisconnect)
{
case BadProtocol:
return 30 * (m_failedAttempts + 1);
case UselessPeer:
case IncompatibleProtocol:
case UnexpectedIdentity:
case UserReason:
return numeric_limits<unsigned>::max();
case TooManyPeers:
return 25 * (m_failedAttempts + 1);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

m_failedAttempts seems to affect only the value of fallbackSeconds currently. Maybe we should use it now to retry several times and then go to "critical error, disconnect" state.
(at least for some cases of failures)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gumb0 Good idea - what do you think of this being taken care of in another PR? I'd like to limit the amount of changes I make to the peer gc logic in this PR so if something ends up breaking it will be easier to debug.

Copy link
Member

@gumb0 gumb0 Jun 14, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok for another PR, but it seems to be the matter of only adding condition like m_failedAttempts >= 20 to uselessPeer function.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Additional thought is that we could make all this change more conservative if we leave fallbackSeconds() as it was before (so that we do the same reconnects as before) and have just this check for failed attempts count in uselessPeer (plus the new check for handshake failures)
This way it would reconnect with the same intervals for each case as before, but stop after limited number of attempts.

But I'm fine with it if you think it's better to immediately stop in some cases.

case ClientQuit:
Expand Down
4 changes: 3 additions & 1 deletion libp2p/Peer.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ class Peer: public Node
void noteSessionGood() { m_failedAttempts = 0; }

private:
/// Returns number of seconds to wait until attempting connection, based on attempted connection history.
/// Returns number of seconds to wait until attempting connection, based on attempted connection history, or
/// numeric_limits<unsigned>::max() if a connection should never be attempted.
halfalicious marked this conversation as resolved.
Show resolved Hide resolved
unsigned fallbackSeconds() const;

std::atomic<int> m_score{0}; ///< All time cumulative.
Expand All @@ -92,6 +93,7 @@ class Peer: public Node
std::chrono::system_clock::time_point m_lastAttempted;
std::atomic<unsigned> m_failedAttempts{0};
DisconnectReason m_lastDisconnect = NoDisconnect; ///< Reason for disconnect that happened last.
HandshakeFailureReason m_lastHandshakeFailure = NoFailure; ///< Reason for most recent handshake failure

/// Used by isOffline() and (todo) for peer to emit session information.
std::weak_ptr<Session> m_session;
Expand Down
30 changes: 25 additions & 5 deletions libp2p/RLPxHandshake.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,9 @@ void RLPXHandshake::cancel()

void RLPXHandshake::error(boost::system::error_code _ech)
{
if (m_originated)
halfalicious marked this conversation as resolved.
Show resolved Hide resolved
m_host->handshakeFailed(m_remote, m_failureReason);

stringstream errorStream;
errorStream << "Handshake failed";
if (_ech)
Expand Down Expand Up @@ -278,7 +281,9 @@ void RLPXHandshake::transition(boost::system::error_code _ech)
if (!_ec)
{
LOG(m_logger) << "Disconnecting (Handshake Timeout) from";
cancel();
m_failureReason = Timeout;
m_nextState = Error;
transition();
}
});

Expand Down Expand Up @@ -328,10 +333,12 @@ void RLPXHandshake::transition(boost::system::error_code _ech)
bytes packet;
s.swapOut(packet);
m_io->writeSingleFramePacket(&packet, m_handshakeOutBuffer);
ba::async_write(m_socket->ref(), ba::buffer(m_handshakeOutBuffer), [this, self](boost::system::error_code ec, std::size_t)
{
transition(ec);
});
ba::async_write(m_socket->ref(), ba::buffer(m_handshakeOutBuffer),
[this, self](boost::system::error_code ec, std::size_t) {
if (ec)
m_failureReason = TcpError;
transition(ec);
});
}
else if (m_nextState == ReadHello)
{
Expand All @@ -346,14 +353,18 @@ void RLPXHandshake::transition(boost::system::error_code _ech)
boost::asio::buffer(m_handshakeInBuffer, handshakeSizeBytes),
[this, self](boost::system::error_code ec, std::size_t) {
if (ec)
{
m_failureReason = TcpError;
transition(ec);
}
else
{
if (!m_io)
{
LOG(m_errorLogger)
<< "Internal error in handshake: RLPXFrameCoder disappeared ("
<< m_remote << ")";
m_failureReason = InternalError;
m_nextState = Error;
transition();
return;
Expand All @@ -365,6 +376,7 @@ void RLPXHandshake::transition(boost::system::error_code _ech)
if (!m_io->authAndDecryptHeader(
bytesRef(m_handshakeInBuffer.data(), m_handshakeInBuffer.size())))
{
m_failureReason = FrameDecryptionFailure;
m_nextState = Error;
transition();
return;
Expand All @@ -383,6 +395,7 @@ void RLPXHandshake::transition(boost::system::error_code _ech)
LOG(m_logger)
<< "Frame is too large! Expected size: " << expectedFrameSizeBytes
<< " bytes, actual size: " << frameSize << " bytes";
m_failureReason = ProtocolError;
m_nextState = Error;
transition();
return;
Expand All @@ -407,13 +420,17 @@ void RLPXHandshake::transition(boost::system::error_code _ech)
m_idleTimer.cancel();

if (ec)
{
m_failureReason = TcpError;
transition(ec);
}
else
{
if (!m_io)
{
LOG(m_errorLogger) << "Internal error in handshake: "
"RLPXFrameCoder disappeared";
m_failureReason = InternalError;
m_nextState = Error;
transition();
return;
Expand All @@ -423,6 +440,7 @@ void RLPXHandshake::transition(boost::system::error_code _ech)
if (!m_io->authAndDecryptFrame(frame))
{
LOG(m_logger) << "Frame body decrypt failed";
m_failureReason = FrameDecryptionFailure;
m_nextState = Error;
transition();
return;
Expand All @@ -436,6 +454,7 @@ void RLPXHandshake::transition(boost::system::error_code _ech)
<< "Invalid packet type. Expected: "
<< p2pPacketTypeToString(HelloPacket)
<< ", received: " << p2pPacketTypeToString(packetType);
m_failureReason = ProtocolError;
m_nextState = Error;
transition();
return;
Expand All @@ -453,6 +472,7 @@ void RLPXHandshake::transition(boost::system::error_code _ech)
{
LOG(m_errorLogger)
<< "Handshake causing an exception: " << _e.what();
m_failureReason = UnknownFailure;
m_nextState = Error;
transition();
}
Expand Down
2 changes: 2 additions & 0 deletions libp2p/RLPxHandshake.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ class RLPXHandshake: public std::enable_shared_from_this<RLPXHandshake>
/// Timer which enforces c_timeout. Reset for each stage of the handshake.
ba::steady_timer m_idleTimer;

/// Reason for the handshake failure, reported to the Host from error(). Starts as NoFailure so
/// that an error path which does not set a reason never reads an indeterminate value.
HandshakeFailureReason m_failureReason = NoFailure;
halfalicious marked this conversation as resolved.
Show resolved Hide resolved

Logger m_logger{createLogger(VerbosityTrace, "rlpx")};
Logger m_errorLogger{createLogger(VerbosityError, "rlpx")};
};
Expand Down