Skip to content

Commit

Permalink
Fix participation lease removal for wrong network
Browse files Browse the repository at this point in the history
When manifest changes, depending on the timing it is possible for newly
generated valid leases to get removed if the sign message loop attempts
to sign messages that are as a result of progressing previous network.

Here is an example scenario in a specific order that was causing itests
to fail:
 * participants get a lease for network A up to instance 5
 * network A progresses to instance 6
 * manifest changes the network name to B
 * participants get a new lease for network B up to instance 5
 * sign loop receives a message from network A, instance 6
 * `getParticipantsByInstance` lazily removes leases since it only
   checks the instance.
 * the node ends up with no participants, and stuck.

To fix this:
 1) check if participants asked for are within the current network, and
    if not refuse to participate.
 2) check network name, as well as instance, to lazily remove expired
    leases.
  • Loading branch information
masih committed Oct 16, 2024
1 parent 1b15207 commit 49546f4
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 7 deletions.
2 changes: 1 addition & 1 deletion chain/lf3/f3.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ func (fff *F3) runSigningLoop(ctx context.Context) {
clear(alreadyParticipated)
}

participants := fff.leaser.getParticipantsByInstance(mb.Payload.Instance)
participants := fff.leaser.getParticipantsByInstance(mb.NetworkName, mb.Payload.Instance)
for _, id := range participants {
if _, ok := alreadyParticipated[id]; ok {
continue
Expand Down
17 changes: 14 additions & 3 deletions chain/lf3/participation_lease.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,15 +112,26 @@ func (l *leaser) participate(ticket api.F3ParticipationTicket) (api.F3Participat
return newLease, nil
}

func (l *leaser) getParticipantsByInstance(instance uint64) []uint64 {
func (l *leaser) getParticipantsByInstance(network gpbft.NetworkName, instance uint64) []uint64 {
l.mutex.Lock()
defer l.mutex.Unlock()
currentManifest, _ := l.status()
currentNetwork := currentManifest.NetworkName
if currentNetwork != network {
log.Warnf("no participants for network: current network (%s) does not match requested network (%s) at instance %d", currentNetwork, network, instance)
return nil
}
var participants []uint64
for id, lease := range l.leases {
if instance > lease.ToInstance() {
if currentNetwork != lease.Network {
// Lazily delete any lease that does not belong to network, likely acquired from
// prior manifests.
delete(l.leases, id)
log.Warnf("lost F3 participation lease for miner %d at instance %d due to network mismatch: %s != %s", id, instance, currentNetwork, lease.Network)
} else if instance > lease.ToInstance() {
// Lazily delete the expired leases.
delete(l.leases, id)
log.Warnf("lost F3 participation lease for miner %d", id)
log.Warnf("lost F3 participation lease for miner %d due to instance (%d) > lease to instance (%d)", id, instance, lease.ToInstance())
} else {
participants = append(participants, id)
}
Expand Down
6 changes: 3 additions & 3 deletions chain/lf3/participation_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,18 @@ func TestLeaser(t *testing.T) {
require.NoError(t, err)

// Both participants should still be valid.
participants := subject.getParticipantsByInstance(11)
participants := subject.getParticipantsByInstance(testManifest.NetworkName, 11)
require.Len(t, participants, 2)
require.Contains(t, participants, uint64(123))
require.Contains(t, participants, uint64(456))

// After instance 16, only participant 456 should be valid.
participants = subject.getParticipantsByInstance(16)
participants = subject.getParticipantsByInstance(testManifest.NetworkName, 16)
require.Len(t, participants, 1)
require.Contains(t, participants, uint64(456))

// After instance 17, no participant must have a lease.
participants = subject.getParticipantsByInstance(17)
participants = subject.getParticipantsByInstance(testManifest.NetworkName, 17)
require.Empty(t, participants)
})
t.Run("expired ticket", func(t *testing.T) {
Expand Down

0 comments on commit 49546f4

Please sign in to comment.