feat(IC-1579): TLA instrumentation for merge_neuron (#2341)
Adds a TLA model of `merge_neurons` and links it to the code.

Since the only Rust integration tests that exercise `merge_neurons` are
proptests, which generate a large number of traces, this introduces a limit
so that only a prefix (currently the first 30) of the state pairs from the
collected traces is checked. It also makes the Apalache checks more parallel
by using channels to signal completion as soon as a worker thread becomes
available, instead of waiting for a whole batch to finish.
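
For illustration, here is a minimal, self-contained sketch of the channel-based
throttling pattern described above (the names `work_items`, `MAX_THREADS`, and the
dummy workload are illustrative placeholders, not code from this commit): each
worker signals on an mpsc channel when it finishes, so the main loop can start the
next check as soon as a slot frees up rather than joining a whole batch.

```rust
use std::sync::mpsc;
use std::thread;

fn main() {
    // Hypothetical stand-ins for the (update, constants, state-pair) triples to check.
    let work_items: Vec<u64> = (0..100).collect();

    const MAX_THREADS: usize = 20;
    let (done_tx, done_rx) = mpsc::channel::<()>();
    let mut running = 0;

    for item in work_items {
        // Block until one worker finishes instead of joining a whole batch.
        if running >= MAX_THREADS {
            done_rx.recv().expect("worker disconnected");
            running -= 1;
        }

        let done_tx = done_tx.clone();
        running += 1;
        thread::spawn(move || {
            // Stand-in for checking a single state pair.
            let _result = item * 2;
            // Signal that a slot is free again.
            let _ = done_tx.send(());
        });
    }

    // Drain completion signals for the workers still in flight.
    while running > 0 {
        done_rx.recv().expect("worker disconnected");
        running -= 1;
    }
}
```

Compared to chunked joins, the slowest check in a batch no longer holds back the
remaining slots.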

---------

Co-authored-by: IDX GitHub Automation <infra+github-automation@dfinity.org>
oggy-dfin and IDX GitHub Automation authored Nov 29, 2024
1 parent b7bb757 commit 7a20299
Showing 12 changed files with 659 additions and 96 deletions.
25 changes: 21 additions & 4 deletions rs/nns/governance/src/governance.rs
@@ -138,8 +138,8 @@ pub mod tla;

#[cfg(feature = "tla")]
pub use tla::{
claim_neuron_desc, split_neuron_desc, tla_update_method, InstrumentationState, ToTla,
TLA_INSTRUMENTATION_STATE, TLA_TRACES_LKEY, TLA_TRACES_MUTEX,
tla_update_method, InstrumentationState, ToTla, CLAIM_NEURON_DESC, MERGE_NEURONS_DESC,
SPLIT_NEURON_DESC, TLA_INSTRUMENTATION_STATE, TLA_TRACES_LKEY, TLA_TRACES_MUTEX,
};

// 70 KB (for executing NNS functions that are not canister upgrades)
@@ -2670,7 +2670,7 @@ impl Governance {
/// stake.
/// - The amount to split minus the transfer fee is more than the minimum
/// stake.
#[cfg_attr(feature = "tla", tla_update_method(split_neuron_desc()))]
#[cfg_attr(feature = "tla", tla_update_method(SPLIT_NEURON_DESC.clone()))]
pub async fn split_neuron(
&mut self,
id: &NeuronId,
@@ -2926,6 +2926,7 @@ impl Governance {
/// it will be merged into the stake of the target neuron; if it is less
/// than the transaction fee, the maturity of the source neuron will
/// still be merged into the maturity of the target neuron.
#[cfg_attr(feature = "tla", tla_update_method(MERGE_NEURONS_DESC.clone()))]
pub async fn merge_neurons(
&mut self,
id: &NeuronId,
@@ -2965,13 +2966,29 @@ impl Governance {

// Step 4: burn neuron fees if needed.
if let Some(source_burn_fees) = effect.source_burn_fees() {
tla_log_locals! {
source_neuron_id: effect.source_neuron_id().id,
target_neuron_id: effect.target_neuron_id().id,
fees_amount: effect.source_burn_fees().map_or(0, |f| f.amount_e8s),
amount_to_target: effect.stake_transfer().map_or(0, |t| t.amount_to_target_e8s)
}
tla_log_label!("MergeNeurons_Burn");

source_burn_fees
.burn_neuron_fees_with_ledger(&*self.ledger, &mut self.neuron_store, now)
.await?;
}

// Step 5: transfer the stake if needed.
if let Some(stake_transfer) = effect.stake_transfer() {
tla_log_locals! {
source_neuron_id: effect.source_neuron_id().id,
target_neuron_id: effect.target_neuron_id().id,
fees_amount: effect.source_burn_fees().map_or(0, |f| f.amount_e8s),
amount_to_target: effect.stake_transfer().map_or(0, |t| t.amount_to_target_e8s)
}
tla_log_label!("MergeNeurons_Stake");

stake_transfer
.transfer_neuron_stake_with_ledger(&*self.ledger, &mut self.neuron_store, now)
.await?;
@@ -6138,7 +6155,7 @@ impl Governance {
/// the neuron and lock it before we make the call, we know that any
/// concurrent call to mutate the same neuron will need to wait for this
/// one to finish before proceeding.
#[cfg_attr(feature = "tla", tla_update_method(claim_neuron_desc()))]
#[cfg_attr(feature = "tla", tla_update_method(CLAIM_NEURON_DESC.clone()))]
async fn claim_neuron(
&mut self,
subaccount: Subaccount,
7 changes: 7 additions & 0 deletions rs/nns/governance/src/governance/ledger_helper.rs
@@ -8,6 +8,11 @@ use crate::{
use ic_nervous_system_common::ledger::IcpLedger;
use ic_nns_common::pb::v1::NeuronId;

#[cfg(feature = "tla")]
use super::tla::TLA_INSTRUMENTATION_STATE;
#[cfg(feature = "tla")]
use tla_instrumentation_proc_macros::tla_function;

/// An object that represents the burning of neuron fees.
#[derive(Clone, PartialEq, Debug)]
pub struct BurnNeuronFeesOperation {
@@ -19,6 +24,7 @@ impl BurnNeuronFeesOperation {
/// Burns the neuron fees by calling ledger and changing the neuron. Recoverable errors are
/// returned while unrecoverable errors cause a panic. A neuron lock should be held before
/// calling this.
#[cfg_attr(feature = "tla", tla_function)]
pub async fn burn_neuron_fees_with_ledger(
self,
ledger: &dyn IcpLedger,
@@ -80,6 +86,7 @@ pub struct NeuronStakeTransferOperation {
impl NeuronStakeTransferOperation {
/// Transfers the stake from one neuron to another by calling ledger and changing the neurons.
/// Recoverable errors are returned while unrecoverable errors cause a panic.
#[cfg_attr(feature = "tla", tla_function)]
pub async fn transfer_neuron_stake_with_ledger(
self,
ledger: &dyn IcpLedger,
43 changes: 23 additions & 20 deletions rs/nns/governance/src/governance/tla/claim_neuron.rs
@@ -1,27 +1,30 @@
use lazy_static::lazy_static;
use tla_instrumentation::{Label, TlaConstantAssignment, ToTla, Update, VarAssignment};

use super::common::default_account;
use super::{extract_common_constants, post_process_trace};

pub fn claim_neuron_desc() -> Update {
const PID: &str = "Claim_Neuron";
let default_locals = VarAssignment::new()
.add("account", default_account())
.add("neuron_id", 0_u64.to_tla_value());
lazy_static! {
pub static ref CLAIM_NEURON_DESC: Update = {
const PID: &str = "Claim_Neuron";
let default_locals = VarAssignment::new()
.add("account", default_account())
.add("neuron_id", 0_u64.to_tla_value());

Update {
default_start_locals: default_locals.clone(),
default_end_locals: default_locals,
start_label: Label::new("ClaimNeuron1"),
end_label: Label::new("Done"),
process_id: PID.to_string(),
canister_name: "governance".to_string(),
post_process: |trace| {
let constants = TlaConstantAssignment {
constants: extract_common_constants(PID, trace).into_iter().collect(),
};
post_process_trace(trace);
constants
},
}
Update {
default_start_locals: default_locals.clone(),
default_end_locals: default_locals,
start_label: Label::new("ClaimNeuron1"),
end_label: Label::new("Done"),
process_id: PID.to_string(),
canister_name: "governance".to_string(),
post_process: |trace| {
let constants = TlaConstantAssignment {
constants: extract_common_constants(PID, trace).into_iter().collect(),
};
post_process_trace(trace);
constants
},
}
};
}
34 changes: 34 additions & 0 deletions rs/nns/governance/src/governance/tla/merge_neurons.rs
@@ -0,0 +1,34 @@
use super::{account_to_tla, extract_common_constants, post_process_trace};
use crate::governance::governance_minting_account;
use lazy_static::lazy_static;
use tla_instrumentation::{Label, TlaConstantAssignment, ToTla, Update, VarAssignment};

const PID: &str = "Merge_Neurons";
lazy_static! {
pub static ref MERGE_NEURONS_DESC: Update = {
let default_locals = VarAssignment::new()
.add("source_neuron_id", 0_u64.to_tla_value())
.add("target_neuron_id", 0_u64.to_tla_value())
.add("fees_amount", 0_u64.to_tla_value())
.add("amount_to_target", 0_u64.to_tla_value());
Update {
default_start_locals: default_locals.clone(),
default_end_locals: default_locals,
start_label: Label::new("MergeNeurons_Start"),
end_label: Label::new("Done"),
process_id: PID.to_string(),
canister_name: "governance".to_string(),
post_process: |trace| {
let mut constants = TlaConstantAssignment {
constants: extract_common_constants(PID, trace).into_iter().collect(),
};
post_process_trace(trace);
constants.constants.insert(
"Minting_Account_Id".to_string(),
account_to_tla(governance_minting_account()),
);
constants
},
}
};
}
146 changes: 98 additions & 48 deletions rs/nns/governance/src/governance/tla/mod.rs
@@ -1,9 +1,7 @@
use itertools::Itertools;
use std::{
collections::{BTreeMap, BTreeSet},
ops::Deref,
thread,
};
use std::collections::{BTreeMap, BTreeSet, HashSet};
use std::hash::Hash;
use std::sync::mpsc;
use std::thread;

use super::Governance;
use crate::storage::{with_stable_neuron_indexes, with_stable_neuron_store};
@@ -26,15 +24,18 @@ pub use common::{account_to_tla, opt_subaccount_to_tla, subaccount_to_tla};
use common::{function_domain_union, governance_account_id};
pub use store::{TLA_INSTRUMENTATION_STATE, TLA_TRACES_LKEY, TLA_TRACES_MUTEX};

mod split_neuron;
pub use split_neuron::split_neuron_desc;
mod claim_neuron;
pub use claim_neuron::claim_neuron_desc;
mod merge_neurons;
mod split_neuron;

pub use claim_neuron::CLAIM_NEURON_DESC;
pub use merge_neurons::MERGE_NEURONS_DESC;
pub use split_neuron::SPLIT_NEURON_DESC;

fn neuron_global(gov: &Governance) -> TlaValue {
let neuron_map: BTreeMap<u64, TlaValue> = with_stable_neuron_store(|store| {
gov.neuron_store.with_active_neurons_iter(|iter| {
iter.map(|n| n.deref().clone())
iter.map(|n| (*n).clone())
.chain(store.range_neurons(std::ops::RangeFull))
.map(|neuron| {
(
@@ -207,29 +208,50 @@ fn get_tla_module_path(module: &str) -> PathBuf {
})
}

fn dedup_by_key<E, K, F>(vec: &mut Vec<E>, mut key_selector: F)
where
F: FnMut(&E) -> K,
K: Eq + Hash,
{
let mut seen_keys = HashSet::new();
vec.retain(|element| seen_keys.insert(key_selector(element)));
}

/// Checks a trace against the model.
///
/// It's assumed that the corresponding model is called `<PID>_Apalache.tla`, where PID is the
/// `process_id` field used in the `Update` value for the corresponding method.
pub fn check_traces() {
// Large states make Apalache time and memory consumption explode. We'll look at
// improving that later, for now we introduce a hard limit on the state size, and
// skip checking states larger than the limit
// skip checking states larger than the limit. The limit is a somewhat arbitrary
// number based on what we observed in the tests. We saw that states with 1000+ atoms take
// a long time to process, whereas most manual tests yield states of size 100 or so.
const STATE_SIZE_LIMIT: u64 = 500;
// Proptests generate lots of traces (and thus state pairs) that make the Apalache testing very long.
// Again, limit this to some arbitrary number where checking is still reasonably fast.
// Note that this is effectively a per-test limit due to how `check_traces` is normally used.
const STATE_PAIR_COUNT_LIMIT: usize = 30;
fn is_under_limit(p: &ResolvedStatePair) -> bool {
p.start.size() < STATE_SIZE_LIMIT && p.end.size() < STATE_SIZE_LIMIT
}

fn print_stats(traces: &Vec<UpdateTrace>) {
let mut total_pairs = 0;
println!("Checking {} traces with TLA/Apalache", traces.len());
for t in traces {
let total_len = t.state_pairs.len();
total_pairs += total_len;
let under_limit_len = t.state_pairs.iter().filter(|p| is_under_limit(p)).count();
println!(
"TLA/Apalache checks: keeping {}/{} states for update {}",
"TLA/Apalache checks: keeping {}/{} state pairs for update {}",
under_limit_len, total_len, t.update.process_id
);
}
println!(
"Total of {} state pairs to be checked with Apalache; will retain {}",
total_pairs, STATE_PAIR_COUNT_LIMIT
)
}

let traces = {
@@ -238,6 +260,26 @@ pub fn check_traces() {
std::mem::take(&mut (*t))
};

print_stats(&traces);

let mut all_pairs = traces
.into_iter()
.flat_map(|t| {
t.state_pairs
.into_iter()
.filter(is_under_limit)
.map(move |p| (t.update.clone(), t.constants.clone(), p))
})
.collect();

// A quick check that we don't have any duplicate state pairs. We assume the constants should
// be the same anyway and look at just the process ID and the states themselves.
dedup_by_key(&mut all_pairs, |(u, _c, p)| {
(u.process_id.clone(), p.start.clone(), p.end.clone())
});

all_pairs.truncate(STATE_PAIR_COUNT_LIMIT);

set_java_path();

let apalache = std::env::var("TLA_APALACHE_BIN")
@@ -248,42 +290,40 @@ pub fn check_traces() {
panic!("bad apalache bin from 'TLA_APALACHE_BIN': '{:?}'", apalache);
}

print_stats(&traces);

let chunk_size = 20;
let all_pairs = traces.into_iter().flat_map(|t| {
t.state_pairs
.into_iter()
.filter(is_under_limit)
.map(move |p| (t.update.clone(), t.constants.clone(), p))
});
let chunks = all_pairs.chunks(chunk_size);
for chunk in &chunks {
let mut handles = vec![];
for (update, constants, pair) in chunk {
let apalache = apalache.clone();
let constants = constants.clone();
let pair = pair.clone();
// NOTE: We adopt the convention to reuse the `process_id` as the tla module name
let tla_module = format!("{}_Apalache.tla", update.process_id);
let tla_module = get_tla_module_path(&tla_module);

let handle = thread::spawn(move || {
check_tla_code_link(
&apalache,
PredicateDescription {
tla_module,
transition_predicate: "Next".to_string(),
predicate_parameters: Vec::new(),
},
pair,
constants,
)
});
handles.push(handle);
// A poor man's parallel_map; process up to MAX_THREADS state pairs in parallel. Use mpsc channels
// to signal threads becoming available.
const MAX_THREADS: usize = 20;
let mut running_threads = 0;
let (thread_freed_tx, thread_freed_rx) = mpsc::channel::<()>();
for (i, (update, constants, pair)) in all_pairs.iter().enumerate() {
println!("Checking state pair #{}", i + 1);
if running_threads >= MAX_THREADS {
thread_freed_rx
.recv()
.expect("Error while waiting for the thread completion signal");
running_threads -= 1;
}
for handle in handles {
handle.join().unwrap().unwrap_or_else(|e| {

let thread_freed_rx = thread_freed_tx.clone();
let apalache = apalache.clone();
let constants = constants.clone();
let pair = pair.clone();
// NOTE: We adopt the convention to reuse the `process_id` as the tla module name
let tla_module = format!("{}_Apalache.tla", update.process_id);
let tla_module = get_tla_module_path(&tla_module);

running_threads += 1;
let _handle = thread::spawn(move || {
let _ = check_tla_code_link(
&apalache,
PredicateDescription {
tla_module,
transition_predicate: "Next".to_string(),
predicate_parameters: Vec::new(),
},
pair,
constants,
).map_err(|e| {
println!("Possible divergence from the TLA model detected when interacting with the ledger!");
println!("If you did not expect to change the interaction between governance and the ledger, reconsider whether your change is safe. You can find additional data on the step that triggered the error below.");
println!("If you are confident that your change is correct, please contact the #formal-models Slack channel and describe the problem.");
@@ -293,6 +333,16 @@ pub fn check_traces() {
println!("Apalache returned:\n{:#?}", e.apalache_error);
panic!("Apalache check failed")
});
}
thread_freed_rx
.send(())
.expect("Couldn't send the thread completion signal");
});
}

while running_threads > 0 {
thread_freed_rx
.recv()
.expect("Error while waiting for the thread completion signal");
running_threads -= 1;
}
}
