feat(IC-1579): TLA instrumentation for merge_neuron (#2341)
Adds a TLA model of `merge_neurons` and links it to the code.

Since the only Rust integration tests that exercise `merge_neurons` are
proptests, which generate a large number of traces, this introduces a limit
so that only a prefix (currently the first 30) of the state pairs from the
collected traces is checked. It also makes the Apalache checks more parallel
by using channels to signal completion as soon as a worker thread becomes
available, instead of waiting for a whole batch to finish.
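
For illustration, here is a minimal, self-contained sketch of the channel-based
throttling pattern described above (the names `work_items`, `MAX_THREADS`, and the
dummy workload are illustrative placeholders, not code from this commit): each
worker signals on an mpsc channel when it finishes, so the main loop can start the
next check as soon as a slot frees up rather than joining a whole batch.

```rust
use std::sync::mpsc;
use std::thread;

fn main() {
    // Hypothetical stand-ins for the (update, constants, state-pair) triples to check.
    let work_items: Vec<u64> = (0..100).collect();

    const MAX_THREADS: usize = 20;
    let (done_tx, done_rx) = mpsc::channel::<()>();
    let mut running = 0;

    for item in work_items {
        // Block until one worker finishes instead of joining a whole batch.
        if running >= MAX_THREADS {
            done_rx.recv().expect("worker disconnected");
            running -= 1;
        }

        let done_tx = done_tx.clone();
        running += 1;
        thread::spawn(move || {
            // Stand-in for checking a single state pair.
            let _result = item * 2;
            // Signal that a slot is free again.
            let _ = done_tx.send(());
        });
    }

    // Drain completion signals for the workers still in flight.
    while running > 0 {
        done_rx.recv().expect("worker disconnected");
        running -= 1;
    }
}
```

Compared to chunked joins, the slowest check in a batch no longer holds back the
remaining slots.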

---------

Co-authored-by: IDX GitHub Automation <infra+github-automation@dfinity.org>
oggy-dfin and IDX GitHub Automation authored Nov 29, 2024
1 parent b7bb757 commit 7a20299
Showing 12 changed files with 659 additions and 96 deletions.
25 changes: 21 additions & 4 deletions rs/nns/governance/src/governance.rs
@@ -138,8 +138,8 @@ pub mod tla;

#[cfg(feature = "tla")]
pub use tla::{
claim_neuron_desc, split_neuron_desc, tla_update_method, InstrumentationState, ToTla,
TLA_INSTRUMENTATION_STATE, TLA_TRACES_LKEY, TLA_TRACES_MUTEX,
tla_update_method, InstrumentationState, ToTla, CLAIM_NEURON_DESC, MERGE_NEURONS_DESC,
SPLIT_NEURON_DESC, TLA_INSTRUMENTATION_STATE, TLA_TRACES_LKEY, TLA_TRACES_MUTEX,
};

// 70 KB (for executing NNS functions that are not canister upgrades)
@@ -2670,7 +2670,7 @@ impl Governance {
/// stake.
/// - The amount to split minus the transfer fee is more than the minimum
/// stake.
#[cfg_attr(feature = "tla", tla_update_method(split_neuron_desc()))]
#[cfg_attr(feature = "tla", tla_update_method(SPLIT_NEURON_DESC.clone()))]
pub async fn split_neuron(
&mut self,
id: &NeuronId,
@@ -2926,6 +2926,7 @@ impl Governance {
/// it will be merged into the stake of the target neuron; if it is less
/// than the transaction fee, the maturity of the source neuron will
/// still be merged into the maturity of the target neuron.
#[cfg_attr(feature = "tla", tla_update_method(MERGE_NEURONS_DESC.clone()))]
pub async fn merge_neurons(
&mut self,
id: &NeuronId,
@@ -2965,13 +2966,29 @@ impl Governance {

// Step 4: burn neuron fees if needed.
if let Some(source_burn_fees) = effect.source_burn_fees() {
tla_log_locals! {
source_neuron_id: effect.source_neuron_id().id,
target_neuron_id: effect.target_neuron_id().id,
fees_amount: effect.source_burn_fees().map_or(0, |f| f.amount_e8s),
amount_to_target: effect.stake_transfer().map_or(0, |t| t.amount_to_target_e8s)
}
tla_log_label!("MergeNeurons_Burn");

source_burn_fees
.burn_neuron_fees_with_ledger(&*self.ledger, &mut self.neuron_store, now)
.await?;
}

// Step 5: transfer the stake if needed.
if let Some(stake_transfer) = effect.stake_transfer() {
tla_log_locals! {
source_neuron_id: effect.source_neuron_id().id,
target_neuron_id: effect.target_neuron_id().id,
fees_amount: effect.source_burn_fees().map_or(0, |f| f.amount_e8s),
amount_to_target: effect.stake_transfer().map_or(0, |t| t.amount_to_target_e8s)
}
tla_log_label!("MergeNeurons_Stake");

stake_transfer
.transfer_neuron_stake_with_ledger(&*self.ledger, &mut self.neuron_store, now)
.await?;
@@ -6138,7 +6155,7 @@ impl Governance {
/// the neuron and lock it before we make the call, we know that any
/// concurrent call to mutate the same neuron will need to wait for this
/// one to finish before proceeding.
#[cfg_attr(feature = "tla", tla_update_method(claim_neuron_desc()))]
#[cfg_attr(feature = "tla", tla_update_method(CLAIM_NEURON_DESC.clone()))]
async fn claim_neuron(
&mut self,
subaccount: Subaccount,
7 changes: 7 additions & 0 deletions rs/nns/governance/src/governance/ledger_helper.rs
@@ -8,6 +8,11 @@ use crate::{
use ic_nervous_system_common::ledger::IcpLedger;
use ic_nns_common::pb::v1::NeuronId;

#[cfg(feature = "tla")]
use super::tla::TLA_INSTRUMENTATION_STATE;
#[cfg(feature = "tla")]
use tla_instrumentation_proc_macros::tla_function;

/// An object that represents the burning of neuron fees.
#[derive(Clone, PartialEq, Debug)]
pub struct BurnNeuronFeesOperation {
@@ -19,6 +24,7 @@ impl BurnNeuronFeesOperation {
/// Burns the neuron fees by calling ledger and changing the neuron. Recoverable errors are
/// returned while unrecoverable errors cause a panic. A neuron lock should be held before
/// calling this.
#[cfg_attr(feature = "tla", tla_function)]
pub async fn burn_neuron_fees_with_ledger(
self,
ledger: &dyn IcpLedger,
@@ -80,6 +86,7 @@ pub struct NeuronStakeTransferOperation {
impl NeuronStakeTransferOperation {
/// Transfers the stake from one neuron to another by calling ledger and changing the neurons.
/// Recoverable errors are returned while unrecoverable errors cause a panic.
#[cfg_attr(feature = "tla", tla_function)]
pub async fn transfer_neuron_stake_with_ledger(
self,
ledger: &dyn IcpLedger,
43 changes: 23 additions & 20 deletions rs/nns/governance/src/governance/tla/claim_neuron.rs
@@ -1,27 +1,30 @@
use lazy_static::lazy_static;
use tla_instrumentation::{Label, TlaConstantAssignment, ToTla, Update, VarAssignment};

use super::common::default_account;
use super::{extract_common_constants, post_process_trace};

pub fn claim_neuron_desc() -> Update {
const PID: &str = "Claim_Neuron";
let default_locals = VarAssignment::new()
.add("account", default_account())
.add("neuron_id", 0_u64.to_tla_value());
lazy_static! {
pub static ref CLAIM_NEURON_DESC: Update = {
const PID: &str = "Claim_Neuron";
let default_locals = VarAssignment::new()
.add("account", default_account())
.add("neuron_id", 0_u64.to_tla_value());

Update {
default_start_locals: default_locals.clone(),
default_end_locals: default_locals,
start_label: Label::new("ClaimNeuron1"),
end_label: Label::new("Done"),
process_id: PID.to_string(),
canister_name: "governance".to_string(),
post_process: |trace| {
let constants = TlaConstantAssignment {
constants: extract_common_constants(PID, trace).into_iter().collect(),
};
post_process_trace(trace);
constants
},
}
Update {
default_start_locals: default_locals.clone(),
default_end_locals: default_locals,
start_label: Label::new("ClaimNeuron1"),
end_label: Label::new("Done"),
process_id: PID.to_string(),
canister_name: "governance".to_string(),
post_process: |trace| {
let constants = TlaConstantAssignment {
constants: extract_common_constants(PID, trace).into_iter().collect(),
};
post_process_trace(trace);
constants
},
}
};
}
34 changes: 34 additions & 0 deletions rs/nns/governance/src/governance/tla/merge_neurons.rs
@@ -0,0 +1,34 @@
use super::{account_to_tla, extract_common_constants, post_process_trace};
use crate::governance::governance_minting_account;
use lazy_static::lazy_static;
use tla_instrumentation::{Label, TlaConstantAssignment, ToTla, Update, VarAssignment};

const PID: &str = "Merge_Neurons";
lazy_static! {
pub static ref MERGE_NEURONS_DESC: Update = {
let default_locals = VarAssignment::new()
.add("source_neuron_id", 0_u64.to_tla_value())
.add("target_neuron_id", 0_u64.to_tla_value())
.add("fees_amount", 0_u64.to_tla_value())
.add("amount_to_target", 0_u64.to_tla_value());
Update {
default_start_locals: default_locals.clone(),
default_end_locals: default_locals,
start_label: Label::new("MergeNeurons_Start"),
end_label: Label::new("Done"),
process_id: PID.to_string(),
canister_name: "governance".to_string(),
post_process: |trace| {
let mut constants = TlaConstantAssignment {
constants: extract_common_constants(PID, trace).into_iter().collect(),
};
post_process_trace(trace);
constants.constants.insert(
"Minting_Account_Id".to_string(),
account_to_tla(governance_minting_account()),
);
constants
},
}
};
}
146 changes: 98 additions & 48 deletions rs/nns/governance/src/governance/tla/mod.rs
@@ -1,9 +1,7 @@
use itertools::Itertools;
use std::{
collections::{BTreeMap, BTreeSet},
ops::Deref,
thread,
};
use std::collections::{BTreeMap, BTreeSet, HashSet};
use std::hash::Hash;
use std::sync::mpsc;
use std::thread;

use super::Governance;
use crate::storage::{with_stable_neuron_indexes, with_stable_neuron_store};
@@ -26,15 +24,18 @@ pub use common::{account_to_tla, opt_subaccount_to_tla, subaccount_to_tla};
use common::{function_domain_union, governance_account_id};
pub use store::{TLA_INSTRUMENTATION_STATE, TLA_TRACES_LKEY, TLA_TRACES_MUTEX};

mod split_neuron;
pub use split_neuron::split_neuron_desc;
mod claim_neuron;
pub use claim_neuron::claim_neuron_desc;
mod merge_neurons;
mod split_neuron;

pub use claim_neuron::CLAIM_NEURON_DESC;
pub use merge_neurons::MERGE_NEURONS_DESC;
pub use split_neuron::SPLIT_NEURON_DESC;

fn neuron_global(gov: &Governance) -> TlaValue {
let neuron_map: BTreeMap<u64, TlaValue> = with_stable_neuron_store(|store| {
gov.neuron_store.with_active_neurons_iter(|iter| {
iter.map(|n| n.deref().clone())
iter.map(|n| (*n).clone())
.chain(store.range_neurons(std::ops::RangeFull))
.map(|neuron| {
(
@@ -207,29 +208,50 @@ fn get_tla_module_path(module: &str) -> PathBuf {
})
}

fn dedup_by_key<E, K, F>(vec: &mut Vec<E>, mut key_selector: F)
where
F: FnMut(&E) -> K,
K: Eq + Hash,
{
let mut seen_keys = HashSet::new();
vec.retain(|element| seen_keys.insert(key_selector(element)));
}

/// Checks a trace against the model.
///
/// It's assumed that the corresponding model is called `<PID>_Apalache.tla`, where PID is the
/// `process_id` field used in the `Update` value for the corresponding method.
pub fn check_traces() {
// Large states make Apalache time and memory consumption explode. We'll look at
// improving that later, for now we introduce a hard limit on the state size, and
// skip checking states larger than the limit
// skip checking states larger than the limit. The limit is a somewhat arbitrary
// number based on what we observed in the tests. We saw that states with 1000+ atoms take
// a long time to process, whereas most manual tests yield states of size 100 or so.
const STATE_SIZE_LIMIT: u64 = 500;
// Proptests generate lots of traces (and thus state pairs) that make the Apalache testing very long.
// Again, limit this to some arbitrary number where checking is still reasonably fast.
// Note that this is effectively a per-test limit due to how `check_traces` is normally used.
const STATE_PAIR_COUNT_LIMIT: usize = 30;
fn is_under_limit(p: &ResolvedStatePair) -> bool {
p.start.size() < STATE_SIZE_LIMIT && p.end.size() < STATE_SIZE_LIMIT
}

fn print_stats(traces: &Vec<UpdateTrace>) {
let mut total_pairs = 0;
println!("Checking {} traces with TLA/Apalache", traces.len());
for t in traces {
let total_len = t.state_pairs.len();
total_pairs += total_len;
let under_limit_len = t.state_pairs.iter().filter(|p| is_under_limit(p)).count();
println!(
"TLA/Apalache checks: keeping {}/{} states for update {}",
"TLA/Apalache checks: keeping {}/{} state pairs for update {}",
under_limit_len, total_len, t.update.process_id
);
}
println!(
"Total of {} state pairs to be checked with Apalache; will retain {}",
total_pairs, STATE_PAIR_COUNT_LIMIT
)
}

let traces = {
@@ -238,6 +260,26 @@ pub fn check_traces() {
std::mem::take(&mut (*t))
};

print_stats(&traces);

let mut all_pairs = traces
.into_iter()
.flat_map(|t| {
t.state_pairs
.into_iter()
.filter(is_under_limit)
.map(move |p| (t.update.clone(), t.constants.clone(), p))
})
.collect();

// A quick check that we don't have any duplicate state pairs. We assume the constants should
// be the same anyway and look at just the process ID and the states themselves.
dedup_by_key(&mut all_pairs, |(u, _c, p)| {
(u.process_id.clone(), p.start.clone(), p.end.clone())
});

all_pairs.truncate(STATE_PAIR_COUNT_LIMIT);

set_java_path();

let apalache = std::env::var("TLA_APALACHE_BIN")
@@ -248,42 +290,40 @@ pub fn check_traces() {
panic!("bad apalache bin from 'TLA_APALACHE_BIN': '{:?}'", apalache);
}

print_stats(&traces);

let chunk_size = 20;
let all_pairs = traces.into_iter().flat_map(|t| {
t.state_pairs
.into_iter()
.filter(is_under_limit)
.map(move |p| (t.update.clone(), t.constants.clone(), p))
});
let chunks = all_pairs.chunks(chunk_size);
for chunk in &chunks {
let mut handles = vec![];
for (update, constants, pair) in chunk {
let apalache = apalache.clone();
let constants = constants.clone();
let pair = pair.clone();
// NOTE: We adopt the convention to reuse the `process_id` as the tla module name
let tla_module = format!("{}_Apalache.tla", update.process_id);
let tla_module = get_tla_module_path(&tla_module);

let handle = thread::spawn(move || {
check_tla_code_link(
&apalache,
PredicateDescription {
tla_module,
transition_predicate: "Next".to_string(),
predicate_parameters: Vec::new(),
},
pair,
constants,
)
});
handles.push(handle);
// A poor man's parallel_map; process up to MAX_THREADS state pairs in parallel. Use mpsc channels
// to signal threads becoming available.
const MAX_THREADS: usize = 20;
let mut running_threads = 0;
let (thread_freed_tx, thread_freed_rx) = mpsc::channel::<()>();
for (i, (update, constants, pair)) in all_pairs.iter().enumerate() {
println!("Checking state pair #{}", i + 1);
if running_threads >= MAX_THREADS {
thread_freed_rx
.recv()
.expect("Error while waiting for the thread completion signal");
running_threads -= 1;
}
for handle in handles {
handle.join().unwrap().unwrap_or_else(|e| {

let thread_freed_rx = thread_freed_tx.clone();
let apalache = apalache.clone();
let constants = constants.clone();
let pair = pair.clone();
// NOTE: We adopt the convention to reuse the `process_id` as the tla module name
let tla_module = format!("{}_Apalache.tla", update.process_id);
let tla_module = get_tla_module_path(&tla_module);

running_threads += 1;
let _handle = thread::spawn(move || {
let _ = check_tla_code_link(
&apalache,
PredicateDescription {
tla_module,
transition_predicate: "Next".to_string(),
predicate_parameters: Vec::new(),
},
pair,
constants,
).map_err(|e| {
println!("Possible divergence from the TLA model detected when interacting with the ledger!");
println!("If you did not expect to change the interaction between governance and the ledger, reconsider whether your change is safe. You can find additional data on the step that triggered the error below.");
println!("If you are confident that your change is correct, please contact the #formal-models Slack channel and describe the problem.");
@@ -293,6 +333,16 @@ pub fn check_traces() {
println!("Apalache returned:\n{:#?}", e.apalache_error);
panic!("Apalache check failed")
});
}
thread_freed_rx
.send(())
.expect("Couldn't send the thread completion signal");
});
}

while running_threads > 0 {
thread_freed_rx
.recv()
.expect("Error while waiting for the thread completion signal");
running_threads -= 1;
}
}
