From f8ab279f821a92602bf7cd2439f373dc721b6cab Mon Sep 17 00:00:00 2001
From: eschorn1 <eric.schorn@nccgroup.com>
Date: Sat, 23 Mar 2024 08:26:53 -0500
Subject: [PATCH] ct_cm4 update

---
 Cargo.toml           |  6 ++--
 benches/benchmark.rs | 29 +++++++++++++----
 ct_cm4/Cargo.toml    | 20 +++++++++---
 ct_cm4/Embed.toml    | 14 ++++++++
 ct_cm4/README.md     | 43 +++++--------------------
 ct_cm4/openocd.gdb   | 37 ---------------------
 ct_cm4/src/main.rs   | 76 ++++++++++++++++++--------------------------
 src/helpers.rs       |  3 +-
 src/sampling.rs      | 74 +++++++++++++++++++++++++-----------------
 src/types.rs         |  4 +--
 10 files changed, 140 insertions(+), 166 deletions(-)
 create mode 100644 ct_cm4/Embed.toml
 delete mode 100644 ct_cm4/openocd.gdb

diff --git a/Cargo.toml b/Cargo.toml
index e51f3f5..e8314c2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -44,9 +44,9 @@ harness = false
 
 [profile.dev]
 debug = true
-lto = true
-opt-level = 3
-codegen-units = 1
+#lto = true
+#opt-level = 3
+#codegen-units = 1
 
 
 [profile.release]
diff --git a/benches/benchmark.rs b/benches/benchmark.rs
index 90c0ec8..648f9b1 100644
--- a/benches/benchmark.rs
+++ b/benches/benchmark.rs
@@ -9,8 +9,11 @@ struct BenchRng();
 
 impl RngCore for BenchRng {
     fn next_u32(&mut self) -> u32 { unimplemented!() }
+
     fn next_u64(&mut self) -> u64 { unimplemented!() }
+
     fn fill_bytes(&mut self, out: &mut [u8]) { out.iter_mut().for_each(|b| *b = 0); }
+
     fn try_fill_bytes(&mut self, out: &mut [u8]) -> Result<(), rand_core::Error> {
         self.fill_bytes(out);
         Ok(())
@@ -31,13 +34,25 @@ pub fn criterion_benchmark(c: &mut Criterion) {
     let (ek_1024, dk_1024) = ml_kem_1024::KG::try_keygen_with_rng_vt(&mut bench_rng).unwrap();
     let (_, ct_1024) = ek_1024.try_encaps_vt().unwrap();
 
-    c.bench_function("ml_kem_512  KeyGen", |b| b.iter(|| ml_kem_512::KG::try_keygen_with_rng_vt(&mut bench_rng)));
-    c.bench_function("ml_kem_768  KeyGen", |b| b.iter(|| ml_kem_768::KG::try_keygen_with_rng_vt(&mut bench_rng)));
-    c.bench_function("ml_kem_1024 KeyGen", |b| b.iter(|| ml_kem_1024::KG::try_keygen_with_rng_vt(&mut bench_rng)));
-
-    c.bench_function("ml_kem_512  Encaps", |b| b.iter(|| ek_512.try_encaps_with_rng_vt(&mut bench_rng)));
-    c.bench_function("ml_kem_768  Encaps", |b| b.iter(|| ek_768.try_encaps_with_rng_vt(&mut bench_rng)));
-    c.bench_function("ml_kem_1024 Encaps", |b| b.iter(|| ek_1024.try_encaps_with_rng_vt(&mut bench_rng)));
+    c.bench_function("ml_kem_512  KeyGen", |b| {
+        b.iter(|| ml_kem_512::KG::try_keygen_with_rng_vt(&mut bench_rng))
+    });
+    c.bench_function("ml_kem_768  KeyGen", |b| {
+        b.iter(|| ml_kem_768::KG::try_keygen_with_rng_vt(&mut bench_rng))
+    });
+    c.bench_function("ml_kem_1024 KeyGen", |b| {
+        b.iter(|| ml_kem_1024::KG::try_keygen_with_rng_vt(&mut bench_rng))
+    });
+
+    c.bench_function("ml_kem_512  Encaps", |b| {
+        b.iter(|| ek_512.try_encaps_with_rng_vt(&mut bench_rng))
+    });
+    c.bench_function("ml_kem_768  Encaps", |b| {
+        b.iter(|| ek_768.try_encaps_with_rng_vt(&mut bench_rng))
+    });
+    c.bench_function("ml_kem_1024 Encaps", |b| {
+        b.iter(|| ek_1024.try_encaps_with_rng_vt(&mut bench_rng))
+    });
 
     c.bench_function("ml_kem_512  Decaps", |b| b.iter(|| dk_512.try_decaps_vt(&ct_512)));
     c.bench_function("ml_kem_768  Decaps", |b| b.iter(|| dk_768.try_decaps_vt(&ct_768)));
diff --git a/ct_cm4/Cargo.toml b/ct_cm4/Cargo.toml
index c310a22..432c3b0 100644
--- a/ct_cm4/Cargo.toml
+++ b/ct_cm4/Cargo.toml
@@ -10,13 +10,23 @@ edition = "2021"
 
 [dependencies]
 fips203 = { path = "..", default-features = false, features = ["ml-kem-512"] }
-cortex-m-semihosting = "0.5.0"
-panic-semihosting = { version = "0.6.0", features = ["exit"] }
 cortex-m = { version = "0.7.7", features = ["critical-section-single-core"] }
-cortex-m-rt = "0.6.15"  # Required by 'most recent' version of stm32f3-discovery below
-stm32f3-discovery = "0.7.2"
-panic-itm = "0.4.2"
+cortex-m-rt = "0.7.3"
+panic-rtt-target = { version = "0.1.2", features = ["cortex-m"] }
+microbit-v2 = "0.13.0"
+rtt-target = { version = "0.5.0" } #, features = ["cortex-m"] }
 rand_core = { version = "0.6.4", default-features = false }
+hex-literal = "0.4.1"
+rand_chacha = { version = "0.3.1", default-features = false }
+
 
 [profile.dev]
+debug = true
+debug-assertions = false
+overflow-checks = false
+lto = true
 opt-level = 3
+codegen-units = 1
+
+
+# cargo update -p fixed@1.26.0 --precise 1.23.1
\ No newline at end of file
diff --git a/ct_cm4/Embed.toml b/ct_cm4/Embed.toml
new file mode 100644
index 0000000..8c4e687
--- /dev/null
+++ b/ct_cm4/Embed.toml
@@ -0,0 +1,14 @@
+[default.general]
+chip = "nrf52833_xxAA"
+
+#[default.reset]
+#halt_afterwards = true
+
+[default.rtt]
+enabled = true
+
+[default.gdb]
+enabled = false
+
+[default.probe]
+protocol = "Swd"
\ No newline at end of file
diff --git a/ct_cm4/README.md b/ct_cm4/README.md
index b3c1051..064726c 100644
--- a/ct_cm4/README.md
+++ b/ct_cm4/README.md
@@ -1,37 +1,10 @@
-An example for the STM Discovery Board -- https://docs.rust-embedded.org/discovery/f3discovery/index.html
+An example for the Microbit v2 Board -- <https://docs.rust-embedded.org/discovery/microbit/index.html>
 
-One-off setup:
+This example demonstrates the full loop of keygen, encaps then decaps functionality.
+Cycle counts are measured and displayed, and the operation is confirmed to be constant-time.  
+See the link above for tooling setup.
 
-~~~
-rustup target add thumbv7em-none-eabihf
-rustup component add llvm-tools-preview
-~~~
-
-You will need to be running with two windows in parallel.
-
-1. In the first window:
-
-   ~~~
-   $ cd ct_cm4   # <here>
-   $ cargo build --target thumbv7em-none-eabihf
-   $ cargo readobj --target thumbv7em-none-eabihf --bin ct_cm4-fips203 -- --file-header  # double-checks built object
-   $ cargo size --bin ct_cm4-fips203 --release -- -A
-   ~~~
-
-2. In the second window:
-
-   ~~~
-   $ cd /tmp && openocd -f interface/stlink-v2-1.cfg -f target/stm32f3x.cfg
-   ~~~
-
-3. Back to the first window:
-
-   ~~~
-   $ cargo run
-
-   then:
-      layout src
-      break k_pke.rs:29
-      continue
-      s
-   ~~~
+ ~~~
+ $ cd ct_cm4   # <here>
+ $ cargo embed
+ ~~~
diff --git a/ct_cm4/openocd.gdb b/ct_cm4/openocd.gdb
deleted file mode 100644
index 4c9ff3f..0000000
--- a/ct_cm4/openocd.gdb
+++ /dev/null
@@ -1,37 +0,0 @@
-# Connect to gdb remote server
-target remote :3333
-
-# Load will flash the code
-load
-
-# Enable demangling asm names on disassembly
-set print asm-demangle on
-
-# Enable pretty printing
-set print pretty on
-
-# Disable style sources as the default colors can be hard to read
-set style sources off
-
-# Initialize monitoring so iprintln! macro output
-# is sent from the itm port to itm.txt
-monitor tpiu config internal itm.txt uart off 8000000
-
-# Turn on the itm port and semihosting
-monitor itm port 0 on
-monitor arm semihosting enable
-
-# Set a breakpoint at main, aka entry
-break main
-
-# Set a breakpoint at DefaultHandler
-break DefaultHandler
-
-# Set a breakpoint at HardFault
-break HardFault
-
-# Continue running until we hit the main breakpoint
-continue
-
-# Step from the trampoline code in entry into main
-step
diff --git a/ct_cm4/src/main.rs b/ct_cm4/src/main.rs
index 6658494..ec5fbaa 100644
--- a/ct_cm4/src/main.rs
+++ b/ct_cm4/src/main.rs
@@ -1,67 +1,53 @@
 #![no_std]
 #![no_main]
 
-use cortex_m::peripheral::DWT;
+use cortex_m::asm;
 use cortex_m_rt::entry;
 use fips203::ml_kem_512;
-use fips203::traits::KeyGen;
-use rand_core::{CryptoRng, RngCore};
-use stm32f3_discovery::leds::Leds;
-use stm32f3_discovery::stm32f3xx_hal::{pac, prelude::*};
-use stm32f3_discovery::switch_hal::ToggleableOutputSwitch;
+use fips203::traits::{Decaps, Encaps, KeyGen, SerDes};
+use microbit::{board::Board, hal::{pac::DWT, prelude::OutputPin}};
+use rand_chacha::rand_core::SeedableRng;
+use rtt_target::{rprintln, rtt_init_print};
 
-
-// Dummy RNG that regurgitates zeros when 'asked'
-struct MyRng();
-impl RngCore for MyRng {
-    fn next_u32(&mut self) -> u32 { unimplemented!() }
-    fn next_u64(&mut self) -> u64 { unimplemented!() }
-    fn fill_bytes(&mut self, out: &mut [u8]) { out.iter_mut().for_each(|b| *b = 0); }
-    fn try_fill_bytes(&mut self, out: &mut [u8]) -> Result<(), rand_core::Error> {
-        self.fill_bytes(out);
-        Ok(())
-    }
-}
-impl CryptoRng for MyRng {}
-
-
-#[panic_handler]
-fn panic(_info: &core::panic::PanicInfo) -> ! { loop {} }
+use panic_rtt_target as _;
 
 
 #[entry]
 fn main() -> ! {
-
-    // Configure MCU
-    let device_peripherals = pac::Peripherals::take().unwrap();
-    let mut reset_and_clock_control = device_peripherals.RCC.constrain();
-
-    // Initialize LEDs
-    let mut gpioe = device_peripherals.GPIOE.split(&mut reset_and_clock_control.ahb);
-    #[rustfmt::skip]
-    let mut leds = Leds::new(gpioe.pe8, gpioe.pe9, gpioe.pe10, gpioe.pe11, gpioe.pe12,
-        gpioe.pe13, gpioe.pe14, gpioe.pe15, &mut gpioe.moder, &mut gpioe.otyper).into_array();
-
-    let mut my_rng = MyRng {};
+    let mut board = Board::take().unwrap();
+    board.DCB.enable_trace();
+    board.DWT.enable_cycle_counter();
+    board.display_pins.col1.set_low().unwrap();
+    rtt_init_print!();
+
+    let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(123);
+    let mut expected_cycles = 0;
     let mut i = 0u32;
 
     loop {
-        if (i % 10) == 0 { leds[0].toggle().ok(); };
+        if (i % 100) == 0 { board.display_pins.row1.set_high().unwrap(); };
+        if (i % 100) == 50 { board.display_pins.row1.set_low().unwrap(); };
         i += 1;
 
-        cortex_m::asm::isb();
+        rng.set_word_pos(1024 * i as u128);  // Removes odd variability in drawing rng data
+
+        asm::isb();
         let start = DWT::cycle_count();
-        cortex_m::asm::isb();
+        asm::isb();
 
-        let _res1 = ml_kem_512::KG::try_keygen_with_rng_vt(&mut my_rng);
+        let (ek, dk) = ml_kem_512::KG::try_keygen_with_rng_vt(&mut rng).unwrap();
+        let (ssk1, ct) = ek.try_encaps_with_rng_vt(&mut rng).unwrap();
+        let ssk2 = dk.try_decaps_vt(&ct).unwrap();
 
-        cortex_m::asm::isb();
+        asm::isb();
         let finish = DWT::cycle_count();
-        cortex_m::asm::isb();
+        asm::isb();
+
+        assert_eq!(ssk1.into_bytes(), ssk2.into_bytes());
 
-        // Code will 'soon' present the cycle counts via semi-hosting,
-        // and will also include encaps/decaps cycle
-        let _count = finish - start;
-        // print_semi("Top", _count);
+        let count = finish - start;
+        if (i == 5) & (expected_cycles == 0) { expected_cycles = count };
+        if (i > 5) & (count != expected_cycles) { panic!("Non constant-time operation!!") };
+        if i % 10 == 0 { rprintln!("Iteration {} cycle count: {}", i, count); }
     }
 }
diff --git a/src/helpers.rs b/src/helpers.rs
index 38bf2ec..56132ea 100644
--- a/src/helpers.rs
+++ b/src/helpers.rs
@@ -1,6 +1,5 @@
 use sha3::{Digest, Sha3_256, Sha3_512, Shake128, Shake256};
-use sha3::digest::{ExtendableOutput, XofReader};
-use sha3::digest::Update;
+use sha3::digest::{ExtendableOutput, Update, XofReader};
 
 use crate::ntt::multiply_ntts;
 use crate::Q;
diff --git a/src/sampling.rs b/src/sampling.rs
index 0cd474e..0ce3699 100644
--- a/src/sampling.rs
+++ b/src/sampling.rs
@@ -19,8 +19,12 @@ pub(crate) fn sample_ntt(mut byte_stream_b: impl XofReader) -> [Z; 256] {
     // 2: j ← 0
     let mut j = 0;
 
-    // 3: while j < 256 do
-    while j < 256 {
+    // The original sampling loop has inherent timing variability based on the need to reject
+    // `d1` >= `Q` per step 6+ along with `d2` >= `Q` per step 10+. The adapted loop below does
+    // "too much, but a constant amount of work" with the 384-256 margin impacting performance.
+    // 3: while j < 256 do  --> adapted for constant-time operation; TODO: loop count vs fail odds
+    #[allow(clippy::cast_possible_truncation)]  // mask as u16
+    for _k in 0..384 {
         //
         byte_stream_b.read(&mut bbb); // Draw 3 bytes
 
@@ -31,34 +35,33 @@ pub(crate) fn sample_ntt(mut byte_stream_b: impl XofReader) -> [Z; 256] {
         let d2 = (u32::from(bbb[1]) >> 4) + 16 * u32::from(bbb[2]);
 
         // 6: if d1 < q then
-        if d1 < Q {
-            //
-            // 7: a_hat[j] ← d1         ▷ a_hat ∈ Z256
-            let mut ah = Z::default();
-            ah.set_u16(u16::try_from(d1).unwrap());
-            array_a_hat[j] = ah;
+        let mask = usize::from((d1 < Q) & (j < 256)).wrapping_neg();
+        //
+        // 7: a_hat[j] ← d1         ▷ a_hat ∈ Z256
+        let mut ah = Z::default();
+        ah.set_u16(u16::try_from(d1).unwrap() & (mask as u16));
+        array_a_hat[j & 0xff] = array_a_hat[j & 0xFF].add(ah);
 
-            // 8: j ← j+1
-            j += 1;
+        // 8: j ← j+1
+        j += 1 & mask;
 
-            // 9: end if
-        }
+        // 9: end if
 
         // 10: if d2 < q and j < 256 then
-        if (d2 < Q) & (j < 256) {
-            //
-            // 11: a_hat[j] ← d2
-            let mut ah = Z::default();
-            ah.set_u16(u16::try_from(d2).unwrap());
-            array_a_hat[j] = ah;
+        let mask2 = usize::from((d2 < Q) & (j < 256)).wrapping_neg();
 
-            // 12: j ← j+1
-            j += 1;
+        // 11: a_hat[j] ← d2
+        let mut ah = Z::default();
+        ah.set_u16(u16::try_from(d2).unwrap() & (mask2 as u16));
+        array_a_hat[j & 0xFF] = array_a_hat[j & 0xFF].add(ah);
 
-            // 13: end if
-        }
+        // 12: j ← j+1
+        j += 1 & mask2;
+
+        // 13: end if
 
         // 14: i ← i+3  (not needed as we draw 3 more bytes next time
+
         // 15: end while
     }
 
@@ -81,17 +84,16 @@ pub(crate) fn sample_poly_cbd(eta: u32, byte_array_b: &[u8]) -> [Z; 256] {
     let mut int_index = 0;
     let mut bit_index = 0;
     for byte in byte_array_b {
-        temp |= u64::from(*byte) << bit_index;
+        temp |= u32::from(*byte) << bit_index;
         bit_index += 8;
-        #[allow(clippy::cast_possible_truncation)]
         while bit_index >= 2 * (eta as usize) {
-            let tmask_x = temp & (2u64.pow(eta) - 1);
-            let x = (tmask_x as u8).count_ones();
-            let tmask_y = (temp >> eta) & (2u64.pow(eta) - 1);
-            let y = (tmask_y as u8).count_ones();
+            let tmask_x = temp & ((1 << eta) - 1);
+            let x = count_ones(tmask_x);
+            let tmask_y = (temp >> eta) & ((1 << eta) - 1);
+            let y = count_ones(tmask_y);
             let (mut xx, mut yy) = (Z::default(), Z::default());
-            xx.set_u16(x as u16);
-            yy.set_u16(y as u16);
+            xx.set_u16(x);
+            yy.set_u16(y);
             array_f[int_index] = xx.sub(yy);
             bit_index -= 2 * (eta as usize);
             temp >>= 2 * (eta as usize);
@@ -102,6 +104,18 @@ pub(crate) fn sample_poly_cbd(eta: u32, byte_array_b: &[u8]) -> [Z; 256] {
 }
 
 
+// Count the set bits in the low 8 bits of `x` in constant time (u32 arithmetic helps perf)
+#[allow(clippy::cast_possible_truncation)]  // return res as u16
+fn count_ones(x: u32) -> u16 {
+    let (mut res, mut x) = (x & 0xFF, x & 0xFF);
+    for _i in 1..8 {
+        x >>= 1;
+        res -= x;
+    }
+    res as u16
+}
+
+
 // The original pseudocode for Algorithm 7 follows...
 // Algorithm 7 `SamplePolyCBDη(B)` on page 20.
 // If the input is a stream of uniformly random bytes, outputs a sample from the distribution `D_η(R_q)`.
diff --git a/src/types.rs b/src/types.rs
index ea2a0eb..dd03c3e 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -28,6 +28,7 @@ pub struct CipherText<const CT_LEN: usize>(pub(crate) [u8; CT_LEN]);
 #[derive(Clone, Copy, Default)]
 pub(crate) struct Z(u16);
 
+
 #[allow(clippy::inline_always)]
 impl Z {
     const M: u64 = 2u64.pow(32) / (Self::Q64);
@@ -45,8 +46,7 @@ impl Z {
     pub(crate) fn add(self, other: Self) -> Self {
         let sum = self.0.wrapping_add(other.0);
         let (trial, borrow) = sum.overflowing_sub(Self::Q16);
-        let select_sum = u16::from(borrow).wrapping_neg();
-        let result = (!select_sum & trial) | (select_sum & sum);
+        let result = trial.wrapping_add(u16::from(borrow).wrapping_neg() & Self::Q16);
         Self(result)
     }