From f8ab279f821a92602bf7cd2439f373dc721b6cab Mon Sep 17 00:00:00 2001 From: eschorn1 Date: Sat, 23 Mar 2024 08:26:53 -0500 Subject: [PATCH] ct_cm4 update --- Cargo.toml | 6 ++-- benches/benchmark.rs | 29 +++++++++++++---- ct_cm4/Cargo.toml | 20 +++++++++--- ct_cm4/Embed.toml | 14 ++++++++ ct_cm4/README.md | 43 +++++-------------------- ct_cm4/openocd.gdb | 37 --------------------- ct_cm4/src/main.rs | 76 ++++++++++++++++++-------------------------- src/helpers.rs | 3 +- src/sampling.rs | 74 +++++++++++++++++++++++++----------------- src/types.rs | 4 +-- 10 files changed, 140 insertions(+), 166 deletions(-) create mode 100644 ct_cm4/Embed.toml delete mode 100644 ct_cm4/openocd.gdb diff --git a/Cargo.toml b/Cargo.toml index e51f3f5..e8314c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,9 +44,9 @@ harness = false [profile.dev] debug = true -lto = true -opt-level = 3 -codegen-units = 1 +#lto = true +#opt-level = 3 +#codegen-units = 1 [profile.release] diff --git a/benches/benchmark.rs b/benches/benchmark.rs index 90c0ec8..648f9b1 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -9,8 +9,11 @@ struct BenchRng(); impl RngCore for BenchRng { fn next_u32(&mut self) -> u32 { unimplemented!() } + fn next_u64(&mut self) -> u64 { unimplemented!() } + fn fill_bytes(&mut self, out: &mut [u8]) { out.iter_mut().for_each(|b| *b = 0); } + fn try_fill_bytes(&mut self, out: &mut [u8]) -> Result<(), rand_core::Error> { self.fill_bytes(out); Ok(()) @@ -31,13 +34,25 @@ pub fn criterion_benchmark(c: &mut Criterion) { let (ek_1024, dk_1024) = ml_kem_1024::KG::try_keygen_with_rng_vt(&mut bench_rng).unwrap(); let (_, ct_1024) = ek_1024.try_encaps_vt().unwrap(); - c.bench_function("ml_kem_512 KeyGen", |b| b.iter(|| ml_kem_512::KG::try_keygen_with_rng_vt(&mut bench_rng))); - c.bench_function("ml_kem_768 KeyGen", |b| b.iter(|| ml_kem_768::KG::try_keygen_with_rng_vt(&mut bench_rng))); - c.bench_function("ml_kem_1024 KeyGen", |b| b.iter(|| ml_kem_1024::KG::try_keygen_with_rng_vt(&mut bench_rng))); - - c.bench_function("ml_kem_512 Encaps", |b| b.iter(|| ek_512.try_encaps_with_rng_vt(&mut bench_rng))); - c.bench_function("ml_kem_768 Encaps", |b| b.iter(|| ek_768.try_encaps_with_rng_vt(&mut bench_rng))); - c.bench_function("ml_kem_1024 Encaps", |b| b.iter(|| ek_1024.try_encaps_with_rng_vt(&mut bench_rng))); + c.bench_function("ml_kem_512 KeyGen", |b| { + b.iter(|| ml_kem_512::KG::try_keygen_with_rng_vt(&mut bench_rng)) + }); + c.bench_function("ml_kem_768 KeyGen", |b| { + b.iter(|| ml_kem_768::KG::try_keygen_with_rng_vt(&mut bench_rng)) + }); + c.bench_function("ml_kem_1024 KeyGen", |b| { + b.iter(|| ml_kem_1024::KG::try_keygen_with_rng_vt(&mut bench_rng)) + }); + + c.bench_function("ml_kem_512 Encaps", |b| { + b.iter(|| ek_512.try_encaps_with_rng_vt(&mut bench_rng)) + }); + c.bench_function("ml_kem_768 Encaps", |b| { + b.iter(|| ek_768.try_encaps_with_rng_vt(&mut bench_rng)) + }); + c.bench_function("ml_kem_1024 Encaps", |b| { + b.iter(|| ek_1024.try_encaps_with_rng_vt(&mut bench_rng)) + }); c.bench_function("ml_kem_512 Decaps", |b| b.iter(|| dk_512.try_decaps_vt(&ct_512))); c.bench_function("ml_kem_768 Decaps", |b| b.iter(|| dk_768.try_decaps_vt(&ct_768))); diff --git a/ct_cm4/Cargo.toml b/ct_cm4/Cargo.toml index c310a22..432c3b0 100644 --- a/ct_cm4/Cargo.toml +++ b/ct_cm4/Cargo.toml @@ -10,13 +10,23 @@ edition = "2021" [dependencies] fips203 = { path = "..", default-features = false, features = ["ml-kem-512"] } -cortex-m-semihosting = "0.5.0" -panic-semihosting = { version = "0.6.0", features = ["exit"] } cortex-m = { version = "0.7.7", features = ["critical-section-single-core"] } -cortex-m-rt = "0.6.15" # Required by 'most recent' version of stm32f3-discovery below -stm32f3-discovery = "0.7.2" -panic-itm = "0.4.2" +cortex-m-rt = "0.7.3" +panic-rtt-target = { version = "0.1.2", features = ["cortex-m"] } +microbit-v2 = "0.13.0" +rtt-target = { version = "0.5.0" } #, features = ["cortex-m"] } rand_core = { version = "0.6.4", default-features = false } +hex-literal = "0.4.1" +rand_chacha = { version = "0.3.1", default-features = false } + [profile.dev] +debug = true +debug-assertions = false +overflow-checks = false +lto = true opt-level = 3 +codegen-units = 1 + + +# cargo update -p fixed@1.26.0 --precise 1.23.1 \ No newline at end of file diff --git a/ct_cm4/Embed.toml b/ct_cm4/Embed.toml new file mode 100644 index 0000000..8c4e687 --- /dev/null +++ b/ct_cm4/Embed.toml @@ -0,0 +1,14 @@ +[default.general] +chip = "nrf52833_xxAA" + +#[default.reset] +#halt_afterwards = true + +[default.rtt] +enabled = true + +[default.gdb] +enabled = false + +[default.probe] +protocol = "Swd" \ No newline at end of file diff --git a/ct_cm4/README.md b/ct_cm4/README.md index b3c1051..064726c 100644 --- a/ct_cm4/README.md +++ b/ct_cm4/README.md @@ -1,37 +1,10 @@ -An example for the STM Discovery Board -- https://docs.rust-embedded.org/discovery/f3discovery/index.html +An example for the Microbit v2 Board -- -One-off setup: +This example demonstrates the full loop of keygen, encaps then decaps functionality. +Cycle counts are measured, displayed, and operation confirmed to be constant-time. +See the link above for tooling setup. -~~~ -rustup target add thumbv7em-none-eabihf -rustup component add llvm-tools-preview -~~~ - -You will need to be running with two windows in parallel. - -1. In the first window: - - ~~~ - $ cd ct_cm4 # - $ cargo build --target thumbv7em-none-eabihf - $ cargo readobj --target thumbv7em-none-eabihf --bin ct_cm4-fips203 -- --file-header # double-checks built object - $ cargo size --bin ct_cm4-fips203 --release -- -A - ~~~ - -2. In the second window: - - ~~~ - $ cd /tmp && openocd -f interface/stlink-v2-1.cfg -f target/stm32f3x.cfg - ~~~ - -3. Back to the first window: - - ~~~ - $ cargo run - - then: - layout src - break k_pke.rs:29 - continue - s - ~~~ + ~~~ + $ cd ct_cm4 # + $ cargo embed + ~~~ diff --git a/ct_cm4/openocd.gdb b/ct_cm4/openocd.gdb deleted file mode 100644 index 4c9ff3f..0000000 --- a/ct_cm4/openocd.gdb +++ /dev/null @@ -1,37 +0,0 @@ -# Connect to gdb remote server -target remote :3333 - -# Load will flash the code -load - -# Enable demangling asm names on disassembly -set print asm-demangle on - -# Enable pretty printing -set print pretty on - -# Disable style sources as the default colors can be hard to read -set style sources off - -# Initialize monitoring so iprintln! macro output -# is sent from the itm port to itm.txt -monitor tpiu config internal itm.txt uart off 8000000 - -# Turn on the itm port and semihosting -monitor itm port 0 on -monitor arm semihosting enable - -# Set a breakpoint at main, aka entry -break main - -# Set a breakpoint at DefaultHandler -break DefaultHandler - -# Set a breakpoint at HardFault -break HardFault - -# Continue running until we hit the main breakpoint -continue - -# Step from the trampoline code in entry into main -step diff --git a/ct_cm4/src/main.rs b/ct_cm4/src/main.rs index 6658494..ec5fbaa 100644 --- a/ct_cm4/src/main.rs +++ b/ct_cm4/src/main.rs @@ -1,67 +1,53 @@ #![no_std] #![no_main] -use cortex_m::peripheral::DWT; +use cortex_m::asm; use cortex_m_rt::entry; use fips203::ml_kem_512; -use fips203::traits::KeyGen; -use rand_core::{CryptoRng, RngCore}; -use stm32f3_discovery::leds::Leds; -use stm32f3_discovery::stm32f3xx_hal::{pac, prelude::*}; -use stm32f3_discovery::switch_hal::ToggleableOutputSwitch; +use fips203::traits::{Decaps, Encaps, KeyGen, SerDes}; +use microbit::{board::Board, hal::{pac::DWT, prelude::OutputPin}}; +use rand_chacha::rand_core::SeedableRng; +use rtt_target::{rprintln, rtt_init_print}; - -// Dummy RNG that regurgitates zeros when 'asked' -struct MyRng(); -impl RngCore for MyRng { - fn next_u32(&mut self) -> u32 { unimplemented!() } - fn next_u64(&mut self) -> u64 { unimplemented!() } - fn fill_bytes(&mut self, out: &mut [u8]) { out.iter_mut().for_each(|b| *b = 0); } - fn try_fill_bytes(&mut self, out: &mut [u8]) -> Result<(), rand_core::Error> { - self.fill_bytes(out); - Ok(()) - } -} -impl CryptoRng for MyRng {} - - -#[panic_handler] -fn panic(_info: &core::panic::PanicInfo) -> ! { loop {} } +use panic_rtt_target as _; #[entry] fn main() -> ! { - - // Configure MCU - let device_peripherals = pac::Peripherals::take().unwrap(); - let mut reset_and_clock_control = device_peripherals.RCC.constrain(); - - // Initialize LEDs - let mut gpioe = device_peripherals.GPIOE.split(&mut reset_and_clock_control.ahb); - #[rustfmt::skip] - let mut leds = Leds::new(gpioe.pe8, gpioe.pe9, gpioe.pe10, gpioe.pe11, gpioe.pe12, - gpioe.pe13, gpioe.pe14, gpioe.pe15, &mut gpioe.moder, &mut gpioe.otyper).into_array(); - - let mut my_rng = MyRng {}; + let mut board = Board::take().unwrap(); + board.DCB.enable_trace(); + board.DWT.enable_cycle_counter(); + board.display_pins.col1.set_low().unwrap(); + rtt_init_print!(); + + let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(123); + let mut expected_cycles = 0; let mut i = 0u32; loop { - if (i % 10) == 0 { leds[0].toggle().ok(); }; + if (i % 100) == 0 { board.display_pins.row1.set_high().unwrap(); }; + if (i % 100) == 50 { board.display_pins.row1.set_low().unwrap(); }; i += 1; - cortex_m::asm::isb(); + rng.set_word_pos(1024 * i as u128); // Removes odd variability in drawing rng data + + asm::isb(); let start = DWT::cycle_count(); - cortex_m::asm::isb(); + asm::isb(); - let _res1 = ml_kem_512::KG::try_keygen_with_rng_vt(&mut my_rng); + let (ek, dk) = ml_kem_512::KG::try_keygen_with_rng_vt(&mut rng).unwrap(); + let (ssk1, ct) = ek.try_encaps_with_rng_vt(&mut rng).unwrap(); + let ssk2 = dk.try_decaps_vt(&ct).unwrap(); - cortex_m::asm::isb(); + asm::isb(); let finish = DWT::cycle_count(); - cortex_m::asm::isb(); + asm::isb(); + + assert_eq!(ssk1.into_bytes(), ssk2.into_bytes()); - // Code will 'soon' present the cycle counts via semi-hosting, - // and will also include encaps/decaps cycle - let _count = finish - start; - // print_semi("Top", _count); + let count = finish - start; + if (i == 5) & (expected_cycles == 0) { expected_cycles = count }; + if (i > 5) & (count != expected_cycles) { panic!("Non constant-time operation!!") }; + if i % 10 == 0 { rprintln!("Iteration {} cycle count: {}", i, count); } } } diff --git a/src/helpers.rs b/src/helpers.rs index 38bf2ec..56132ea 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -1,6 +1,5 @@ use sha3::{Digest, Sha3_256, Sha3_512, Shake128, Shake256}; -use sha3::digest::{ExtendableOutput, XofReader}; -use sha3::digest::Update; +use sha3::digest::{ExtendableOutput, Update, XofReader}; use crate::ntt::multiply_ntts; use crate::Q; diff --git a/src/sampling.rs b/src/sampling.rs index 0cd474e..0ce3699 100644 --- a/src/sampling.rs +++ b/src/sampling.rs @@ -19,8 +19,12 @@ pub(crate) fn sample_ntt(mut byte_stream_b: impl XofReader) -> [Z; 256] { // 2: j ← 0 let mut j = 0; - // 3: while j < 256 do - while j < 256 { + // The original sampling loop has inherent timing variability based on the need to reject + // `d1` > `Q` per step 6+ along with `d2` > `Q` per step 10+. The adapted loop below does + // "too much, but a constant amount of work" with the 384-256 margin impacting performance. + // 3: while j < 256 do --> this is adapted for constant-time operation TODO: loop # vs fail odds + #[allow(clippy::cast_possible_truncation)] // mask as u16 + for _k in 0..384 { // byte_stream_b.read(&mut bbb); // Draw 3 bytes @@ -31,34 +35,33 @@ pub(crate) fn sample_ntt(mut byte_stream_b: impl XofReader) -> [Z; 256] { let d2 = (u32::from(bbb[1]) >> 4) + 16 * u32::from(bbb[2]); // 6: if d1 < q then - if d1 < Q { - // - // 7: a_hat[j] ← d1 ▷ a_hat ∈ Z256 - let mut ah = Z::default(); - ah.set_u16(u16::try_from(d1).unwrap()); - array_a_hat[j] = ah; + let mask = usize::from((d1 < Q) & (j < 256)).wrapping_neg(); + // + // 7: a_hat[j] ← d1 ▷ a_hat ∈ Z256 + let mut ah = Z::default(); + ah.set_u16(u16::try_from(d1).unwrap() & (mask as u16)); + array_a_hat[j & 0xff] = array_a_hat[j & 0xFF].add(ah); - // 8: j ← j+1 - j += 1; + // 8: j ← j+1 + j += 1 & mask; - // 9: end if - } + // 9: end if // 10: if d2 < q and j < 256 then - if (d2 < Q) & (j < 256) { - // - // 11: a_hat[j] ← d2 - let mut ah = Z::default(); - ah.set_u16(u16::try_from(d2).unwrap()); - array_a_hat[j] = ah; + let mask2 = usize::from((d2 < Q) & (j < 256)).wrapping_neg(); - // 12: j ← j+1 - j += 1; + // 11: a_hat[j] ← d2 + let mut ah = Z::default(); + ah.set_u16(u16::try_from(d2).unwrap() & (mask2 as u16)); + array_a_hat[j & 0xFF] = array_a_hat[j & 0xFF].add(ah); - // 13: end if - } + // 12: j ← j+1 + j += 1 & mask2; + + // 13: end if // 14: i ← i+3 (not needed as we draw 3 more bytes next time + // 15: end while } @@ -81,17 +84,16 @@ pub(crate) fn sample_poly_cbd(eta: u32, byte_array_b: &[u8]) -> [Z; 256] { let mut int_index = 0; let mut bit_index = 0; for byte in byte_array_b { - temp |= u64::from(*byte) << bit_index; + temp |= u32::from(*byte) << bit_index; bit_index += 8; - #[allow(clippy::cast_possible_truncation)] while bit_index >= 2 * (eta as usize) { - let tmask_x = temp & (2u64.pow(eta) - 1); - let x = (tmask_x as u8).count_ones(); - let tmask_y = (temp >> eta) & (2u64.pow(eta) - 1); - let y = (tmask_y as u8).count_ones(); + let tmask_x = temp & ((1 << eta) - 1); + let x = count_ones(tmask_x); + let tmask_y = (temp >> eta) & ((1 << eta) - 1); + let y = count_ones(tmask_y); let (mut xx, mut yy) = (Z::default(), Z::default()); - xx.set_u16(x as u16); - yy.set_u16(y as u16); + xx.set_u16(x); + yy.set_u16(y); array_f[int_index] = xx.sub(yy); bit_index -= 2 * (eta as usize); temp >>= 2 * (eta as usize); @@ -102,6 +104,18 @@ pub(crate) fn sample_poly_cbd(eta: u32, byte_array_b: &[u8]) -> [Z; 256] { } +// Count u8 ones in constant time (u32 helps perf) +#[allow(clippy::cast_possible_truncation)] // return res as u16 +fn count_ones(x: u32) -> u16 { + let (mut res, mut x) = (x & 0xFF, x & 0xFF); + for _i in 1..8 { + x >>= 1; + res -= x; + } + res as u16 +} + + // The original pseudocode for Algorithm 7 follows... // Algorithm 7 `SamplePolyCBDη(B)` on page 20. // If the input is a stream of uniformly random bytes, outputs a sample from the distribution `D_η(R_q)`. diff --git a/src/types.rs b/src/types.rs index ea2a0eb..dd03c3e 100644 --- a/src/types.rs +++ b/src/types.rs @@ -28,6 +28,7 @@ pub struct CipherText(pub(crate) [u8; CT_LEN]); #[derive(Clone, Copy, Default)] pub(crate) struct Z(u16); + #[allow(clippy::inline_always)] impl Z { const M: u64 = 2u64.pow(32) / (Self::Q64); @@ -45,8 +46,7 @@ impl Z { pub(crate) fn add(self, other: Self) -> Self { let sum = self.0.wrapping_add(other.0); let (trial, borrow) = sum.overflowing_sub(Self::Q16); - let select_sum = u16::from(borrow).wrapping_neg(); - let result = (!select_sum & trial) | (select_sum & sum); + let result = trial.wrapping_add(u16::from(borrow).wrapping_neg() & Self::Q16); Self(result) }