From 21b30c4511f5feca5c020b83219dafba1b0ac3a6 Mon Sep 17 00:00:00 2001 From: zac-williamson Date: Thu, 14 Mar 2024 11:53:01 +0000 Subject: [PATCH 1/4] multithreaded witness generation and removed redundant field inversions --- .../eccvm/eccvm_composer.test.cpp | 1 + .../eccvm/eccvm_circuit_builder.hpp | 278 +++++------ .../circuit_builder/eccvm/msm_builder.hpp | 463 +++++++++++++----- .../proof_system/op_queue/ecc_op_queue.hpp | 126 +++++ 4 files changed, 589 insertions(+), 279 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_composer.test.cpp b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_composer.test.cpp index f9e2b72f39b..5b7d207daf7 100644 --- a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_composer.test.cpp +++ b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_composer.test.cpp @@ -96,6 +96,7 @@ TYPED_TEST(ECCVMComposerTests, EqFails) .z1 = 0, .z2 = 0, .mul_scalar_full = 0 }); + builder.op_queue->num_transcript_rows++; auto composer = ECCVMComposer_(); auto prover = composer.create_prover(builder); diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp index 88ad2e42908..bb78f8a413d 100644 --- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp @@ -46,18 +46,7 @@ template class ECCVMCircuitBuilder { [[nodiscard]] uint32_t get_number_of_muls() const { - uint32_t num_muls = 0; - for (auto& op : op_queue->raw_ops) { - if (op.mul) { - if (op.z1 != 0) { - num_muls++; - } - if (op.z2 != 0) { - num_muls++; - } - } - } - return num_muls; + return op_queue->cached_num_muls + op_queue->cached_active_msm_count; } std::vector get_msms() const @@ -68,7 +57,7 @@ template class ECCVMCircuitBuilder { */ const auto compute_precomputed_table = [](const 
AffineElement& base_point) { const auto d2 = Element(base_point).dbl(); - std::array table; + std::array table; table[POINT_TABLE_SIZE / 2] = base_point; for (size_t i = 1; i < POINT_TABLE_SIZE / 2; ++i) { table[i + POINT_TABLE_SIZE / 2] = Element(table[i + POINT_TABLE_SIZE / 2 - 1]) + d2; @@ -76,7 +65,13 @@ template class ECCVMCircuitBuilder { for (size_t i = 0; i < POINT_TABLE_SIZE / 2; ++i) { table[i] = -table[POINT_TABLE_SIZE - 1 - i]; } - return table; + + Element::batch_normalize(&table[0], POINT_TABLE_SIZE); + std::array result; + for (size_t i = 0; i < POINT_TABLE_SIZE; ++i) { + result[i] = AffineElement{ .x = table[i].x, .y = table[i].y }; + } + return result; }; const auto compute_wnaf_slices = [](uint256_t scalar) { std::array output; @@ -262,8 +257,8 @@ template class ECCVMCircuitBuilder { ECCVMTranscriptBuilder::compute_transcript_state(op_queue->raw_ops, get_number_of_muls()); const auto precompute_table_state = ECCVMPrecomputedTablesBuilder::compute_precompute_state(flattened_muls); - const auto msm_state = - ECCVMMSMMBuilder::compute_msm_state(msms, point_table_read_counts, get_number_of_muls()); + const auto msm_state = ECCVMMSMMBuilder::compute_msm_state( + msms, point_table_read_counts, get_number_of_muls(), op_queue->get_num_msm_rows()); const size_t msm_size = msm_state.size(); const size_t transcript_size = transcript_state.size(); @@ -293,28 +288,30 @@ template class ECCVMCircuitBuilder { polys.lookup_read_counts_0[i + 1] = point_table_read_counts[0][i]; polys.lookup_read_counts_1[i + 1] = point_table_read_counts[1][i]; } - for (size_t i = 0; i < transcript_state.size(); ++i) { - polys.transcript_accumulator_empty[i] = transcript_state[i].accumulator_empty; - polys.transcript_add[i] = transcript_state[i].q_add; - polys.transcript_mul[i] = transcript_state[i].q_mul; - polys.transcript_eq[i] = transcript_state[i].q_eq; - polys.transcript_reset_accumulator[i] = transcript_state[i].q_reset_accumulator; - polys.transcript_msm_transition[i] = 
transcript_state[i].msm_transition; - polys.transcript_pc[i] = transcript_state[i].pc; - polys.transcript_msm_count[i] = transcript_state[i].msm_count; - polys.transcript_Px[i] = transcript_state[i].base_x; - polys.transcript_Py[i] = transcript_state[i].base_y; - polys.transcript_z1[i] = transcript_state[i].z1; - polys.transcript_z2[i] = transcript_state[i].z2; - polys.transcript_z1zero[i] = transcript_state[i].z1_zero; - polys.transcript_z2zero[i] = transcript_state[i].z2_zero; - polys.transcript_op[i] = transcript_state[i].opcode; - polys.transcript_accumulator_x[i] = transcript_state[i].accumulator_x; - polys.transcript_accumulator_y[i] = transcript_state[i].accumulator_y; - polys.transcript_msm_x[i] = transcript_state[i].msm_output_x; - polys.transcript_msm_y[i] = transcript_state[i].msm_output_y; - polys.transcript_collision_check[i] = transcript_state[i].collision_check; - } + run_loop_in_parallel(transcript_state.size(), [&](size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + polys.transcript_accumulator_empty[i] = transcript_state[i].accumulator_empty; + polys.transcript_add[i] = transcript_state[i].q_add; + polys.transcript_mul[i] = transcript_state[i].q_mul; + polys.transcript_eq[i] = transcript_state[i].q_eq; + polys.transcript_reset_accumulator[i] = transcript_state[i].q_reset_accumulator; + polys.transcript_msm_transition[i] = transcript_state[i].msm_transition; + polys.transcript_pc[i] = transcript_state[i].pc; + polys.transcript_msm_count[i] = transcript_state[i].msm_count; + polys.transcript_Px[i] = transcript_state[i].base_x; + polys.transcript_Py[i] = transcript_state[i].base_y; + polys.transcript_z1[i] = transcript_state[i].z1; + polys.transcript_z2[i] = transcript_state[i].z2; + polys.transcript_z1zero[i] = transcript_state[i].z1_zero; + polys.transcript_z2zero[i] = transcript_state[i].z2_zero; + polys.transcript_op[i] = transcript_state[i].opcode; + polys.transcript_accumulator_x[i] = transcript_state[i].accumulator_x; + 
polys.transcript_accumulator_y[i] = transcript_state[i].accumulator_y; + polys.transcript_msm_x[i] = transcript_state[i].msm_output_x; + polys.transcript_msm_y[i] = transcript_state[i].msm_output_y; + polys.transcript_collision_check[i] = transcript_state[i].collision_check; + } + }); // TODO(@zac-williamson) if final opcode resets accumulator, all subsequent "is_accumulator_empty" row values // must be 1. Ideally we find a way to tweak this so that empty rows that do nothing have column values that are @@ -324,97 +321,101 @@ template class ECCVMCircuitBuilder { polys.transcript_accumulator_empty[i] = 1; } } - for (size_t i = 0; i < precompute_table_state.size(); ++i) { - // first row is always an empty row (to accommodate shifted polynomials which must have 0 as 1st - // coefficient). All other rows in the precompute_table_state represent active wnaf gates (i.e. - // precompute_select = 1) - polys.precompute_select[i] = (i != 0) ? 1 : 0; - polys.precompute_pc[i] = precompute_table_state[i].pc; - polys.precompute_point_transition[i] = static_cast(precompute_table_state[i].point_transition); - polys.precompute_round[i] = precompute_table_state[i].round; - polys.precompute_scalar_sum[i] = precompute_table_state[i].scalar_sum; - - polys.precompute_s1hi[i] = precompute_table_state[i].s1; - polys.precompute_s1lo[i] = precompute_table_state[i].s2; - polys.precompute_s2hi[i] = precompute_table_state[i].s3; - polys.precompute_s2lo[i] = precompute_table_state[i].s4; - polys.precompute_s3hi[i] = precompute_table_state[i].s5; - polys.precompute_s3lo[i] = precompute_table_state[i].s6; - polys.precompute_s4hi[i] = precompute_table_state[i].s7; - polys.precompute_s4lo[i] = precompute_table_state[i].s8; - // If skew is active (i.e. we need to subtract a base point from the msm result), - // write `7` into rows.precompute_skew. `7`, in binary representation, equals `-1` when converted into WNAF - // form - polys.precompute_skew[i] = precompute_table_state[i].skew ? 
7 : 0; - - polys.precompute_dx[i] = precompute_table_state[i].precompute_double.x; - polys.precompute_dy[i] = precompute_table_state[i].precompute_double.y; - polys.precompute_tx[i] = precompute_table_state[i].precompute_accumulator.x; - polys.precompute_ty[i] = precompute_table_state[i].precompute_accumulator.y; - } - - for (size_t i = 0; i < msm_state.size(); ++i) { - polys.msm_transition[i] = static_cast(msm_state[i].msm_transition); - polys.msm_add[i] = static_cast(msm_state[i].q_add); - polys.msm_double[i] = static_cast(msm_state[i].q_double); - polys.msm_skew[i] = static_cast(msm_state[i].q_skew); - polys.msm_accumulator_x[i] = msm_state[i].accumulator_x; - polys.msm_accumulator_y[i] = msm_state[i].accumulator_y; - polys.msm_pc[i] = msm_state[i].pc; - polys.msm_size_of_msm[i] = msm_state[i].msm_size; - polys.msm_count[i] = msm_state[i].msm_count; - polys.msm_round[i] = msm_state[i].msm_round; - polys.msm_add1[i] = static_cast(msm_state[i].add_state[0].add); - polys.msm_add2[i] = static_cast(msm_state[i].add_state[1].add); - polys.msm_add3[i] = static_cast(msm_state[i].add_state[2].add); - polys.msm_add4[i] = static_cast(msm_state[i].add_state[3].add); - polys.msm_x1[i] = msm_state[i].add_state[0].point.x; - polys.msm_y1[i] = msm_state[i].add_state[0].point.y; - polys.msm_x2[i] = msm_state[i].add_state[1].point.x; - polys.msm_y2[i] = msm_state[i].add_state[1].point.y; - polys.msm_x3[i] = msm_state[i].add_state[2].point.x; - polys.msm_y3[i] = msm_state[i].add_state[2].point.y; - polys.msm_x4[i] = msm_state[i].add_state[3].point.x; - polys.msm_y4[i] = msm_state[i].add_state[3].point.y; - polys.msm_collision_x1[i] = msm_state[i].add_state[0].collision_inverse; - polys.msm_collision_x2[i] = msm_state[i].add_state[1].collision_inverse; - polys.msm_collision_x3[i] = msm_state[i].add_state[2].collision_inverse; - polys.msm_collision_x4[i] = msm_state[i].add_state[3].collision_inverse; - polys.msm_lambda1[i] = msm_state[i].add_state[0].lambda; - polys.msm_lambda2[i] = 
msm_state[i].add_state[1].lambda; - polys.msm_lambda3[i] = msm_state[i].add_state[2].lambda; - polys.msm_lambda4[i] = msm_state[i].add_state[3].lambda; - polys.msm_slice1[i] = msm_state[i].add_state[0].slice; - polys.msm_slice2[i] = msm_state[i].add_state[1].slice; - polys.msm_slice3[i] = msm_state[i].add_state[2].slice; - polys.msm_slice4[i] = msm_state[i].add_state[3].slice; - } - - polys.transcript_mul_shift = Polynomial(polys.transcript_mul.shifted()); - polys.transcript_msm_count_shift = Polynomial(polys.transcript_msm_count.shifted()); - polys.transcript_accumulator_x_shift = Polynomial(polys.transcript_accumulator_x.shifted()); - polys.transcript_accumulator_y_shift = Polynomial(polys.transcript_accumulator_y.shifted()); - polys.precompute_scalar_sum_shift = Polynomial(polys.precompute_scalar_sum.shifted()); - polys.precompute_s1hi_shift = Polynomial(polys.precompute_s1hi.shifted()); - polys.precompute_dx_shift = Polynomial(polys.precompute_dx.shifted()); - polys.precompute_dy_shift = Polynomial(polys.precompute_dy.shifted()); - polys.precompute_tx_shift = Polynomial(polys.precompute_tx.shifted()); - polys.precompute_ty_shift = Polynomial(polys.precompute_ty.shifted()); - polys.msm_transition_shift = Polynomial(polys.msm_transition.shifted()); - polys.msm_add_shift = Polynomial(polys.msm_add.shifted()); - polys.msm_double_shift = Polynomial(polys.msm_double.shifted()); - polys.msm_skew_shift = Polynomial(polys.msm_skew.shifted()); - polys.msm_accumulator_x_shift = Polynomial(polys.msm_accumulator_x.shifted()); - polys.msm_accumulator_y_shift = Polynomial(polys.msm_accumulator_y.shifted()); - polys.msm_count_shift = Polynomial(polys.msm_count.shifted()); - polys.msm_round_shift = Polynomial(polys.msm_round.shifted()); - polys.msm_add1_shift = Polynomial(polys.msm_add1.shifted()); - polys.msm_pc_shift = Polynomial(polys.msm_pc.shifted()); - polys.precompute_pc_shift = Polynomial(polys.precompute_pc.shifted()); - polys.transcript_pc_shift = 
Polynomial(polys.transcript_pc.shifted()); - polys.precompute_round_shift = Polynomial(polys.precompute_round.shifted()); - polys.transcript_accumulator_empty_shift = Polynomial(polys.transcript_accumulator_empty.shifted()); - polys.precompute_select_shift = Polynomial(polys.precompute_select.shifted()); + run_loop_in_parallel(precompute_table_state.size(), [&](size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + // first row is always an empty row (to accommodate shifted polynomials which must have 0 as 1st + // coefficient). All other rows in the precompute_table_state represent active wnaf gates (i.e. + // precompute_select = 1) + polys.precompute_select[i] = (i != 0) ? 1 : 0; + polys.precompute_pc[i] = precompute_table_state[i].pc; + polys.precompute_point_transition[i] = + static_cast(precompute_table_state[i].point_transition); + polys.precompute_round[i] = precompute_table_state[i].round; + polys.precompute_scalar_sum[i] = precompute_table_state[i].scalar_sum; + + polys.precompute_s1hi[i] = precompute_table_state[i].s1; + polys.precompute_s1lo[i] = precompute_table_state[i].s2; + polys.precompute_s2hi[i] = precompute_table_state[i].s3; + polys.precompute_s2lo[i] = precompute_table_state[i].s4; + polys.precompute_s3hi[i] = precompute_table_state[i].s5; + polys.precompute_s3lo[i] = precompute_table_state[i].s6; + polys.precompute_s4hi[i] = precompute_table_state[i].s7; + polys.precompute_s4lo[i] = precompute_table_state[i].s8; + // If skew is active (i.e. we need to subtract a base point from the msm result), + // write `7` into rows.precompute_skew. `7`, in binary representation, equals `-1` when converted into + // WNAF form + polys.precompute_skew[i] = precompute_table_state[i].skew ? 
7 : 0; + + polys.precompute_dx[i] = precompute_table_state[i].precompute_double.x; + polys.precompute_dy[i] = precompute_table_state[i].precompute_double.y; + polys.precompute_tx[i] = precompute_table_state[i].precompute_accumulator.x; + polys.precompute_ty[i] = precompute_table_state[i].precompute_accumulator.y; + } + }); + + run_loop_in_parallel(msm_state.size(), [&](size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + polys.msm_transition[i] = static_cast(msm_state[i].msm_transition); + polys.msm_add[i] = static_cast(msm_state[i].q_add); + polys.msm_double[i] = static_cast(msm_state[i].q_double); + polys.msm_skew[i] = static_cast(msm_state[i].q_skew); + polys.msm_accumulator_x[i] = msm_state[i].accumulator_x; + polys.msm_accumulator_y[i] = msm_state[i].accumulator_y; + polys.msm_pc[i] = msm_state[i].pc; + polys.msm_size_of_msm[i] = msm_state[i].msm_size; + polys.msm_count[i] = msm_state[i].msm_count; + polys.msm_round[i] = msm_state[i].msm_round; + polys.msm_add1[i] = static_cast(msm_state[i].add_state[0].add); + polys.msm_add2[i] = static_cast(msm_state[i].add_state[1].add); + polys.msm_add3[i] = static_cast(msm_state[i].add_state[2].add); + polys.msm_add4[i] = static_cast(msm_state[i].add_state[3].add); + polys.msm_x1[i] = msm_state[i].add_state[0].point.x; + polys.msm_y1[i] = msm_state[i].add_state[0].point.y; + polys.msm_x2[i] = msm_state[i].add_state[1].point.x; + polys.msm_y2[i] = msm_state[i].add_state[1].point.y; + polys.msm_x3[i] = msm_state[i].add_state[2].point.x; + polys.msm_y3[i] = msm_state[i].add_state[2].point.y; + polys.msm_x4[i] = msm_state[i].add_state[3].point.x; + polys.msm_y4[i] = msm_state[i].add_state[3].point.y; + polys.msm_collision_x1[i] = msm_state[i].add_state[0].collision_inverse; + polys.msm_collision_x2[i] = msm_state[i].add_state[1].collision_inverse; + polys.msm_collision_x3[i] = msm_state[i].add_state[2].collision_inverse; + polys.msm_collision_x4[i] = msm_state[i].add_state[3].collision_inverse; + 
polys.msm_lambda1[i] = msm_state[i].add_state[0].lambda; + polys.msm_lambda2[i] = msm_state[i].add_state[1].lambda; + polys.msm_lambda3[i] = msm_state[i].add_state[2].lambda; + polys.msm_lambda4[i] = msm_state[i].add_state[3].lambda; + polys.msm_slice1[i] = msm_state[i].add_state[0].slice; + polys.msm_slice2[i] = msm_state[i].add_state[1].slice; + polys.msm_slice3[i] = msm_state[i].add_state[2].slice; + polys.msm_slice4[i] = msm_state[i].add_state[3].slice; + } + }); + polys.transcript_mul_shift = polys.transcript_mul.shifted(); + polys.transcript_msm_count_shift = polys.transcript_msm_count.shifted(); + polys.transcript_accumulator_x_shift = polys.transcript_accumulator_x.shifted(); + polys.transcript_accumulator_y_shift = polys.transcript_accumulator_y.shifted(); + polys.precompute_scalar_sum_shift = polys.precompute_scalar_sum.shifted(); + polys.precompute_s1hi_shift = polys.precompute_s1hi.shifted(); + polys.precompute_dx_shift = polys.precompute_dx.shifted(); + polys.precompute_dy_shift = polys.precompute_dy.shifted(); + polys.precompute_tx_shift = polys.precompute_tx.shifted(); + polys.precompute_ty_shift = polys.precompute_ty.shifted(); + polys.msm_transition_shift = polys.msm_transition.shifted(); + polys.msm_add_shift = polys.msm_add.shifted(); + polys.msm_double_shift = polys.msm_double.shifted(); + polys.msm_skew_shift = polys.msm_skew.shifted(); + polys.msm_accumulator_x_shift = polys.msm_accumulator_x.shifted(); + polys.msm_accumulator_y_shift = polys.msm_accumulator_y.shifted(); + polys.msm_count_shift = polys.msm_count.shifted(); + polys.msm_round_shift = polys.msm_round.shifted(); + polys.msm_add1_shift = polys.msm_add1.shifted(); + polys.msm_pc_shift = polys.msm_pc.shifted(); + polys.precompute_pc_shift = polys.precompute_pc.shifted(); + polys.transcript_pc_shift = polys.transcript_pc.shifted(); + polys.precompute_round_shift = polys.precompute_round.shifted(); + polys.transcript_accumulator_empty_shift = 
polys.transcript_accumulator_empty.shifted(); + polys.precompute_select_shift = polys.precompute_select.shifted(); return polys; } @@ -497,25 +498,8 @@ template class ECCVMCircuitBuilder { [[nodiscard]] size_t get_num_gates() const { - // TODO(@zac-williamson) once we have a stable base to work off of, optimize this method! // (issue #2218) - const auto msms = get_msms(); - const auto flattened_muls = get_flattened_scalar_muls(msms); - - std::array, 2> point_table_read_counts; - const auto transcript_state = - ECCVMTranscriptBuilder::compute_transcript_state(op_queue->raw_ops, get_number_of_muls()); - const auto precompute_table_state = - ECCVMPrecomputedTablesBuilder::compute_precompute_state(flattened_muls); - const auto msm_state = - ECCVMMSMMBuilder::compute_msm_state(msms, point_table_read_counts, get_number_of_muls()); - - const size_t msm_size = msm_state.size(); - const size_t transcript_size = transcript_state.size(); - const size_t precompute_table_size = precompute_table_state.size(); - - const size_t num_rows = std::max(precompute_table_size, std::max(msm_size, transcript_size)); - return num_rows; + return op_queue->get_num_rows(); } [[nodiscard]] size_t get_circuit_subgroup_size(const size_t num_rows) const diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/msm_builder.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/msm_builder.hpp index 6f0c45e3744..5630ca357e0 100644 --- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/msm_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/msm_builder.hpp @@ -3,6 +3,7 @@ #include #include "./eccvm_builder_types.hpp" +#include "barretenberg/proof_system/op_queue/ecc_op_queue.hpp" namespace bb { @@ -17,7 +18,7 @@ template class ECCVMMSMMBuilder { static constexpr size_t NUM_SCALAR_BITS = bb::eccvm::NUM_SCALAR_BITS; static constexpr size_t WNAF_SLICE_BITS = bb::eccvm::WNAF_SLICE_BITS; - struct MSMState { 
+ struct alignas(64) MSMState { uint32_t pc = 0; uint32_t msm_size = 0; uint32_t msm_count = 0; @@ -42,6 +43,21 @@ template class ECCVMMSMMBuilder { FF accumulator_y = 0; }; + struct alignas(64) MSMRowTranscript { + std::array lambda_numerator; + std::array lambda_denominator; + Element accumulator_in; + Element accumulator_out; + }; + + struct alignas(64) AdditionTrace { + Element p1; + Element p2; + Element p3; + bool predicate; + bool is_double; + }; + /** * @brief Computes the row values for the Straus MSM columns of the ECCVM. * @@ -55,7 +71,8 @@ template class ECCVMMSMMBuilder { */ static std::vector compute_msm_state(const std::vector>& msms, std::array, 2>& point_table_read_counts, - const uint32_t total_number_of_muls) + const uint32_t total_number_of_muls, + const size_t num_msm_rows) { // N.B. the following comments refer to a "point lookup table" frequently. // To perform a scalar multiplicaiton of a point [P] by a scalar x, we compute multiples of [P] and store in a @@ -100,171 +117,354 @@ template class ECCVMMSMMBuilder { point_table_read_counts[column_index][pc_offset + 15 - static_cast(slice_row)]++; } }; - std::vector msm_state; - // start with empty row (shiftable polynomials must have 0 as first coefficient) - msm_state.emplace_back(MSMState{}); - uint32_t pc = total_number_of_muls; - AffineElement accumulator = CycleGroup::affine_point_at_infinity; + // compute which row index each multiscalar multiplication will start at. + // also compute the program counter index that each multiscalar multiplication will start at. 
+ // we use this information to populate the MSM row data across multiple threads + std::vector msm_row_indices; + std::vector pc_indices; + msm_row_indices.reserve(msms.size() + 1); + pc_indices.reserve(msms.size() + 1); + + msm_row_indices.push_back(1); + pc_indices.push_back(total_number_of_muls); for (const auto& msm : msms) { - const size_t msm_size = msm.size(); - - const size_t rows_per_round = (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0); - static constexpr size_t num_rounds = NUM_SCALAR_BITS / WNAF_SLICE_BITS; - - const auto add_points = [](auto& P1, auto& P2, auto& lambda, auto& collision_inverse, bool predicate) { - lambda = predicate ? (P2.y - P1.y) / (P2.x - P1.x) : 0; - collision_inverse = predicate ? (P2.x - P1.x).invert() : 0; - auto x3 = predicate ? lambda * lambda - (P2.x + P1.x) : P1.x; - auto y3 = predicate ? lambda * (P1.x - x3) - P1.y : P1.y; - return AffineElement(x3, y3); - }; + const size_t rows = ECCOpQueue::get_msm_row_count_for_single_msm(msm.size()); + msm_row_indices.push_back(msm_row_indices.back() + rows); + pc_indices.push_back(pc_indices.back() - msm.size()); + } + + static constexpr size_t num_rounds = NUM_SCALAR_BITS / WNAF_SLICE_BITS; + std::vector msm_state(num_msm_rows); + // start with empty row (shiftable polynomials must have 0 as first coefficient) + msm_state[0] = (MSMState{}); + + // compute "read counts" so that we can determine the number of times entries in our log-derivative lookup + // tables are called. + // Note: this part is single-threaded. The amount of compute is low, however, so this is likely not a big + // concern. + for (size_t i = 0; i < msms.size(); ++i) { + for (size_t j = 0; j < num_rounds; ++j) { + uint32_t pc = static_cast(pc_indices[i]); + const auto& msm = msms[i]; + const size_t msm_size = msm.size(); + const size_t rows_per_round = + (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ?
1 : 0); + for (size_t k = 0; k < rows_per_round; ++k) { - MSMState row; const size_t points_per_row = (k + 1) * ADDITIONS_PER_ROW > msm_size ? msm_size % ADDITIONS_PER_ROW : ADDITIONS_PER_ROW; const size_t idx = k * ADDITIONS_PER_ROW; - row.msm_transition = (j == 0) && (k == 0); - - AffineElement acc(accumulator); - Element acc_expected = accumulator; for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) { - auto& add_state = row.add_state[m]; - add_state.add = points_per_row > m; - int slice = add_state.add ? msm[idx + m].wnaf_slices[j] : 0; - // In the MSM columns in the ECCVM circuit, we can add up to 4 points per row. - // if `row.add_state[m].add = 1`, this indicates that we want to add the `m`'th point in the MSM - // columns into the MSM accumulator - // `add_state.slice` = A 4-bit WNAF slice of the scalar multiplier associated with the point we - // are adding (the specific slice chosen depends on the value of msm_round) (WNAF = - // windowed-non-adjacent-form. Value range is `-15, -13, ..., 15`) If `add_state.add = 1`, we - // want `add_state.slice` to be the *compressed* form of the WNAF slice value. (compressed = no - // gaps in the value range. i.e. -15, -13, ..., 15 maps to 0, ... , 15) - add_state.slice = add_state.add ? (slice + 15) / 2 : 0; - add_state.point = add_state.add - ? msm[idx + m].precomputed_table[static_cast(add_state.slice)] - : AffineElement{ 0, 0 }; - // predicate logic: - // add_predicate should normally equal add_state.add - // However! if j == 0 AND k == 0 AND m == 0 this implies we are examing the 1st point addition - // of a new MSM In this case, we do NOT add the 1st point into the accumulator, instead we SET - // the accumulator to equal the 1st point. add_predicate is used to determine whether we add the - // output of a point addition into the accumulator, therefore if j == 0 AND k == 0 AND m == 0, - // add_predicate = 0 even if add_state.add = true - bool add_predicate = (m == 0 ? 
(j != 0 || k != 0) : add_state.add); - - auto& p1 = (m == 0) ? add_state.point : acc; - auto& p2 = (m == 0) ? acc : add_state.point; - - acc_expected = add_predicate ? (acc_expected + add_state.point) : Element(p1); - if (add_state.add) { + bool add = points_per_row > m; + if (add) { + int slice = add ? msm[idx + m].wnaf_slices[j] : 0; update_read_counts(pc - idx - m, slice); } - acc = add_points(p1, p2, add_state.lambda, add_state.collision_inverse, add_predicate); - ASSERT(acc == AffineElement(acc_expected)); } - row.q_add = true; - row.q_double = false; - row.q_skew = false; - row.msm_round = static_cast(j); - row.msm_size = static_cast(msm_size); - row.msm_count = static_cast(idx); - row.accumulator_x = accumulator.is_point_at_infinity() ? 0 : accumulator.x; - row.accumulator_y = accumulator.is_point_at_infinity() ? 0 : accumulator.y; - row.pc = pc; - accumulator = acc; - msm_state.push_back(row); } - if (j < num_rounds - 1) { - MSMState row; - row.msm_transition = false; - row.msm_round = static_cast(j + 1); - row.msm_size = static_cast(msm_size); - row.msm_count = static_cast(0); - row.q_add = false; - row.q_double = true; - row.q_skew = false; - - auto dx = accumulator.x; - auto dy = accumulator.y; - for (size_t m = 0; m < 4; ++m) { - auto& add_state = row.add_state[m]; - add_state.add = false; - add_state.slice = 0; - add_state.point = { 0, 0 }; - add_state.collision_inverse = 0; - add_state.lambda = ((dx + dx + dx) * dx) / (dy + dy); - auto x3 = add_state.lambda.sqr() - dx - dx; - dy = add_state.lambda * (dx - x3) - dy; - dx = x3; - } - row.accumulator_x = accumulator.is_point_at_infinity() ? 0 : accumulator.x; - row.accumulator_y = accumulator.is_point_at_infinity() ? 
0 : accumulator.y; - accumulator = Element(accumulator).dbl().dbl().dbl().dbl(); - row.pc = pc; - msm_state.push_back(row); - } else { + if (j == num_rounds - 1) { for (size_t k = 0; k < rows_per_round; ++k) { - MSMState row; - const size_t points_per_row = (k + 1) * ADDITIONS_PER_ROW > msm_size ? msm_size % ADDITIONS_PER_ROW : ADDITIONS_PER_ROW; const size_t idx = k * ADDITIONS_PER_ROW; - row.msm_transition = false; + for (size_t m = 0; m < 4; ++m) { + bool add = points_per_row > m; + + if (add) { + update_read_counts(pc - idx - m, msm[idx + m].wnaf_skew ? -1 : -15); + } + } + } + } + } + } - AffineElement acc(accumulator); - Element acc_expected = accumulator; + // The execution trace data for the MSM columns requires knowledge of intermediate values from *affine* point + // addition. The naive solution to compute this data requires 2 field inversions per in-circuit group addition + // evaluation. This is bad! To avoid this, we split the witness computation algorithm into 3 steps. Step 1: + // compute the execution trace group operations in *projective* coordinates. Step 2: use batch inversion trick to + // convert all point traces into affine coordinates. Step 3: populate the full execution trace, including the + // intermediate values from affine group operations. This section sets up the data structures we need to store + // all intermediate ECC operations in projective form. + const size_t num_point_adds_and_doubles = (num_msm_rows - 2) * 4; + const size_t num_accumulators = num_msm_rows - 1; + const size_t num_points_in_trace = (num_point_adds_and_doubles * 3) + num_accumulators; + // We create 1 vector to store the entire point trace. We split into multiple containers using std::span + // (we want 1 vector object to more efficiently batch normalize points) + std::vector point_trace(num_points_in_trace); + // the point traces record group operations.
Either p1 + p2 = p3, or p1.dbl() = p3 + std::span p1_trace(&point_trace[0], num_point_adds_and_doubles); + std::span p2_trace(&point_trace[num_point_adds_and_doubles], num_point_adds_and_doubles); + std::span p3_trace(&point_trace[num_point_adds_and_doubles * 2], num_point_adds_and_doubles); + // operation_trace records whether an entry in the p1/p2/p3 trace represents a point addition or doubling + std::vector operation_trace(num_point_adds_and_doubles); + // accumulator_trace tracks the value of the ECCVM accumulator for each row + std::span accumulator_trace(&point_trace[num_point_adds_and_doubles * 3], num_accumulators); + + // we start the accumulator at the point at infinity + accumulator_trace[0] = (CycleGroup::affine_point_at_infinity); + + // populate point trace data, and the components of the MSM execution trace that do not relate to affine point + // operations + run_loop_in_parallel(msms.size(), [&](size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + Element accumulator = CycleGroup::affine_point_at_infinity; + const auto& msm = msms[i]; + size_t msm_row_index = msm_row_indices[i]; + const size_t msm_size = msm.size(); + const size_t rows_per_round = + (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0); + size_t trace_index = (msm_row_indices[i] - 1) * 4; + + for (size_t j = 0; j < num_rounds; ++j) { + const uint32_t pc = static_cast(pc_indices[i]); + + for (size_t k = 0; k < rows_per_round; ++k) { + const size_t points_per_row = + (k + 1) * ADDITIONS_PER_ROW > msm_size ? msm_size % ADDITIONS_PER_ROW : ADDITIONS_PER_ROW; + auto& row = msm_state[msm_row_index]; + const size_t idx = k * ADDITIONS_PER_ROW; + row.msm_transition = (j == 0) && (k == 0); + for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) { - for (size_t m = 0; m < 4; ++m) { auto& add_state = row.add_state[m]; add_state.add = points_per_row > m; - add_state.slice = add_state.add ? msm[idx + m].wnaf_skew ? 7 : 0 : 0; - + int slice = add_state.add ? 
msm[idx + m].wnaf_slices[j] : 0; + // In the MSM columns in the ECCVM circuit, we can add up to 4 points per row. + // if `row.add_state[m].add = 1`, this indicates that we want to add the `m`'th point in + // the MSM columns into the MSM accumulator `add_state.slice` = A 4-bit WNAF slice of + // the scalar multiplier associated with the point we are adding (the specific slice + // chosen depends on the value of msm_round) (WNAF = windowed-non-adjacent-form. Value + // range is `-15, -13, + // ..., 15`) If `add_state.add = 1`, we want `add_state.slice` to be the *compressed* + // form of the WNAF slice value. (compressed = no gaps in the value range. i.e. -15, + // -13, ..., 15 maps to 0, ... , 15) + add_state.slice = add_state.add ? (slice + 15) / 2 : 0; add_state.point = add_state.add ? msm[idx + m].precomputed_table[static_cast(add_state.slice)] : AffineElement{ 0, 0 }; - bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false; - if (add_state.add) { - update_read_counts(pc - idx - m, msm[idx + m].wnaf_skew ? -1 : -15); - } - acc = add_points( - acc, add_state.point, add_state.lambda, add_state.collision_inverse, add_predicate); - acc_expected = add_predicate ? (acc_expected + add_state.point) : acc_expected; - ASSERT(acc == AffineElement(acc_expected)); + + // predicate logic: + // add_predicate should normally equal add_state.add + // However! if j == 0 AND k == 0 AND m == 0 this implies we are examining the 1st point + // addition of a new MSM. In this case, we do NOT add the 1st point into the accumulator, + // instead we SET the accumulator to equal the 1st point. add_predicate is used to + // determine whether we add the output of a point addition into the accumulator, + // therefore if j == 0 AND k == 0 AND m == 0, add_predicate = 0 even if add_state.add = + // true + bool add_predicate = (m == 0 ? (j != 0 || k != 0) : add_state.add); + + Element p1 = (m == 0) ? Element(add_state.point) : accumulator; + Element p2 = (m == 0) ?
accumulator : Element(add_state.point); + + accumulator = add_predicate ? (accumulator + add_state.point) : Element(p1); + p1_trace[trace_index] = p1; + p2_trace[trace_index] = p2; + p3_trace[trace_index] = accumulator; + operation_trace[trace_index] = false; + trace_index++; } - row.q_add = false; + accumulator_trace[msm_row_index] = accumulator; + row.q_add = true; row.q_double = false; - row.q_skew = true; - row.msm_round = static_cast(j + 1); + row.q_skew = false; + row.msm_round = static_cast(j); row.msm_size = static_cast(msm_size); row.msm_count = static_cast(idx); + row.pc = pc; + msm_row_index++; + } + // doubling + if (j < num_rounds - 1) { + auto& row = msm_state[msm_row_index]; + row.msm_transition = false; + row.msm_round = static_cast(j + 1); + row.msm_size = static_cast(msm_size); + row.msm_count = static_cast(0); + row.q_add = false; + row.q_double = true; + row.q_skew = false; + for (size_t m = 0; m < 4; ++m) { - row.accumulator_x = accumulator.is_point_at_infinity() ? 0 : accumulator.x; - row.accumulator_y = accumulator.is_point_at_infinity() ? 0 : accumulator.y; + auto& add_state = row.add_state[m]; + add_state.add = false; + add_state.slice = 0; + add_state.point = { 0, 0 }; + add_state.collision_inverse = 0; - row.pc = pc; - accumulator = acc; - msm_state.emplace_back(row); + p1_trace[trace_index] = accumulator; + p2_trace[trace_index] = accumulator; + accumulator = accumulator.dbl(); + p3_trace[trace_index] = accumulator; + operation_trace[trace_index] = true; + trace_index++; + } + accumulator_trace[msm_row_index] = accumulator; + msm_row_index++; + } else { + for (size_t k = 0; k < rows_per_round; ++k) { + auto& row = msm_state[msm_row_index]; + + const size_t points_per_row = (k + 1) * ADDITIONS_PER_ROW > msm_size + ? 
msm_size % ADDITIONS_PER_ROW + : ADDITIONS_PER_ROW; + const size_t idx = k * ADDITIONS_PER_ROW; + row.msm_transition = false; + + Element acc_expected = accumulator; + + for (size_t m = 0; m < 4; ++m) { + auto& add_state = row.add_state[m]; + add_state.add = points_per_row > m; + add_state.slice = add_state.add ? msm[idx + m].wnaf_skew ? 7 : 0 : 0; + + add_state.point = + add_state.add ? msm[idx + m].precomputed_table[static_cast(add_state.slice)] + : AffineElement{ 0, 0 }; + bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false; + auto p1 = accumulator; + accumulator = add_predicate ? accumulator + add_state.point : accumulator; + p1_trace[trace_index] = p1; + p2_trace[trace_index] = add_state.point; + p3_trace[trace_index] = accumulator; + operation_trace[trace_index] = false; + trace_index++; + } + row.q_add = false; + row.q_double = false; + row.q_skew = true; + row.msm_round = static_cast(j + 1); + row.msm_size = static_cast(msm_size); + row.msm_count = static_cast(idx); + row.pc = pc; + accumulator_trace[msm_row_index] = accumulator; + msm_row_index++; + } } } } - pc -= static_cast(msm_size); - // Validate our computed accumulator matches the real MSM result! - Element expected = CycleGroup::point_at_infinity; - for (size_t i = 0; i < msm.size(); ++i) { - expected += (Element(msm[i].base_point) * msm[i].scalar); + }); + + // Normalize the points in the point trace + run_loop_in_parallel(point_trace.size(), [&](size_t start, size_t end) { + Element::batch_normalize(&point_trace[start], end - start); + }); + + // inverse_trace is used to compute the value of the `collision_inverse` column in the ECCVM. 
+ std::vector inverse_trace(num_point_adds_and_doubles); + run_loop_in_parallel(num_point_adds_and_doubles, [&](size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + if (operation_trace[i]) { + inverse_trace[i] = (p1_trace[i].y + p1_trace[i].y); + } else { + inverse_trace[i] = (p2_trace[i].x - p1_trace[i].x); + } } - // Validate the accumulator is correct! - ASSERT(accumulator == AffineElement(expected)); - } + FF::batch_invert(&inverse_trace[start], end - start); + }); + + // complete the computation of the ECCVM execution trace, by adding the affine intermediate point data + // i.e. row.accumulator_x, row.accumulator_y, row.add_state[0...3].collision_inverse, + // row.add_state[0...3].lambda + run_loop_in_parallel(msms.size(), [&](size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + const auto& msm = msms[i]; + size_t trace_index = ((msm_row_indices[i] - 1) * ADDITIONS_PER_ROW); + size_t msm_row_index = msm_row_indices[i]; + // 1st MSM row will have accumulator equal to the previous MSM output + // (or point at infinity for 1st MSM) + size_t accumulator_index = msm_row_indices[i] - 1; + const size_t msm_size = msm.size(); + const size_t rows_per_round = + (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0); + + for (size_t j = 0; j < num_rounds; ++j) { + for (size_t k = 0; k < rows_per_round; ++k) { + auto& row = msm_state[msm_row_index]; + const Element& normalized_accumulator = accumulator_trace[accumulator_index]; + const FF& acc_x = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x; + const FF& acc_y = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y; + row.accumulator_x = acc_x; + row.accumulator_y = acc_y; + + for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) { + auto& add_state = row.add_state[m]; + bool add_predicate = (m == 0 ? 
(j != 0 || k != 0) : add_state.add); + + const auto& inverse = inverse_trace[trace_index]; + const auto& p1 = p1_trace[trace_index]; + const auto& p2 = p2_trace[trace_index]; + add_state.collision_inverse = add_predicate ? inverse : 0; + add_state.lambda = add_predicate ? (p2.y - p1.y) * inverse : 0; + trace_index++; + } + accumulator_index++; + msm_row_index++; + } + + if (j < num_rounds - 1) { + MSMState& row = msm_state[msm_row_index]; + const Element& normalized_accumulator = accumulator_trace[accumulator_index]; + const FF& acc_x = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x; + const FF& acc_y = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y; + row.accumulator_x = acc_x; + row.accumulator_y = acc_y; + + for (size_t m = 0; m < 4; ++m) { + auto& add_state = row.add_state[m]; + add_state.collision_inverse = 0; + const FF& dx = p1_trace[trace_index].x; + const FF& inverse = inverse_trace[trace_index]; + add_state.lambda = ((dx + dx + dx) * dx) * inverse; + trace_index++; + } + accumulator_index++; + msm_row_index++; + } else { + for (size_t k = 0; k < rows_per_round; ++k) { + MSMState& row = msm_state[msm_row_index]; + const Element& normalized_accumulator = accumulator_trace[accumulator_index]; + + const size_t idx = k * ADDITIONS_PER_ROW; + + const FF& acc_x = + normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x; + const FF& acc_y = + normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y; + row.accumulator_x = acc_x; + row.accumulator_y = acc_y; + + for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) { + auto& add_state = row.add_state[m]; + bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false; + + const auto& inverse = inverse_trace[trace_index]; + const auto& p1 = p1_trace[trace_index]; + const auto& p2 = p2_trace[trace_index]; + add_state.collision_inverse = add_predicate ? inverse : 0; + add_state.lambda = add_predicate ? 
(p2.y - p1.y) * inverse : 0; + trace_index++; + } + accumulator_index++; + msm_row_index++; + } + } + } + } + }); - MSMState final_row; - final_row.pc = pc; + // populate the final row in the MSM execution trace. + // we always require 1 extra row at the end of the trace, because the accumulator x/y coordinates for row `i` + // are present at row `i+1` + Element final_accumulator(accumulator_trace.back()); + MSMState& final_row = msm_state.back(); + final_row.pc = static_cast(pc_indices.back()); final_row.msm_transition = true; - final_row.accumulator_x = accumulator.is_point_at_infinity() ? 0 : accumulator.x; - final_row.accumulator_y = accumulator.is_point_at_infinity() ? 0 : accumulator.y; + final_row.accumulator_x = final_accumulator.is_point_at_infinity() ? 0 : final_accumulator.x; + final_row.accumulator_y = final_accumulator.is_point_at_infinity() ? 0 : final_accumulator.y; final_row.msm_size = 0; final_row.msm_count = 0; final_row.q_add = false; @@ -275,7 +475,6 @@ template class ECCVMMSMMBuilder { typename MSMState::AddState{ false, 0, AffineElement{ 0, 0 }, 0, 0 }, typename MSMState::AddState{ false, 0, AffineElement{ 0, 0 }, 0, 0 } }; - msm_state.emplace_back(final_row); return msm_state; } }; diff --git a/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp b/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp index 3443fbdd2e0..ee9430eedff 100644 --- a/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp +++ b/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp @@ -46,6 +46,15 @@ class ECCOpQueue { std::array ultra_ops_commitments; + // as we populate the op_queue, we track the number of rows in each circuit section, + // as well as the number of multiplications performed. 
+ // This is to avoid expensive O(n) logic to compute the number of rows and muls during witness computation + uint32_t cached_num_muls = 0; + uint32_t cached_active_msm_count = 0; + uint32_t num_transcript_rows = 0; + uint32_t num_precompute_table_rows = 0; + uint32_t num_msm_rows = 0; + Point get_accumulator() { return accumulator; } /** @@ -56,6 +65,20 @@ class ECCOpQueue { */ void prepend_previous_queue(const ECCOpQueue& previous) { + if (!previous.raw_ops.empty() && !raw_ops.empty()) { + // Check we are not merging an op queue that does not reset the accumulator! + // Note - eccvm does not directly constrain this to not happen. If we need such checks they need to be + // applied when the transcript is being written into + ASSERT(previous.raw_ops.back().eq || previous.raw_ops.back().reset); + } + // We shouldn't be merging if there is a previous active msm! + ASSERT(previous.cached_active_msm_count == 0); + + cached_num_muls += previous.cached_num_muls; + num_msm_rows += previous.num_msm_rows; + num_precompute_table_rows += previous.num_precompute_table_rows; + num_transcript_rows += previous.num_transcript_rows; + // Allocate enough space std::vector raw_ops_updated(raw_ops.size() + previous.raw_ops.size()); // Copy the previous raw ops to the beginning of the new vector @@ -116,6 +139,12 @@ class ECCOpQueue { auto commit_temp = lhs.ultra_ops_commitments; lhs.ultra_ops_commitments = rhs.ultra_ops_commitments; rhs.ultra_ops_commitments = commit_temp; + + std::swap(lhs.cached_num_muls, rhs.cached_num_muls); + std::swap(lhs.cached_active_msm_count, rhs.cached_active_msm_count); + std::swap(lhs.num_transcript_rows, rhs.num_transcript_rows); + std::swap(lhs.num_precompute_table_rows, rhs.num_precompute_table_rows); + std::swap(lhs.num_msm_rows, rhs.num_msm_rows); } /** @@ -190,6 +219,93 @@ class ECCOpQueue { this->set_commitment_data(mock_op_queue_commitments); } + /** + * @brief Get the number of rows in the 'msm' column section of the ECCVM, associated with a single
multiscalar mul + * + * @param msm_count + * @return uint32_t + */ + static uint32_t get_msm_row_count_for_single_msm(const size_t msm_count) + { + const size_t rows_per_round = + (msm_count / eccvm::ADDITIONS_PER_ROW) + (msm_count % eccvm::ADDITIONS_PER_ROW != 0 ? 1 : 0); + constexpr size_t num_rounds = eccvm::NUM_SCALAR_BITS / eccvm::WNAF_SLICE_BITS; + const size_t num_rows_for_all_rounds = (num_rounds + 1) * rows_per_round; // + 1 round for skew + const size_t num_double_rounds = num_rounds - 1; + const size_t num_rows_for_msm = num_rows_for_all_rounds + num_double_rounds; + + return static_cast(num_rows_for_msm); + } + + /** + * @brief Get the precompute table row count for single msm object + * + * @param msm_count + * @return uint32_t + */ + static uint32_t get_precompute_table_row_count_for_single_msm(const size_t msm_count) + { + constexpr size_t num_precompute_rows_per_scalar = eccvm::NUM_WNAF_SLICES / eccvm::WNAF_SLICES_PER_ROW; + const size_t num_rows_for_precompute_table = msm_count * num_precompute_rows_per_scalar; + return static_cast(num_rows_for_precompute_table); + } + + /** + * @brief Get the number of rows in the 'msm' column section, for all msms in the circuit + * + * @return size_t + */ + size_t get_num_msm_rows() const + { + size_t msm_rows = num_msm_rows + 2; + if (cached_active_msm_count > 0) { + msm_rows += get_msm_row_count_for_single_msm(cached_active_msm_count); + } + return msm_rows; + } + + /** + * @brief Get the number of rows for the current ECCVM circuit + * + * @return size_t + */ + size_t get_num_rows() const + { + // add 1 row to start and end of transcript and msm sections + const size_t transcript_rows = num_transcript_rows + 2; + size_t msm_rows = num_msm_rows + 2; + // add 1 row to start of precompute table section + size_t precompute_rows = num_precompute_table_rows + 1; + if (cached_active_msm_count > 0) { + msm_rows += get_msm_row_count_for_single_msm(cached_active_msm_count); + precompute_rows += 
get_precompute_table_row_count_for_single_msm(cached_active_msm_count); + } + + return std::max(transcript_rows, std::max(msm_rows, precompute_rows)); + } + + /** + * @brief When inserting an operation, update the number of multiplications in the active (latest) multiscalar mul; a non-mul op finalizes it into the cached row/mul totals + * + * @param op + */ + void update_cached_msms(const ECCVMOperation& op) + { + if (op.mul) { + if (op.z1 != 0) { + cached_active_msm_count++; + } + if (op.z2 != 0) { + cached_active_msm_count++; + } + } else if (cached_active_msm_count != 0) { + num_msm_rows += get_msm_row_count_for_single_msm(cached_active_msm_count); + num_precompute_table_rows += get_precompute_table_row_count_for_single_msm(cached_active_msm_count); + cached_num_muls += cached_active_msm_count; + cached_active_msm_count = 0; + } + } + /** * @brief Write point addition op to queue and natively perform addition * @@ -211,6 +327,8 @@ class ECCOpQueue { .z2 = 0, .mul_scalar_full = 0, }); + num_transcript_rows += 1; + update_cached_msms(raw_ops.back()); } /** @@ -240,6 +358,9 @@ class ECCOpQueue { .z2 = z2, .mul_scalar_full = scalar, }); + num_transcript_rows += 1; + + update_cached_msms(raw_ops.back()); } /** @@ -262,7 +383,9 @@ class ECCOpQueue { .z2 = 0, .mul_scalar_full = 0, }); + num_transcript_rows += 1; + update_cached_msms(raw_ops.back()); return expected; } @@ -282,6 +405,9 @@ class ECCOpQueue { .z2 = 0, .mul_scalar_full = 0, }); + num_transcript_rows += 1; + + update_cached_msms(raw_ops.back()); } /** From c345b096e66e9d254ce5db0397f7386ca81f3ff9 Mon Sep 17 00:00:00 2001 From: zac-williamson Date: Thu, 14 Mar 2024 12:57:28 +0000 Subject: [PATCH 2/4] removed more eccvm redundant inverses, multithreaded eccvm table precomputation --- .../eccvm/eccvm_builder_types.hpp | 3 +- .../eccvm/eccvm_circuit_builder.hpp | 122 ++++++++++++----- .../eccvm/precomputed_tables_builder.hpp | 123 +++++++++--------- .../eccvm/transcript_builder.hpp | 24 ++-- 4 files changed, 165 insertions(+), 107 deletions(-) diff --git
a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_builder_types.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_builder_types.hpp index 96873b6fd92..9ba785657d9 100644 --- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_builder_types.hpp +++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_builder_types.hpp @@ -40,7 +40,8 @@ template struct ScalarMul { typename CycleGroup::affine_element base_point; std::array wnaf_slices; bool wnaf_skew; - std::array precomputed_table; + // size bumped by 1 to record base_point.dbl() + std::array precomputed_table; }; template using MSM = std::vector>; diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp index bb78f8a413d..f4084ca4416 100644 --- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp @@ -57,7 +57,8 @@ template class ECCVMCircuitBuilder { */ const auto compute_precomputed_table = [](const AffineElement& base_point) { const auto d2 = Element(base_point).dbl(); - std::array table; + std::array table; + table[POINT_TABLE_SIZE] = d2; // need this for later table[POINT_TABLE_SIZE / 2] = base_point; for (size_t i = 1; i < POINT_TABLE_SIZE / 2; ++i) { table[i + POINT_TABLE_SIZE / 2] = Element(table[i + POINT_TABLE_SIZE / 2 - 1]) + d2; @@ -66,10 +67,10 @@ template class ECCVMCircuitBuilder { table[i] = -table[POINT_TABLE_SIZE - 1 - i]; } - Element::batch_normalize(&table[0], POINT_TABLE_SIZE); - std::array result; - for (size_t i = 0; i < POINT_TABLE_SIZE; ++i) { - result[i] = AffineElement{ .x = table[i].x, .y = table[i].y }; + Element::batch_normalize(&table[0], POINT_TABLE_SIZE + 1); + std::array result; + for 
(size_t i = 0; i < POINT_TABLE_SIZE + 1; ++i) { + result[i] = AffineElement(table[i].x, table[i].y); } return result; }; @@ -111,9 +112,83 @@ template class ECCVMCircuitBuilder { return output; }; - std::vector msms; - std::vector active_msm; + // a vector of MSMs = a vector of a vector of scalar muls + // each mul + size_t msm_count = 0; + size_t active_mul_count = 0; + std::vector msm_opqueue_index; + std::vector> msm_mul_index; + std::vector msm_sizes; + + // std::vector> msm_indices; + // std::vector active_msm_indices; + for (size_t i = 0; i < op_queue->raw_ops.size(); ++i) { + const auto& op = op_queue->raw_ops[i]; + if (op.mul) { + if (op.z1 != 0 || op.z2 != 0) { + msm_opqueue_index.push_back(i); + msm_mul_index.emplace_back(msm_count, active_mul_count); + } + if (op.z1 != 0) { + active_mul_count++; + } + if (op.z2 != 0) { + active_mul_count++; + } + } else if (active_mul_count > 0) { + msm_sizes.push_back(active_mul_count); + msm_count++; + active_mul_count = 0; + } + } + // if last op is a mul we have not correctly computed the total number of msms + if (op_queue->raw_ops.back().mul) { + msm_sizes.push_back(active_mul_count); + msm_count++; + } + std::vector msms_test(msm_count); + for (size_t i = 0; i < msm_count; ++i) { + auto& msm = msms_test[i]; + msm.resize(msm_sizes[i]); + } + + run_loop_in_parallel(msm_opqueue_index.size(), [&](size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + // for (size_t i = 0; i < msm_opqueue_index.size(); ++i) { + const size_t opqueue_index = msm_opqueue_index[i]; + const auto& op = op_queue->raw_ops[opqueue_index]; + auto [msm_index, mul_index] = msm_mul_index[i]; + if (op.z1 != 0) { + ASSERT(msms_test.size() > msm_index); + ASSERT(msms_test[msm_index].size() > mul_index); + msms_test[msm_index][mul_index] = (ScalarMul{ + .pc = 0, + .scalar = op.z1, + .base_point = op.base_point, + .wnaf_slices = compute_wnaf_slices(op.z1), + .wnaf_skew = (op.z1 & 1) == 0, + .precomputed_table = 
compute_precomputed_table(op.base_point), + }); + mul_index++; + } + if (op.z2 != 0) { + ASSERT(msms_test.size() > msm_index); + ASSERT(msms_test[msm_index].size() > mul_index); + auto endo_point = AffineElement{ op.base_point.x * FF::cube_root_of_unity(), -op.base_point.y }; + msms_test[msm_index][mul_index] = (ScalarMul{ + .pc = 0, + .scalar = op.z2, + .base_point = endo_point, + .wnaf_slices = compute_wnaf_slices(op.z2), + .wnaf_skew = (op.z2 & 1) == 0, + .precomputed_table = compute_precomputed_table(endo_point), + }); + } + } + }); + + // update pc. easier to do this serially but in theory could be optimised out // We start pc at `num_muls` and decrement for each mul processed. // This gives us two desired properties: // 1: the value of pc at the 1st row = number of muls (easy to check) @@ -122,40 +197,15 @@ template class ECCVMCircuitBuilder { // sumcheck relations that involve pc (if we did the other way around, starting at 1 and ending at num_muls, // we create a discontinuity in pc values between the last transcript row and the following empty row) uint32_t pc = num_muls; - - const auto process_mul = [&active_msm, &pc, &compute_wnaf_slices, &compute_precomputed_table]( - const auto& scalar, const auto& base_point) { - if (scalar != 0) { - active_msm.push_back(ScalarMul{ - .pc = pc, - .scalar = scalar, - .base_point = base_point, - .wnaf_slices = compute_wnaf_slices(scalar), - .wnaf_skew = (scalar & 1) == 0, - .precomputed_table = compute_precomputed_table(base_point), - }); + for (auto& msm : msms_test) { + for (auto& mul : msm) { + mul.pc = pc; pc--; } - }; - - for (auto& op : op_queue->raw_ops) { - if (op.mul) { - process_mul(op.z1, op.base_point); - process_mul(op.z2, AffineElement{ op.base_point.x * FF::cube_root_of_unity(), -op.base_point.y }); - - } else { - if (!active_msm.empty()) { - msms.push_back(active_msm); - active_msm = {}; - } - } - } - if (!active_msm.empty()) { - msms.push_back(active_msm); } ASSERT(pc == 0); - return msms; + return 
msms_test; } static std::vector get_flattened_scalar_muls(const std::vector& msms) diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/precomputed_tables_builder.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/precomputed_tables_builder.hpp index 1c7d2bb443e..8924edac6ca 100644 --- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/precomputed_tables_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/precomputed_tables_builder.hpp @@ -36,75 +36,76 @@ template class ECCVMPrecomputedTablesBuilder { static std::vector compute_precompute_state( const std::vector>& ecc_muls) { - std::vector precompute_state; + static constexpr size_t num_rows_per_scalar = NUM_WNAF_SLICES / WNAF_SLICES_PER_ROW; + const size_t num_precompute_rows = num_rows_per_scalar * ecc_muls.size() + 1; + std::vector precompute_state(num_precompute_rows); // start with empty row (shiftable polynomials must have 0 as first coefficient) - precompute_state.push_back(PrecomputeState{}); - static constexpr size_t num_rows_per_scalar = NUM_WNAF_SLICES / WNAF_SLICES_PER_ROW; + precompute_state[0] = PrecomputeState{}; // current impl doesn't work if not 4 static_assert(WNAF_SLICES_PER_ROW == 4); - for (const auto& entry : ecc_muls) { - const auto& slices = entry.wnaf_slices; - uint256_t scalar_sum = 0; - - const Element point = entry.base_point; - const Element d2 = point.dbl(); - - for (size_t i = 0; i < num_rows_per_scalar; ++i) { - PrecomputeState row; - const int slice0 = slices[i * WNAF_SLICES_PER_ROW]; - const int slice1 = slices[i * WNAF_SLICES_PER_ROW + 1]; - const int slice2 = slices[i * WNAF_SLICES_PER_ROW + 2]; - const int slice3 = slices[i * WNAF_SLICES_PER_ROW + 3]; - - const int slice0base2 = (slice0 + 15) / 2; - const int slice1base2 = (slice1 + 15) / 2; - const int slice2base2 = (slice2 + 15) / 2; - const int slice3base2 = (slice3 + 15) / 2; - - // convert into 2-bit chunks - row.s1 
= slice0base2 >> 2; - row.s2 = slice0base2 & 3; - row.s3 = slice1base2 >> 2; - row.s4 = slice1base2 & 3; - row.s5 = slice2base2 >> 2; - row.s6 = slice2base2 & 3; - row.s7 = slice3base2 >> 2; - row.s8 = slice3base2 & 3; - bool last_row = (i == num_rows_per_scalar - 1); - - row.skew = last_row ? entry.wnaf_skew : false; - - row.scalar_sum = scalar_sum; - - // N.B. we apply a constraint that requires slice1 to be positive for the 1st row of each scalar sum. - // This ensures we do not have WNAF representations of negative values - const int row_chunk = slice3 + slice2 * (1 << 4) + slice1 * (1 << 8) + slice0 * (1 << 12); - - bool chunk_negative = row_chunk < 0; - - scalar_sum = scalar_sum << (WNAF_SLICE_BITS * WNAF_SLICES_PER_ROW); - if (chunk_negative) { - scalar_sum -= static_cast(-row_chunk); - } else { - scalar_sum += static_cast(row_chunk); + run_loop_in_parallel(ecc_muls.size(), [&](size_t start, size_t end) { + for (size_t j = start; j < end; j++) { + const auto& entry = ecc_muls[j]; + const auto& slices = entry.wnaf_slices; + uint256_t scalar_sum = 0; + + for (size_t i = 0; i < num_rows_per_scalar; ++i) { + PrecomputeState row; + const int slice0 = slices[i * WNAF_SLICES_PER_ROW]; + const int slice1 = slices[i * WNAF_SLICES_PER_ROW + 1]; + const int slice2 = slices[i * WNAF_SLICES_PER_ROW + 2]; + const int slice3 = slices[i * WNAF_SLICES_PER_ROW + 3]; + + const int slice0base2 = (slice0 + 15) / 2; + const int slice1base2 = (slice1 + 15) / 2; + const int slice2base2 = (slice2 + 15) / 2; + const int slice3base2 = (slice3 + 15) / 2; + + // convert into 2-bit chunks + row.s1 = slice0base2 >> 2; + row.s2 = slice0base2 & 3; + row.s3 = slice1base2 >> 2; + row.s4 = slice1base2 & 3; + row.s5 = slice2base2 >> 2; + row.s6 = slice2base2 & 3; + row.s7 = slice3base2 >> 2; + row.s8 = slice3base2 & 3; + bool last_row = (i == num_rows_per_scalar - 1); + + row.skew = last_row ? entry.wnaf_skew : false; + + row.scalar_sum = scalar_sum; + + // N.B. 
we apply a constraint that requires slice1 to be positive for the 1st row of each scalar + // sum. This ensures we do not have WNAF representations of negative values + const int row_chunk = slice3 + slice2 * (1 << 4) + slice1 * (1 << 8) + slice0 * (1 << 12); + + bool chunk_negative = row_chunk < 0; + + scalar_sum = scalar_sum << (WNAF_SLICE_BITS * WNAF_SLICES_PER_ROW); + if (chunk_negative) { + scalar_sum -= static_cast(-row_chunk); + } else { + scalar_sum += static_cast(row_chunk); + } + row.round = static_cast(i); + row.point_transition = last_row; + row.pc = entry.pc; + + if (last_row) { + ASSERT(scalar_sum - entry.wnaf_skew == entry.scalar); + } + + row.precompute_double = entry.precomputed_table[bb::eccvm::POINT_TABLE_SIZE]; + // fill accumulator in reverse order i.e. first row = 15[P], then 13[P], ..., 1[P] + row.precompute_accumulator = entry.precomputed_table[bb::eccvm::POINT_TABLE_SIZE - 1 - i]; + precompute_state[j * num_rows_per_scalar + i + 1] = (row); } - row.round = static_cast(i); - row.point_transition = last_row; - row.pc = entry.pc; - - if (last_row) { - ASSERT(scalar_sum - entry.wnaf_skew == entry.scalar); - } - - row.precompute_double = d2; - // fill accumulator in reverse order i.e. 
first row = 15[P], then 13[P], ..., 1[P] - row.precompute_accumulator = entry.precomputed_table[bb::eccvm::POINT_TABLE_SIZE - 1 - i]; - precompute_state.emplace_back(row); } - } + }); return precompute_state; } }; diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/transcript_builder.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/transcript_builder.hpp index 1ff3d8b4cba..69ea505b242 100644 --- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/transcript_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/transcript_builder.hpp @@ -60,7 +60,10 @@ template class ECCVMTranscriptBuilder { static std::vector compute_transcript_state( const std::vector>& vm_operations, const uint32_t total_number_of_muls) { - std::vector transcript_state; + const size_t num_transcript_entries = vm_operations.size() + 2; + + std::vector transcript_state(num_transcript_entries); + std::vector inverse_trace(num_transcript_entries - 2); VMState state{ .pc = total_number_of_muls, .count = 0, @@ -69,11 +72,10 @@ template class ECCVMTranscriptBuilder { .is_accumulator_empty = true, }; VMState updated_state; - // add an empty row. 1st row all zeroes because of our shiftable polynomials - transcript_state.emplace_back(TranscriptState{}); + transcript_state[0] = (TranscriptState{}); for (size_t i = 0; i < vm_operations.size(); ++i) { - TranscriptState row; + TranscriptState& row = transcript_state[i + 1]; const bb::eccvm::VMOperation& entry = vm_operations[i]; const bool is_mul = entry.mul; @@ -158,11 +160,13 @@ template class ECCVMTranscriptBuilder { ASSERT((row.msm_output_x != row.accumulator_x) && "eccvm: attempting msm. 
Result point x-coordinate matches accumulator x-coordinate."); state.msm_accumulator = CycleGroup::affine_point_at_infinity; - row.collision_check = (row.msm_output_x - row.accumulator_x).invert(); + inverse_trace[i] = (row.msm_output_x - row.accumulator_x); } else if (entry.add && !row.accumulator_empty) { ASSERT((row.base_x != row.accumulator_x) && "eccvm: attempting to add points with matching x-coordinates"); - row.collision_check = (row.base_x - row.accumulator_x).invert(); + inverse_trace[i] = (row.base_x - row.accumulator_x); + } else { + inverse_trace[i] = (0); } state = updated_state; @@ -170,16 +174,18 @@ template class ECCVMTranscriptBuilder { if (entry.mul && next_not_msm) { state.msm_accumulator = CycleGroup::affine_point_at_infinity; } - transcript_state.emplace_back(row); } - TranscriptState final_row; + FF::batch_invert(&inverse_trace[0], inverse_trace.size()); + for (size_t i = 0; i < inverse_trace.size(); ++i) { + transcript_state[i + 1].collision_check = inverse_trace[i]; + } + TranscriptState& final_row = transcript_state.back(); final_row.pc = updated_state.pc; final_row.accumulator_x = (updated_state.accumulator.is_point_at_infinity()) ? 0 : updated_state.accumulator.x; final_row.accumulator_y = (updated_state.accumulator.is_point_at_infinity()) ? 
0 : updated_state.accumulator.y; final_row.accumulator_empty = updated_state.is_accumulator_empty; - transcript_state.push_back(final_row); return transcript_state; } }; From 59966725fa04b84d519d638896f25de004d15cca Mon Sep 17 00:00:00 2001 From: zac-williamson Date: Fri, 15 Mar 2024 17:47:59 +0000 Subject: [PATCH 3/4] fixed ecc op queue test --- .../proof_system/op_queue/ecc_op_queue.hpp | 24 +++++++++++++++++++ .../op_queue/ecc_op_queue.test.cpp | 5 +++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp b/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp index ee9430eedff..e4ebd9719ea 100644 --- a/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp +++ b/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp @@ -389,6 +389,30 @@ class ECCOpQueue { return expected; } + /** + * @brief Write reset op (`.reset = true`) to queue and set the internal accumulator to infinity via `self_set_infinity()` + * + * @note returns nothing (the function is `void`) + */ + void reset() + { + accumulator.self_set_infinity(); + + raw_ops.emplace_back(ECCVMOperation{ + .add = false, + .mul = false, + .eq = false, + .reset = true, + .base_point = { 0, 0 }, + .z1 = 0, + .z2 = 0, + .mul_scalar_full = 0, + }); + num_transcript_rows += 1; + + update_cached_msms(raw_ops.back()); + } + /** * @brief Write empty row to queue * diff --git a/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.test.cpp b/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.test.cpp index d7a69547f1b..265727f4dd0 100644 --- a/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.test.cpp +++ b/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.test.cpp @@ -52,16 +52,18 @@ TEST(ECCOpQueueTest, PrependAndSwapTests) ECCOpQueue op_queue_a; op_queue_a.add_accumulate(P1 + P1); op_queue_a.mul_accumulate(P2, z + z); - + op_queue_a.reset(); // Add different
operations to b ECCOpQueue op_queue_b; op_queue_b.mul_accumulate(P2, z); op_queue_b.add_accumulate(P1); + op_queue_b.reset(); // Add same operations as to a ECCOpQueue op_queue_c; op_queue_c.add_accumulate(P1 + P1); op_queue_c.mul_accumulate(P2, z + z); + op_queue_c.reset(); // Swap b with a std::swap(op_queue_b, op_queue_a); @@ -77,6 +79,7 @@ TEST(ECCOpQueueTest, PrependAndSwapTests) // Append same operations as now in a to c op_queue_c.mul_accumulate(P2, z); op_queue_c.add_accumulate(P1); + op_queue_c.reset(); // Check a==c for (size_t i = 0; i < op_queue_c.raw_ops.size(); i++) { From 25d8a3c0ed3dc5ce045c3b2c61d6cd1aa9b5af6b Mon Sep 17 00:00:00 2001 From: codygunton Date: Mon, 18 Mar 2024 11:05:49 +0000 Subject: [PATCH 4/4] Analysis no longer needed --- barretenberg/cpp/scripts/analyze_client_ivc_bench.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/barretenberg/cpp/scripts/analyze_client_ivc_bench.py b/barretenberg/cpp/scripts/analyze_client_ivc_bench.py index 6cedf3509e7..af374d05a5d 100644 --- a/barretenberg/cpp/scripts/analyze_client_ivc_bench.py +++ b/barretenberg/cpp/scripts/analyze_client_ivc_bench.py @@ -49,13 +49,6 @@ time_ms = bench[key]/1e6 print(f"{key:<{max_label_length}}{time_ms:>8.0f} {time_ms/sum_of_kept_times_ms:>8.2%}") - -print('\nBreakdown of ECCVMProver::create_prover:') -for key in ["ECCVMComposer::compute_witness(t)", "ECCVMComposer::create_proving_key(t)"]: - time_ms = bench[key]/1e6 - total_time_ms = bench["ECCVMComposer::create_prover(t)"]/1e6 - print(f"{key:<{max_label_length}}{time_ms:>8.0f} {time_ms/total_time_ms:>8.2%}") - print('\nBreakdown of ProtogalaxyProver::fold_instances:') protogalaxy_round_labels = [ "ProtoGalaxyProver_::preparation_round(t)",