From 21b30c4511f5feca5c020b83219dafba1b0ac3a6 Mon Sep 17 00:00:00 2001
From: zac-williamson <blorktronics@gmail.com>
Date: Thu, 14 Mar 2024 11:53:01 +0000
Subject: [PATCH 1/4] multithreaded witness generation and removed redundant
 field inversions

---
 .../eccvm/eccvm_composer.test.cpp             |   1 +
 .../eccvm/eccvm_circuit_builder.hpp           | 278 +++++------
 .../circuit_builder/eccvm/msm_builder.hpp     | 463 +++++++++++++-----
 .../proof_system/op_queue/ecc_op_queue.hpp    | 126 +++++
 4 files changed, 589 insertions(+), 279 deletions(-)
diff --git a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_composer.test.cpp b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_composer.test.cpp
index f9e2b72f39b..5b7d207daf7 100644
--- a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_composer.test.cpp
+++ b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_composer.test.cpp
@@ -96,6 +96,7 @@ TYPED_TEST(ECCVMComposerTests, EqFails)
                                                            .z1 = 0,
                                                            .z2 = 0,
                                                            .mul_scalar_full = 0 });
+    builder.op_queue->num_transcript_rows++;
     auto composer = ECCVMComposer_<Flavor>();
     auto prover = composer.create_prover(builder);
 
diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp
index 88ad2e42908..bb78f8a413d 100644
--- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp
+++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp
@@ -46,18 +46,7 @@ template <typename Flavor> class ECCVMCircuitBuilder {
 
     [[nodiscard]] uint32_t get_number_of_muls() const
     {
-        uint32_t num_muls = 0;
-        for (auto& op : op_queue->raw_ops) {
-            if (op.mul) {
-                if (op.z1 != 0) {
-                    num_muls++;
-                }
-                if (op.z2 != 0) {
-                    num_muls++;
-                }
-            }
-        }
-        return num_muls;
+        return op_queue->cached_num_muls + op_queue->cached_active_msm_count; // O(1): no raw_ops scan
     }
 
     std::vector<MSM> get_msms() const
@@ -68,7 +57,7 @@ template <typename Flavor> class ECCVMCircuitBuilder {
          */
         const auto compute_precomputed_table = [](const AffineElement& base_point) {
             const auto d2 = Element(base_point).dbl();
-            std::array<AffineElement, POINT_TABLE_SIZE> table;
+            std::array<Element, POINT_TABLE_SIZE> table;
             table[POINT_TABLE_SIZE / 2] = base_point;
             for (size_t i = 1; i < POINT_TABLE_SIZE / 2; ++i) {
                 table[i + POINT_TABLE_SIZE / 2] = Element(table[i + POINT_TABLE_SIZE / 2 - 1]) + d2;
@@ -76,7 +65,13 @@ template <typename Flavor> class ECCVMCircuitBuilder {
             for (size_t i = 0; i < POINT_TABLE_SIZE / 2; ++i) {
                 table[i] = -table[POINT_TABLE_SIZE - 1 - i];
             }
-            return table;
+
+            Element::batch_normalize(&table[0], POINT_TABLE_SIZE);
+            std::array<AffineElement, POINT_TABLE_SIZE> result;
+            for (size_t i = 0; i < POINT_TABLE_SIZE; ++i) {
+                result[i] = AffineElement{ .x = table[i].x, .y = table[i].y };
+            }
+            return result;
         };
         const auto compute_wnaf_slices = [](uint256_t scalar) {
             std::array<int, NUM_WNAF_SLICES> output;
@@ -262,8 +257,8 @@ template <typename Flavor> class ECCVMCircuitBuilder {
             ECCVMTranscriptBuilder<Flavor>::compute_transcript_state(op_queue->raw_ops, get_number_of_muls());
         const auto precompute_table_state =
             ECCVMPrecomputedTablesBuilder<Flavor>::compute_precompute_state(flattened_muls);
-        const auto msm_state =
-            ECCVMMSMMBuilder<Flavor>::compute_msm_state(msms, point_table_read_counts, get_number_of_muls());
+        const auto msm_state = ECCVMMSMMBuilder<Flavor>::compute_msm_state(
+            msms, point_table_read_counts, get_number_of_muls(), op_queue->get_num_msm_rows());
 
         const size_t msm_size = msm_state.size();
         const size_t transcript_size = transcript_state.size();
@@ -293,28 +288,30 @@ template <typename Flavor> class ECCVMCircuitBuilder {
             polys.lookup_read_counts_0[i + 1] = point_table_read_counts[0][i];
             polys.lookup_read_counts_1[i + 1] = point_table_read_counts[1][i];
         }
-        for (size_t i = 0; i < transcript_state.size(); ++i) {
-            polys.transcript_accumulator_empty[i] = transcript_state[i].accumulator_empty;
-            polys.transcript_add[i] = transcript_state[i].q_add;
-            polys.transcript_mul[i] = transcript_state[i].q_mul;
-            polys.transcript_eq[i] = transcript_state[i].q_eq;
-            polys.transcript_reset_accumulator[i] = transcript_state[i].q_reset_accumulator;
-            polys.transcript_msm_transition[i] = transcript_state[i].msm_transition;
-            polys.transcript_pc[i] = transcript_state[i].pc;
-            polys.transcript_msm_count[i] = transcript_state[i].msm_count;
-            polys.transcript_Px[i] = transcript_state[i].base_x;
-            polys.transcript_Py[i] = transcript_state[i].base_y;
-            polys.transcript_z1[i] = transcript_state[i].z1;
-            polys.transcript_z2[i] = transcript_state[i].z2;
-            polys.transcript_z1zero[i] = transcript_state[i].z1_zero;
-            polys.transcript_z2zero[i] = transcript_state[i].z2_zero;
-            polys.transcript_op[i] = transcript_state[i].opcode;
-            polys.transcript_accumulator_x[i] = transcript_state[i].accumulator_x;
-            polys.transcript_accumulator_y[i] = transcript_state[i].accumulator_y;
-            polys.transcript_msm_x[i] = transcript_state[i].msm_output_x;
-            polys.transcript_msm_y[i] = transcript_state[i].msm_output_y;
-            polys.transcript_collision_check[i] = transcript_state[i].collision_check;
-        }
+        run_loop_in_parallel(transcript_state.size(), [&](size_t start, size_t end) {
+            for (size_t i = start; i < end; i++) {
+                polys.transcript_accumulator_empty[i] = transcript_state[i].accumulator_empty;
+                polys.transcript_add[i] = transcript_state[i].q_add;
+                polys.transcript_mul[i] = transcript_state[i].q_mul;
+                polys.transcript_eq[i] = transcript_state[i].q_eq;
+                polys.transcript_reset_accumulator[i] = transcript_state[i].q_reset_accumulator;
+                polys.transcript_msm_transition[i] = transcript_state[i].msm_transition;
+                polys.transcript_pc[i] = transcript_state[i].pc;
+                polys.transcript_msm_count[i] = transcript_state[i].msm_count;
+                polys.transcript_Px[i] = transcript_state[i].base_x;
+                polys.transcript_Py[i] = transcript_state[i].base_y;
+                polys.transcript_z1[i] = transcript_state[i].z1;
+                polys.transcript_z2[i] = transcript_state[i].z2;
+                polys.transcript_z1zero[i] = transcript_state[i].z1_zero;
+                polys.transcript_z2zero[i] = transcript_state[i].z2_zero;
+                polys.transcript_op[i] = transcript_state[i].opcode;
+                polys.transcript_accumulator_x[i] = transcript_state[i].accumulator_x;
+                polys.transcript_accumulator_y[i] = transcript_state[i].accumulator_y;
+                polys.transcript_msm_x[i] = transcript_state[i].msm_output_x;
+                polys.transcript_msm_y[i] = transcript_state[i].msm_output_y;
+                polys.transcript_collision_check[i] = transcript_state[i].collision_check;
+            }
+        });
 
         // TODO(@zac-williamson) if final opcode resets accumulator, all subsequent "is_accumulator_empty" row values
         // must be 1. Ideally we find a way to tweak this so that empty rows that do nothing have column values that are
@@ -324,97 +321,101 @@ template <typename Flavor> class ECCVMCircuitBuilder {
                 polys.transcript_accumulator_empty[i] = 1;
             }
         }
-        for (size_t i = 0; i < precompute_table_state.size(); ++i) {
-            // first row is always an empty row (to accommodate shifted polynomials which must have 0 as 1st
-            // coefficient). All other rows in the precompute_table_state represent active wnaf gates (i.e.
-            // precompute_select = 1)
-            polys.precompute_select[i] = (i != 0) ? 1 : 0;
-            polys.precompute_pc[i] = precompute_table_state[i].pc;
-            polys.precompute_point_transition[i] = static_cast<uint64_t>(precompute_table_state[i].point_transition);
-            polys.precompute_round[i] = precompute_table_state[i].round;
-            polys.precompute_scalar_sum[i] = precompute_table_state[i].scalar_sum;
-
-            polys.precompute_s1hi[i] = precompute_table_state[i].s1;
-            polys.precompute_s1lo[i] = precompute_table_state[i].s2;
-            polys.precompute_s2hi[i] = precompute_table_state[i].s3;
-            polys.precompute_s2lo[i] = precompute_table_state[i].s4;
-            polys.precompute_s3hi[i] = precompute_table_state[i].s5;
-            polys.precompute_s3lo[i] = precompute_table_state[i].s6;
-            polys.precompute_s4hi[i] = precompute_table_state[i].s7;
-            polys.precompute_s4lo[i] = precompute_table_state[i].s8;
-            // If skew is active (i.e. we need to subtract a base point from the msm result),
-            // write `7` into rows.precompute_skew. `7`, in binary representation, equals `-1` when converted into WNAF
-            // form
-            polys.precompute_skew[i] = precompute_table_state[i].skew ? 7 : 0;
-
-            polys.precompute_dx[i] = precompute_table_state[i].precompute_double.x;
-            polys.precompute_dy[i] = precompute_table_state[i].precompute_double.y;
-            polys.precompute_tx[i] = precompute_table_state[i].precompute_accumulator.x;
-            polys.precompute_ty[i] = precompute_table_state[i].precompute_accumulator.y;
-        }
-
-        for (size_t i = 0; i < msm_state.size(); ++i) {
-            polys.msm_transition[i] = static_cast<int>(msm_state[i].msm_transition);
-            polys.msm_add[i] = static_cast<int>(msm_state[i].q_add);
-            polys.msm_double[i] = static_cast<int>(msm_state[i].q_double);
-            polys.msm_skew[i] = static_cast<int>(msm_state[i].q_skew);
-            polys.msm_accumulator_x[i] = msm_state[i].accumulator_x;
-            polys.msm_accumulator_y[i] = msm_state[i].accumulator_y;
-            polys.msm_pc[i] = msm_state[i].pc;
-            polys.msm_size_of_msm[i] = msm_state[i].msm_size;
-            polys.msm_count[i] = msm_state[i].msm_count;
-            polys.msm_round[i] = msm_state[i].msm_round;
-            polys.msm_add1[i] = static_cast<int>(msm_state[i].add_state[0].add);
-            polys.msm_add2[i] = static_cast<int>(msm_state[i].add_state[1].add);
-            polys.msm_add3[i] = static_cast<int>(msm_state[i].add_state[2].add);
-            polys.msm_add4[i] = static_cast<int>(msm_state[i].add_state[3].add);
-            polys.msm_x1[i] = msm_state[i].add_state[0].point.x;
-            polys.msm_y1[i] = msm_state[i].add_state[0].point.y;
-            polys.msm_x2[i] = msm_state[i].add_state[1].point.x;
-            polys.msm_y2[i] = msm_state[i].add_state[1].point.y;
-            polys.msm_x3[i] = msm_state[i].add_state[2].point.x;
-            polys.msm_y3[i] = msm_state[i].add_state[2].point.y;
-            polys.msm_x4[i] = msm_state[i].add_state[3].point.x;
-            polys.msm_y4[i] = msm_state[i].add_state[3].point.y;
-            polys.msm_collision_x1[i] = msm_state[i].add_state[0].collision_inverse;
-            polys.msm_collision_x2[i] = msm_state[i].add_state[1].collision_inverse;
-            polys.msm_collision_x3[i] = msm_state[i].add_state[2].collision_inverse;
-            polys.msm_collision_x4[i] = msm_state[i].add_state[3].collision_inverse;
-            polys.msm_lambda1[i] = msm_state[i].add_state[0].lambda;
-            polys.msm_lambda2[i] = msm_state[i].add_state[1].lambda;
-            polys.msm_lambda3[i] = msm_state[i].add_state[2].lambda;
-            polys.msm_lambda4[i] = msm_state[i].add_state[3].lambda;
-            polys.msm_slice1[i] = msm_state[i].add_state[0].slice;
-            polys.msm_slice2[i] = msm_state[i].add_state[1].slice;
-            polys.msm_slice3[i] = msm_state[i].add_state[2].slice;
-            polys.msm_slice4[i] = msm_state[i].add_state[3].slice;
-        }
-
-        polys.transcript_mul_shift = Polynomial(polys.transcript_mul.shifted());
-        polys.transcript_msm_count_shift = Polynomial(polys.transcript_msm_count.shifted());
-        polys.transcript_accumulator_x_shift = Polynomial(polys.transcript_accumulator_x.shifted());
-        polys.transcript_accumulator_y_shift = Polynomial(polys.transcript_accumulator_y.shifted());
-        polys.precompute_scalar_sum_shift = Polynomial(polys.precompute_scalar_sum.shifted());
-        polys.precompute_s1hi_shift = Polynomial(polys.precompute_s1hi.shifted());
-        polys.precompute_dx_shift = Polynomial(polys.precompute_dx.shifted());
-        polys.precompute_dy_shift = Polynomial(polys.precompute_dy.shifted());
-        polys.precompute_tx_shift = Polynomial(polys.precompute_tx.shifted());
-        polys.precompute_ty_shift = Polynomial(polys.precompute_ty.shifted());
-        polys.msm_transition_shift = Polynomial(polys.msm_transition.shifted());
-        polys.msm_add_shift = Polynomial(polys.msm_add.shifted());
-        polys.msm_double_shift = Polynomial(polys.msm_double.shifted());
-        polys.msm_skew_shift = Polynomial(polys.msm_skew.shifted());
-        polys.msm_accumulator_x_shift = Polynomial(polys.msm_accumulator_x.shifted());
-        polys.msm_accumulator_y_shift = Polynomial(polys.msm_accumulator_y.shifted());
-        polys.msm_count_shift = Polynomial(polys.msm_count.shifted());
-        polys.msm_round_shift = Polynomial(polys.msm_round.shifted());
-        polys.msm_add1_shift = Polynomial(polys.msm_add1.shifted());
-        polys.msm_pc_shift = Polynomial(polys.msm_pc.shifted());
-        polys.precompute_pc_shift = Polynomial(polys.precompute_pc.shifted());
-        polys.transcript_pc_shift = Polynomial(polys.transcript_pc.shifted());
-        polys.precompute_round_shift = Polynomial(polys.precompute_round.shifted());
-        polys.transcript_accumulator_empty_shift = Polynomial(polys.transcript_accumulator_empty.shifted());
-        polys.precompute_select_shift = Polynomial(polys.precompute_select.shifted());
+        run_loop_in_parallel(precompute_table_state.size(), [&](size_t start, size_t end) {
+            for (size_t i = start; i < end; i++) {
+                // first row is always an empty row (to accommodate shifted polynomials which must have 0 as 1st
+                // coefficient). All other rows in the precompute_table_state represent active wnaf gates (i.e.
+                // precompute_select = 1)
+                polys.precompute_select[i] = (i != 0) ? 1 : 0;
+                polys.precompute_pc[i] = precompute_table_state[i].pc;
+                polys.precompute_point_transition[i] =
+                    static_cast<uint64_t>(precompute_table_state[i].point_transition);
+                polys.precompute_round[i] = precompute_table_state[i].round;
+                polys.precompute_scalar_sum[i] = precompute_table_state[i].scalar_sum;
+
+                polys.precompute_s1hi[i] = precompute_table_state[i].s1;
+                polys.precompute_s1lo[i] = precompute_table_state[i].s2;
+                polys.precompute_s2hi[i] = precompute_table_state[i].s3;
+                polys.precompute_s2lo[i] = precompute_table_state[i].s4;
+                polys.precompute_s3hi[i] = precompute_table_state[i].s5;
+                polys.precompute_s3lo[i] = precompute_table_state[i].s6;
+                polys.precompute_s4hi[i] = precompute_table_state[i].s7;
+                polys.precompute_s4lo[i] = precompute_table_state[i].s8;
+                // If skew is active (i.e. we need to subtract a base point from the msm result),
+                // write `7` into rows.precompute_skew. `7`, in binary representation, equals `-1` when converted into
+                // WNAF form
+                polys.precompute_skew[i] = precompute_table_state[i].skew ? 7 : 0;
+
+                polys.precompute_dx[i] = precompute_table_state[i].precompute_double.x;
+                polys.precompute_dy[i] = precompute_table_state[i].precompute_double.y;
+                polys.precompute_tx[i] = precompute_table_state[i].precompute_accumulator.x;
+                polys.precompute_ty[i] = precompute_table_state[i].precompute_accumulator.y;
+            }
+        });
+
+        run_loop_in_parallel(msm_state.size(), [&](size_t start, size_t end) {
+            for (size_t i = start; i < end; i++) {
+                polys.msm_transition[i] = static_cast<int>(msm_state[i].msm_transition);
+                polys.msm_add[i] = static_cast<int>(msm_state[i].q_add);
+                polys.msm_double[i] = static_cast<int>(msm_state[i].q_double);
+                polys.msm_skew[i] = static_cast<int>(msm_state[i].q_skew);
+                polys.msm_accumulator_x[i] = msm_state[i].accumulator_x;
+                polys.msm_accumulator_y[i] = msm_state[i].accumulator_y;
+                polys.msm_pc[i] = msm_state[i].pc;
+                polys.msm_size_of_msm[i] = msm_state[i].msm_size;
+                polys.msm_count[i] = msm_state[i].msm_count;
+                polys.msm_round[i] = msm_state[i].msm_round;
+                polys.msm_add1[i] = static_cast<int>(msm_state[i].add_state[0].add);
+                polys.msm_add2[i] = static_cast<int>(msm_state[i].add_state[1].add);
+                polys.msm_add3[i] = static_cast<int>(msm_state[i].add_state[2].add);
+                polys.msm_add4[i] = static_cast<int>(msm_state[i].add_state[3].add);
+                polys.msm_x1[i] = msm_state[i].add_state[0].point.x;
+                polys.msm_y1[i] = msm_state[i].add_state[0].point.y;
+                polys.msm_x2[i] = msm_state[i].add_state[1].point.x;
+                polys.msm_y2[i] = msm_state[i].add_state[1].point.y;
+                polys.msm_x3[i] = msm_state[i].add_state[2].point.x;
+                polys.msm_y3[i] = msm_state[i].add_state[2].point.y;
+                polys.msm_x4[i] = msm_state[i].add_state[3].point.x;
+                polys.msm_y4[i] = msm_state[i].add_state[3].point.y;
+                polys.msm_collision_x1[i] = msm_state[i].add_state[0].collision_inverse;
+                polys.msm_collision_x2[i] = msm_state[i].add_state[1].collision_inverse;
+                polys.msm_collision_x3[i] = msm_state[i].add_state[2].collision_inverse;
+                polys.msm_collision_x4[i] = msm_state[i].add_state[3].collision_inverse;
+                polys.msm_lambda1[i] = msm_state[i].add_state[0].lambda;
+                polys.msm_lambda2[i] = msm_state[i].add_state[1].lambda;
+                polys.msm_lambda3[i] = msm_state[i].add_state[2].lambda;
+                polys.msm_lambda4[i] = msm_state[i].add_state[3].lambda;
+                polys.msm_slice1[i] = msm_state[i].add_state[0].slice;
+                polys.msm_slice2[i] = msm_state[i].add_state[1].slice;
+                polys.msm_slice3[i] = msm_state[i].add_state[2].slice;
+                polys.msm_slice4[i] = msm_state[i].add_state[3].slice;
+            }
+        });
+        polys.transcript_mul_shift = polys.transcript_mul.shifted();
+        polys.transcript_msm_count_shift = polys.transcript_msm_count.shifted();
+        polys.transcript_accumulator_x_shift = polys.transcript_accumulator_x.shifted();
+        polys.transcript_accumulator_y_shift = polys.transcript_accumulator_y.shifted();
+        polys.precompute_scalar_sum_shift = polys.precompute_scalar_sum.shifted();
+        polys.precompute_s1hi_shift = polys.precompute_s1hi.shifted();
+        polys.precompute_dx_shift = polys.precompute_dx.shifted();
+        polys.precompute_dy_shift = polys.precompute_dy.shifted();
+        polys.precompute_tx_shift = polys.precompute_tx.shifted();
+        polys.precompute_ty_shift = polys.precompute_ty.shifted();
+        polys.msm_transition_shift = polys.msm_transition.shifted();
+        polys.msm_add_shift = polys.msm_add.shifted();
+        polys.msm_double_shift = polys.msm_double.shifted();
+        polys.msm_skew_shift = polys.msm_skew.shifted();
+        polys.msm_accumulator_x_shift = polys.msm_accumulator_x.shifted();
+        polys.msm_accumulator_y_shift = polys.msm_accumulator_y.shifted();
+        polys.msm_count_shift = polys.msm_count.shifted();
+        polys.msm_round_shift = polys.msm_round.shifted();
+        polys.msm_add1_shift = polys.msm_add1.shifted();
+        polys.msm_pc_shift = polys.msm_pc.shifted();
+        polys.precompute_pc_shift = polys.precompute_pc.shifted();
+        polys.transcript_pc_shift = polys.transcript_pc.shifted();
+        polys.precompute_round_shift = polys.precompute_round.shifted();
+        polys.transcript_accumulator_empty_shift = polys.transcript_accumulator_empty.shifted();
+        polys.precompute_select_shift = polys.precompute_select.shifted();
         return polys;
     }
 
@@ -497,25 +498,8 @@ template <typename Flavor> class ECCVMCircuitBuilder {
 
     [[nodiscard]] size_t get_num_gates() const
     {
-        // TODO(@zac-williamson) once we have a stable base to work off of, optimize this method!
-        // (issue #2218)
-        const auto msms = get_msms();
-        const auto flattened_muls = get_flattened_scalar_muls(msms);
-
-        std::array<std::vector<size_t>, 2> point_table_read_counts;
-        const auto transcript_state =
-            ECCVMTranscriptBuilder<Flavor>::compute_transcript_state(op_queue->raw_ops, get_number_of_muls());
-        const auto precompute_table_state =
-            ECCVMPrecomputedTablesBuilder<Flavor>::compute_precompute_state(flattened_muls);
-        const auto msm_state =
-            ECCVMMSMMBuilder<Flavor>::compute_msm_state(msms, point_table_read_counts, get_number_of_muls());
-
-        const size_t msm_size = msm_state.size();
-        const size_t transcript_size = transcript_state.size();
-        const size_t precompute_table_size = precompute_table_state.size();
-
-        const size_t num_rows = std::max(precompute_table_size, std::max(msm_size, transcript_size));
-        return num_rows;
+        // Row count comes straight from the op queue; no transcript/precompute/MSM state is built here.
+        return op_queue->get_num_rows();
     }
 
     [[nodiscard]] size_t get_circuit_subgroup_size(const size_t num_rows) const
diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/msm_builder.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/msm_builder.hpp
index 6f0c45e3744..5630ca357e0 100644
--- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/msm_builder.hpp
+++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/msm_builder.hpp
@@ -3,6 +3,7 @@
 #include <cstddef>
 
 #include "./eccvm_builder_types.hpp"
+#include "barretenberg/proof_system/op_queue/ecc_op_queue.hpp"
 
 namespace bb {
 
@@ -17,7 +18,7 @@ template <typename Flavor> class ECCVMMSMMBuilder {
     static constexpr size_t NUM_SCALAR_BITS = bb::eccvm::NUM_SCALAR_BITS;
     static constexpr size_t WNAF_SLICE_BITS = bb::eccvm::WNAF_SLICE_BITS;
 
-    struct MSMState {
+    struct alignas(64) MSMState {
         uint32_t pc = 0;
         uint32_t msm_size = 0;
         uint32_t msm_count = 0;
@@ -42,6 +43,21 @@ template <typename Flavor> class ECCVMMSMMBuilder {
         FF accumulator_y = 0;
     };
 
+    struct alignas(64) MSMRowTranscript { // NOTE(review): no member has an initializer; confirm all are written first
+        std::array<FF, 4> lambda_numerator; // 4 slots; presumably one per add_state[0..3] entry - TODO confirm
+        std::array<FF, 4> lambda_denominator; // presumably stored un-inverted so inversions can be batched - confirm
+        Element accumulator_in;
+        Element accumulator_out;
+    };
+
+    struct alignas(64) AdditionTrace { // NOTE(review): presumably records p3 = p1 + p2 (or a doubling) - TODO confirm
+        Element p1;
+        Element p2;
+        Element p3;
+        bool predicate = false; // initialized so a default-constructed trace never holds an indeterminate flag
+        bool is_double = false; // same reason; MSMState likewise gives its scalar fields explicit defaults
+    };
+
     /**
      * @brief Computes the row values for the Straus MSM columns of the ECCVM.
      *
@@ -55,7 +71,8 @@ template <typename Flavor> class ECCVMMSMMBuilder {
      */
     static std::vector<MSMState> compute_msm_state(const std::vector<bb::eccvm::MSM<CycleGroup>>& msms,
                                                    std::array<std::vector<size_t>, 2>& point_table_read_counts,
-                                                   const uint32_t total_number_of_muls)
+                                                   const uint32_t total_number_of_muls,
+                                                   const size_t num_msm_rows)
     {
         // N.B. the following comments refer to a "point lookup table" frequently.
         // To perform a scalar multiplicaiton of a point [P] by a scalar x, we compute multiples of [P] and store in a
@@ -100,171 +117,354 @@ template <typename Flavor> class ECCVMMSMMBuilder {
                 point_table_read_counts[column_index][pc_offset + 15 - static_cast<size_t>(slice_row)]++;
             }
         };
-        std::vector<MSMState> msm_state;
-        // start with empty row (shiftable polynomials must have 0 as first coefficient)
-        msm_state.emplace_back(MSMState{});
-        uint32_t pc = total_number_of_muls;
-        AffineElement accumulator = CycleGroup::affine_point_at_infinity;
 
+        // compute which row index each multiscalar multiplication will start at.
+        // also compute the program counter index that each multiscalar multiplication will start at.
+        // we use this information to populate the MSM row data across multiple threads
+        std::vector<size_t> msm_row_indices;
+        std::vector<size_t> pc_indices;
+        msm_row_indices.reserve(msms.size() + 1);
+        pc_indices.reserve(msms.size() + 1);
+
+        msm_row_indices.push_back(1);
+        pc_indices.push_back(total_number_of_muls);
         for (const auto& msm : msms) {
-            const size_t msm_size = msm.size();
-
-            const size_t rows_per_round = (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0);
-            static constexpr size_t num_rounds = NUM_SCALAR_BITS / WNAF_SLICE_BITS;
-
-            const auto add_points = [](auto& P1, auto& P2, auto& lambda, auto& collision_inverse, bool predicate) {
-                lambda = predicate ? (P2.y - P1.y) / (P2.x - P1.x) : 0;
-                collision_inverse = predicate ? (P2.x - P1.x).invert() : 0;
-                auto x3 = predicate ? lambda * lambda - (P2.x + P1.x) : P1.x;
-                auto y3 = predicate ? lambda * (P1.x - x3) - P1.y : P1.y;
-                return AffineElement(x3, y3);
-            };
+            const size_t rows = ECCOpQueue::get_msm_row_count_for_single_msm(msm.size());
+            msm_row_indices.push_back(msm_row_indices.back() + rows);
+            pc_indices.push_back(pc_indices.back() - msm.size());
+        }
+
+        static constexpr size_t num_rounds = NUM_SCALAR_BITS / WNAF_SLICE_BITS;
+        std::vector<MSMState> msm_state(num_msm_rows);
+        // start with empty row (shiftable polynomials must have 0 as first coefficient)
+        msm_state[0] = (MSMState{});
+
+        // compute "read counts" so that we can determine the number of times entries in our log-derivative lookup
+        // tables are called.
+        // Note: this part is single-threaded. The amount of compute is low, however, so this is likely not a big
+        // concern.
+        for (size_t i = 0; i < msms.size(); ++i) {
+
             for (size_t j = 0; j < num_rounds; ++j) {
+                uint32_t pc = static_cast<uint32_t>(pc_indices[i]);
+                const auto& msm = msms[i];
+                const size_t msm_size = msm.size();
+                const size_t rows_per_round =
+                    (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0);
+
                 for (size_t k = 0; k < rows_per_round; ++k) {
-                    MSMState row;
                     const size_t points_per_row =
                         (k + 1) * ADDITIONS_PER_ROW > msm_size ? msm_size % ADDITIONS_PER_ROW : ADDITIONS_PER_ROW;
                     const size_t idx = k * ADDITIONS_PER_ROW;
-                    row.msm_transition = (j == 0) && (k == 0);
-
-                    AffineElement acc(accumulator);
-                    Element acc_expected = accumulator;
                     for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
-                        auto& add_state = row.add_state[m];
-                        add_state.add = points_per_row > m;
-                        int slice = add_state.add ? msm[idx + m].wnaf_slices[j] : 0;
-                        // In the MSM columns in the ECCVM circuit, we can add up to 4 points per row.
-                        // if `row.add_state[m].add = 1`, this indicates that we want to add the `m`'th point in the MSM
-                        // columns into the MSM accumulator
-                        // `add_state.slice` = A 4-bit WNAF slice of the scalar multiplier associated with the point we
-                        // are adding (the specific slice chosen depends on the value of msm_round) (WNAF =
-                        // windowed-non-adjacent-form. Value range is `-15, -13, ..., 15`) If `add_state.add = 1`, we
-                        // want `add_state.slice` to be the *compressed* form of the WNAF slice value. (compressed = no
-                        // gaps in the value range. i.e. -15, -13, ..., 15 maps to 0, ... , 15)
-                        add_state.slice = add_state.add ? (slice + 15) / 2 : 0;
-                        add_state.point = add_state.add
-                                              ? msm[idx + m].precomputed_table[static_cast<size_t>(add_state.slice)]
-                                              : AffineElement{ 0, 0 };
-                        // predicate logic:
-                        // add_predicate should normally equal add_state.add
-                        // However! if j == 0 AND k == 0 AND m == 0 this implies we are examing the 1st point addition
-                        // of a new MSM In this case, we do NOT add the 1st point into the accumulator, instead we SET
-                        // the accumulator to equal the 1st point. add_predicate is used to determine whether we add the
-                        // output of a point addition into the accumulator, therefore if j == 0 AND k == 0 AND m == 0,
-                        // add_predicate = 0 even if add_state.add = true
-                        bool add_predicate = (m == 0 ? (j != 0 || k != 0) : add_state.add);
-
-                        auto& p1 = (m == 0) ? add_state.point : acc;
-                        auto& p2 = (m == 0) ? acc : add_state.point;
-
-                        acc_expected = add_predicate ? (acc_expected + add_state.point) : Element(p1);
-                        if (add_state.add) {
+                        bool add = points_per_row > m;
+                        if (add) {
+                            int slice = add ? msm[idx + m].wnaf_slices[j] : 0;
                             update_read_counts(pc - idx - m, slice);
                         }
-                        acc = add_points(p1, p2, add_state.lambda, add_state.collision_inverse, add_predicate);
-                        ASSERT(acc == AffineElement(acc_expected));
                     }
-                    row.q_add = true;
-                    row.q_double = false;
-                    row.q_skew = false;
-                    row.msm_round = static_cast<uint32_t>(j);
-                    row.msm_size = static_cast<uint32_t>(msm_size);
-                    row.msm_count = static_cast<uint32_t>(idx);
-                    row.accumulator_x = accumulator.is_point_at_infinity() ? 0 : accumulator.x;
-                    row.accumulator_y = accumulator.is_point_at_infinity() ? 0 : accumulator.y;
-                    row.pc = pc;
-                    accumulator = acc;
-                    msm_state.push_back(row);
                 }
-                if (j < num_rounds - 1) {
-                    MSMState row;
-                    row.msm_transition = false;
-                    row.msm_round = static_cast<uint32_t>(j + 1);
-                    row.msm_size = static_cast<uint32_t>(msm_size);
-                    row.msm_count = static_cast<uint32_t>(0);
-                    row.q_add = false;
-                    row.q_double = true;
-                    row.q_skew = false;
-
-                    auto dx = accumulator.x;
-                    auto dy = accumulator.y;
-                    for (size_t m = 0; m < 4; ++m) {
-                        auto& add_state = row.add_state[m];
-                        add_state.add = false;
-                        add_state.slice = 0;
-                        add_state.point = { 0, 0 };
-                        add_state.collision_inverse = 0;
-                        add_state.lambda = ((dx + dx + dx) * dx) / (dy + dy);
-                        auto x3 = add_state.lambda.sqr() - dx - dx;
-                        dy = add_state.lambda * (dx - x3) - dy;
-                        dx = x3;
-                    }
 
-                    row.accumulator_x = accumulator.is_point_at_infinity() ? 0 : accumulator.x;
-                    row.accumulator_y = accumulator.is_point_at_infinity() ? 0 : accumulator.y;
-                    accumulator = Element(accumulator).dbl().dbl().dbl().dbl();
-                    row.pc = pc;
-                    msm_state.push_back(row);
-                } else {
+                if (j == num_rounds - 1) {
                     for (size_t k = 0; k < rows_per_round; ++k) {
-                        MSMState row;
-
                         const size_t points_per_row =
                             (k + 1) * ADDITIONS_PER_ROW > msm_size ? msm_size % ADDITIONS_PER_ROW : ADDITIONS_PER_ROW;
                         const size_t idx = k * ADDITIONS_PER_ROW;
-                        row.msm_transition = false;
+                        for (size_t m = 0; m < 4; ++m) {
+                            bool add = points_per_row > m;
+
+                            if (add) {
+                                update_read_counts(pc - idx - m, msm[idx + m].wnaf_skew ? -1 : -15);
+                            }
+                        }
+                    }
+                }
+            }
+        }
 
-                        AffineElement acc(accumulator);
-                        Element acc_expected = accumulator;
+        // The execution trace data for the MSM columns requires knowledge of intermediate values from *affine* point
+        // addition. The naive solution to compute this data requires 2 field inversions per in-circuit group addition
+        // evaluation. This is bad! To avoid this, we split the witness computation algorithm into 3 steps.
+        // Step 1: compute the execution trace group operations in *projective* coordinates.
+        // Step 2: use the batch inversion trick to convert all point traces into affine coordinates.
+        // Step 3: populate the full execution trace, including the intermediate values from affine group operations.
+        // This section sets up the data structures we need to store all intermediate ECC operations in projective form.
+        const size_t num_point_adds_and_doubles = (num_msm_rows - 2) * 4;
+        const size_t num_accumulators = num_msm_rows - 1;
+        const size_t num_points_in_trace = (num_point_adds_and_doubles * 3) + num_accumulators;
+        // We create 1 vector to store the entire point trace. We split into multiple containers using std::span
+        // (we want 1 vector object to more efficiently batch normalize points)
+        std::vector<Element> point_trace(num_points_in_trace);
+        // the point traces record group operations. Either p1 + p2 = p3, or p1.dbl() = p3
+        std::span<Element> p1_trace(&point_trace[0], num_point_adds_and_doubles);
+        std::span<Element> p2_trace(&point_trace[num_point_adds_and_doubles], num_point_adds_and_doubles);
+        std::span<Element> p3_trace(&point_trace[num_point_adds_and_doubles * 2], num_point_adds_and_doubles);
+        // operation_trace records whether an entry in the p1/p2/p3 trace represents a point addition or doubling
+        std::vector<bool> operation_trace(num_point_adds_and_doubles);
+        // accumulator_trace tracks the value of the ECCVM accumulator for each row
+        std::span<Element> accumulator_trace(&point_trace[num_point_adds_and_doubles * 3], num_accumulators);
+
+        // we start the accumulator at the point at infinity
+        accumulator_trace[0] = (CycleGroup::affine_point_at_infinity);
+
+        // populate point trace data, and the components of the MSM execution trace that do not relate to affine point
+        // operations
+        run_loop_in_parallel(msms.size(), [&](size_t start, size_t end) {
+            for (size_t i = start; i < end; i++) {
+                Element accumulator = CycleGroup::affine_point_at_infinity;
+                const auto& msm = msms[i];
+                size_t msm_row_index = msm_row_indices[i];
+                const size_t msm_size = msm.size();
+                const size_t rows_per_round =
+                    (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0);
+                size_t trace_index = (msm_row_indices[i] - 1) * 4;
+
+                for (size_t j = 0; j < num_rounds; ++j) {
+                    const uint32_t pc = static_cast<uint32_t>(pc_indices[i]);
+
+                    for (size_t k = 0; k < rows_per_round; ++k) {
+                        const size_t points_per_row =
+                            (k + 1) * ADDITIONS_PER_ROW > msm_size ? msm_size % ADDITIONS_PER_ROW : ADDITIONS_PER_ROW;
+                        auto& row = msm_state[msm_row_index];
+                        const size_t idx = k * ADDITIONS_PER_ROW;
+                        row.msm_transition = (j == 0) && (k == 0);
+                        for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
 
-                        for (size_t m = 0; m < 4; ++m) {
                             auto& add_state = row.add_state[m];
                             add_state.add = points_per_row > m;
-                            add_state.slice = add_state.add ? msm[idx + m].wnaf_skew ? 7 : 0 : 0;
-
+                            int slice = add_state.add ? msm[idx + m].wnaf_slices[j] : 0;
+                            // In the MSM columns in the ECCVM circuit, we can add up to 4 points per row.
+                            // If `row.add_state[m].add = 1`, this indicates that we want to add the `m`'th point in
+                            // the MSM columns into the MSM accumulator.
+                            // `add_state.slice` = a 4-bit WNAF slice of the scalar multiplier associated with the
+                            // point we are adding (the specific slice chosen depends on the value of msm_round).
+                            // (WNAF = windowed-non-adjacent-form. Value range is `-15, -13, ..., 15`.)
+                            // If `add_state.add = 1`, we want `add_state.slice` to be the *compressed* form of the
+                            // WNAF slice value (compressed = no gaps in the value range,
+                            // i.e. -15, -13, ..., 15 maps to 0, ..., 15).
+                            add_state.slice = add_state.add ? (slice + 15) / 2 : 0;
                             add_state.point = add_state.add
                                                   ? msm[idx + m].precomputed_table[static_cast<size_t>(add_state.slice)]
                                                   : AffineElement{ 0, 0 };
-                            bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false;
-                            if (add_state.add) {
-                                update_read_counts(pc - idx - m, msm[idx + m].wnaf_skew ? -1 : -15);
-                            }
-                            acc = add_points(
-                                acc, add_state.point, add_state.lambda, add_state.collision_inverse, add_predicate);
-                            acc_expected = add_predicate ? (acc_expected + add_state.point) : acc_expected;
-                            ASSERT(acc == AffineElement(acc_expected));
+
+                            // predicate logic:
+                            // add_predicate should normally equal add_state.add.
+                            // However! If j == 0 AND k == 0 AND m == 0, this implies we are examining the 1st point
+                            // addition of a new MSM. In this case, we do NOT add the 1st point into the accumulator,
+                            // instead we SET the accumulator to equal the 1st point. add_predicate is used to
+                            // determine whether we add the output of a point addition into the accumulator,
+                            // therefore if j == 0 AND k == 0 AND m == 0, add_predicate = 0 even if add_state.add =
+                            // true
+                            bool add_predicate = (m == 0 ? (j != 0 || k != 0) : add_state.add);
+
+                            Element p1 = (m == 0) ? Element(add_state.point) : accumulator;
+                            Element p2 = (m == 0) ? accumulator : Element(add_state.point);
+
+                            accumulator = add_predicate ? (accumulator + add_state.point) : Element(p1);
+                            p1_trace[trace_index] = p1;
+                            p2_trace[trace_index] = p2;
+                            p3_trace[trace_index] = accumulator;
+                            operation_trace[trace_index] = false;
+                            trace_index++;
                         }
-                        row.q_add = false;
+                        accumulator_trace[msm_row_index] = accumulator;
+                        row.q_add = true;
                         row.q_double = false;
-                        row.q_skew = true;
-                        row.msm_round = static_cast<uint32_t>(j + 1);
+                        row.q_skew = false;
+                        row.msm_round = static_cast<uint32_t>(j);
                         row.msm_size = static_cast<uint32_t>(msm_size);
                         row.msm_count = static_cast<uint32_t>(idx);
+                        row.pc = pc;
+                        msm_row_index++;
+                    }
+                    // doubling
+                    if (j < num_rounds - 1) {
+                        auto& row = msm_state[msm_row_index];
+                        row.msm_transition = false;
+                        row.msm_round = static_cast<uint32_t>(j + 1);
+                        row.msm_size = static_cast<uint32_t>(msm_size);
+                        row.msm_count = static_cast<uint32_t>(0);
+                        row.q_add = false;
+                        row.q_double = true;
+                        row.q_skew = false;
+                        for (size_t m = 0; m < 4; ++m) {
 
-                        row.accumulator_x = accumulator.is_point_at_infinity() ? 0 : accumulator.x;
-                        row.accumulator_y = accumulator.is_point_at_infinity() ? 0 : accumulator.y;
+                            auto& add_state = row.add_state[m];
+                            add_state.add = false;
+                            add_state.slice = 0;
+                            add_state.point = { 0, 0 };
+                            add_state.collision_inverse = 0;
 
-                        row.pc = pc;
-                        accumulator = acc;
-                        msm_state.emplace_back(row);
+                            p1_trace[trace_index] = accumulator;
+                            p2_trace[trace_index] = accumulator;
+                            accumulator = accumulator.dbl();
+                            p3_trace[trace_index] = accumulator;
+                            operation_trace[trace_index] = true;
+                            trace_index++;
+                        }
+                        accumulator_trace[msm_row_index] = accumulator;
+                        msm_row_index++;
+                    } else {
+                        for (size_t k = 0; k < rows_per_round; ++k) {
+                            auto& row = msm_state[msm_row_index];
+
+                            const size_t points_per_row = (k + 1) * ADDITIONS_PER_ROW > msm_size
+                                                              ? msm_size % ADDITIONS_PER_ROW
+                                                              : ADDITIONS_PER_ROW;
+                            const size_t idx = k * ADDITIONS_PER_ROW;
+                            row.msm_transition = false;
+
+                            Element acc_expected = accumulator;
+
+                            for (size_t m = 0; m < 4; ++m) {
+                                auto& add_state = row.add_state[m];
+                                add_state.add = points_per_row > m;
+                                add_state.slice = add_state.add ? msm[idx + m].wnaf_skew ? 7 : 0 : 0;
+
+                                add_state.point =
+                                    add_state.add ? msm[idx + m].precomputed_table[static_cast<size_t>(add_state.slice)]
+                                                  : AffineElement{ 0, 0 };
+                                bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false;
+                                auto p1 = accumulator;
+                                accumulator = add_predicate ? accumulator + add_state.point : accumulator;
+                                p1_trace[trace_index] = p1;
+                                p2_trace[trace_index] = add_state.point;
+                                p3_trace[trace_index] = accumulator;
+                                operation_trace[trace_index] = false;
+                                trace_index++;
+                            }
+                            row.q_add = false;
+                            row.q_double = false;
+                            row.q_skew = true;
+                            row.msm_round = static_cast<uint32_t>(j + 1);
+                            row.msm_size = static_cast<uint32_t>(msm_size);
+                            row.msm_count = static_cast<uint32_t>(idx);
+                            row.pc = pc;
+                            accumulator_trace[msm_row_index] = accumulator;
+                            msm_row_index++;
+                        }
                     }
                 }
             }
-            pc -= static_cast<uint32_t>(msm_size);
-            // Validate our computed accumulator matches the real MSM result!
-            Element expected = CycleGroup::point_at_infinity;
-            for (size_t i = 0; i < msm.size(); ++i) {
-                expected += (Element(msm[i].base_point) * msm[i].scalar);
+        });
+
+        // Normalize the points in the point trace
+        run_loop_in_parallel(point_trace.size(), [&](size_t start, size_t end) {
+            Element::batch_normalize(&point_trace[start], end - start);
+        });
+
+        // inverse_trace is used to compute the value of the `collision_inverse` column in the ECCVM.
+        std::vector<FF> inverse_trace(num_point_adds_and_doubles);
+        run_loop_in_parallel(num_point_adds_and_doubles, [&](size_t start, size_t end) {
+            for (size_t i = start; i < end; ++i) {
+                if (operation_trace[i]) {
+                    inverse_trace[i] = (p1_trace[i].y + p1_trace[i].y);
+                } else {
+                    inverse_trace[i] = (p2_trace[i].x - p1_trace[i].x);
+                }
             }
-            // Validate the accumulator is correct!
-            ASSERT(accumulator == AffineElement(expected));
-        }
+            FF::batch_invert(&inverse_trace[start], end - start);
+        });
+
+        // complete the computation of the ECCVM execution trace, by adding the affine intermediate point data
+        // i.e. row.accumulator_x, row.accumulator_y, row.add_state[0...3].collision_inverse,
+        // row.add_state[0...3].lambda
+        run_loop_in_parallel(msms.size(), [&](size_t start, size_t end) {
+            for (size_t i = start; i < end; i++) {
+                const auto& msm = msms[i];
+                size_t trace_index = ((msm_row_indices[i] - 1) * ADDITIONS_PER_ROW);
+                size_t msm_row_index = msm_row_indices[i];
+                // 1st MSM row will have accumulator equal to the previous MSM output
+                // (or point at infinity for 1st MSM)
+                size_t accumulator_index = msm_row_indices[i] - 1;
+                const size_t msm_size = msm.size();
+                const size_t rows_per_round =
+                    (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0);
+
+                for (size_t j = 0; j < num_rounds; ++j) {
+                    for (size_t k = 0; k < rows_per_round; ++k) {
+                        auto& row = msm_state[msm_row_index];
+                        const Element& normalized_accumulator = accumulator_trace[accumulator_index];
+                        const FF& acc_x = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
+                        const FF& acc_y = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
+                        row.accumulator_x = acc_x;
+                        row.accumulator_y = acc_y;
+
+                        for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
+                            auto& add_state = row.add_state[m];
+                            bool add_predicate = (m == 0 ? (j != 0 || k != 0) : add_state.add);
+
+                            const auto& inverse = inverse_trace[trace_index];
+                            const auto& p1 = p1_trace[trace_index];
+                            const auto& p2 = p2_trace[trace_index];
+                            add_state.collision_inverse = add_predicate ? inverse : 0;
+                            add_state.lambda = add_predicate ? (p2.y - p1.y) * inverse : 0;
+                            trace_index++;
+                        }
+                        accumulator_index++;
+                        msm_row_index++;
+                    }
+
+                    if (j < num_rounds - 1) {
+                        MSMState& row = msm_state[msm_row_index];
+                        const Element& normalized_accumulator = accumulator_trace[accumulator_index];
+                        const FF& acc_x = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
+                        const FF& acc_y = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
+                        row.accumulator_x = acc_x;
+                        row.accumulator_y = acc_y;
+
+                        for (size_t m = 0; m < 4; ++m) {
+                            auto& add_state = row.add_state[m];
+                            add_state.collision_inverse = 0;
+                            const FF& dx = p1_trace[trace_index].x;
+                            const FF& inverse = inverse_trace[trace_index];
+                            add_state.lambda = ((dx + dx + dx) * dx) * inverse;
+                            trace_index++;
+                        }
+                        accumulator_index++;
+                        msm_row_index++;
+                    } else {
+                        for (size_t k = 0; k < rows_per_round; ++k) {
+                            MSMState& row = msm_state[msm_row_index];
+                            const Element& normalized_accumulator = accumulator_trace[accumulator_index];
+
+                            const size_t idx = k * ADDITIONS_PER_ROW;
+
+                            const FF& acc_x =
+                                normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
+                            const FF& acc_y =
+                                normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
+                            row.accumulator_x = acc_x;
+                            row.accumulator_y = acc_y;
+
+                            for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
+                                auto& add_state = row.add_state[m];
+                                bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false;
+
+                                const auto& inverse = inverse_trace[trace_index];
+                                const auto& p1 = p1_trace[trace_index];
+                                const auto& p2 = p2_trace[trace_index];
+                                add_state.collision_inverse = add_predicate ? inverse : 0;
+                                add_state.lambda = add_predicate ? (p2.y - p1.y) * inverse : 0;
+                                trace_index++;
+                            }
+                            accumulator_index++;
+                            msm_row_index++;
+                        }
+                    }
+                }
+            }
+        });
 
-        MSMState final_row;
-        final_row.pc = pc;
+        // populate the final row in the MSM execution trace.
+        // we always require 1 extra row at the end of the trace, because the accumulator x/y coordinates for row `i`
+        // are present at row `i+1`
+        Element final_accumulator(accumulator_trace.back());
+        MSMState& final_row = msm_state.back();
+        final_row.pc = static_cast<uint32_t>(pc_indices.back());
         final_row.msm_transition = true;
-        final_row.accumulator_x = accumulator.is_point_at_infinity() ? 0 : accumulator.x;
-        final_row.accumulator_y = accumulator.is_point_at_infinity() ? 0 : accumulator.y;
+        final_row.accumulator_x = final_accumulator.is_point_at_infinity() ? 0 : final_accumulator.x;
+        final_row.accumulator_y = final_accumulator.is_point_at_infinity() ? 0 : final_accumulator.y;
         final_row.msm_size = 0;
         final_row.msm_count = 0;
         final_row.q_add = false;
@@ -275,7 +475,6 @@ template <typename Flavor> class ECCVMMSMMBuilder {
                                 typename MSMState::AddState{ false, 0, AffineElement{ 0, 0 }, 0, 0 },
                                 typename MSMState::AddState{ false, 0, AffineElement{ 0, 0 }, 0, 0 } };
 
-        msm_state.emplace_back(final_row);
         return msm_state;
     }
 };
diff --git a/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp b/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp
index 3443fbdd2e0..ee9430eedff 100644
--- a/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp
+++ b/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp
@@ -46,6 +46,15 @@ class ECCOpQueue {
 
     std::array<Point, 4> ultra_ops_commitments;
 
+    // as we populate the op_queue, we track the number of rows in each circuit section,
+    // as well as the number of multiplications performed.
+    // This is to avoid expensive O(n) logic to compute the number of rows and muls during witness computation
+    uint32_t cached_num_muls = 0;
+    uint32_t cached_active_msm_count = 0;
+    uint32_t num_transcript_rows = 0;
+    uint32_t num_precompute_table_rows = 0;
+    uint32_t num_msm_rows = 0;
+
     Point get_accumulator() { return accumulator; }
 
     /**
@@ -56,6 +65,20 @@ class ECCOpQueue {
      */
     void prepend_previous_queue(const ECCOpQueue& previous)
     {
+        if (!previous.raw_ops.empty() && !raw_ops.empty()) {
+            // Check we are not merging op queue that does not reset accumulator!
+            // Note - eccvm does not directly constrain this to not happen. If we need such checks they need to be
+            // applied when the transcript is being written into
+            ASSERT(previous.raw_ops.back().eq || previous.raw_ops.back().reset);
+        }
+        // We shouldn't be merging if there is a previous active msm!
+        ASSERT(previous.cached_active_msm_count == 0);
+
+        cached_num_muls += previous.cached_num_muls;
+        num_msm_rows += previous.num_msm_rows;
+        num_precompute_table_rows += previous.num_precompute_table_rows;
+        num_transcript_rows += previous.num_transcript_rows;
+
         // Allocate enough space
         std::vector<ECCVMOperation> raw_ops_updated(raw_ops.size() + previous.raw_ops.size());
         // Copy the previous raw ops to the beginning of the new vector
@@ -116,6 +139,12 @@ class ECCOpQueue {
         auto commit_temp = lhs.ultra_ops_commitments;
         lhs.ultra_ops_commitments = rhs.ultra_ops_commitments;
         rhs.ultra_ops_commitments = commit_temp;
+
+        std::swap(lhs.cached_num_muls, rhs.cached_num_muls);
+        std::swap(lhs.cached_active_msm_count, rhs.cached_active_msm_count);
+        std::swap(lhs.num_transcript_rows, rhs.num_transcript_rows);
+        std::swap(lhs.num_precompute_table_rows, rhs.num_precompute_table_rows);
+        std::swap(lhs.num_msm_rows, rhs.num_msm_rows);
     }
 
     /**
@@ -190,6 +219,93 @@ class ECCOpQueue {
         this->set_commitment_data(mock_op_queue_commitments);
     }
 
+    /**
+     * @brief Get the number of rows in the 'msm' column section of the ECCVM, associated with a single multiscalar mul
+     *
+     * @param msm_count number of scalar multiplications in the msm
+     * @return uint32_t (num_rounds + 1) * ceil(msm_count / ADDITIONS_PER_ROW) + (num_rounds - 1)
+     */
+    static uint32_t get_msm_row_count_for_single_msm(const size_t msm_count)
+    {
+        const size_t rows_per_round =
+            (msm_count / eccvm::ADDITIONS_PER_ROW) + (msm_count % eccvm::ADDITIONS_PER_ROW != 0 ? 1 : 0);
+        constexpr size_t num_rounds = eccvm::NUM_SCALAR_BITS / eccvm::WNAF_SLICE_BITS;
+        const size_t num_rows_for_all_rounds = (num_rounds + 1) * rows_per_round; // + 1 round for skew
+        const size_t num_double_rounds = num_rounds - 1;
+        const size_t num_rows_for_msm = num_rows_for_all_rounds + num_double_rounds;
+
+        return static_cast<uint32_t>(num_rows_for_msm);
+    }
+
+    /**
+     * @brief Get the number of precompute table rows required by a single msm
+     *
+     * @param msm_count number of scalar multiplications in the msm
+     * @return uint32_t msm_count * (NUM_WNAF_SLICES / WNAF_SLICES_PER_ROW)
+     */
+    static uint32_t get_precompute_table_row_count_for_single_msm(const size_t msm_count)
+    {
+        constexpr size_t num_precompute_rows_per_scalar = eccvm::NUM_WNAF_SLICES / eccvm::WNAF_SLICES_PER_ROW;
+        const size_t num_rows_for_precompute_table = msm_count * num_precompute_rows_per_scalar;
+        return static_cast<uint32_t>(num_rows_for_precompute_table);
+    }
+
+    /**
+     * @brief Get the number of rows in the 'msm' column section, for all msms in the circuit
+     *
+     * @return size_t rows of all completed msms + rows of the msm still being accumulated (if any) + 2 extra rows
+     */
+    size_t get_num_msm_rows() const
+    {
+        size_t msm_rows = num_msm_rows + 2; // + 1 row at the start and 1 row at the end of the msm section
+        if (cached_active_msm_count > 0) {
+            msm_rows += get_msm_row_count_for_single_msm(cached_active_msm_count);
+        }
+        return msm_rows;
+    }
+
+    /**
+     * @brief Get the number of rows for the current ECCVM circuit
+     *
+     * @return size_t the largest of the transcript, msm and precompute table section row counts
+     */
+    size_t get_num_rows() const
+    {
+        // add 1 row to start and end of transcript and msm sections
+        const size_t transcript_rows = num_transcript_rows + 2;
+        size_t msm_rows = num_msm_rows + 2;
+        // add 1 row to start of precompute table section
+        size_t precompute_rows = num_precompute_table_rows + 1;
+        if (cached_active_msm_count > 0) { // an msm is still active: its rows are not in the cached totals yet
+            msm_rows += get_msm_row_count_for_single_msm(cached_active_msm_count);
+            precompute_rows += get_precompute_table_row_count_for_single_msm(cached_active_msm_count);
+        }
+
+        return std::max(transcript_rows, std::max(msm_rows, precompute_rows));
+    }
+
+    /**
+     * @brief when inserting operations, update the mul count of the active msm, or finalize the msm on a non-mul op
+     *
+     * @param op the operation being inserted (callers pass the op they have just appended to raw_ops)
+     */
+    void update_cached_msms(const ECCVMOperation& op)
+    {
+        if (op.mul) {
+            if (op.z1 != 0) {
+                cached_active_msm_count++;
+            }
+            if (op.z2 != 0) {
+                cached_active_msm_count++;
+            }
+        } else if (cached_active_msm_count != 0) { // non-mul op ends the active msm; fold it into the totals
+            num_msm_rows += get_msm_row_count_for_single_msm(cached_active_msm_count);
+            num_precompute_table_rows += get_precompute_table_row_count_for_single_msm(cached_active_msm_count);
+            cached_num_muls += cached_active_msm_count;
+            cached_active_msm_count = 0;
+        }
+    }
+
     /**
      * @brief Write point addition op to queue and natively perform addition
      *
@@ -211,6 +327,8 @@ class ECCOpQueue {
             .z2 = 0,
             .mul_scalar_full = 0,
         });
+        num_transcript_rows += 1;
+        update_cached_msms(raw_ops.back());
     }
 
     /**
@@ -240,6 +358,9 @@ class ECCOpQueue {
             .z2 = z2,
             .mul_scalar_full = scalar,
         });
+        num_transcript_rows += 1;
+
+        update_cached_msms(raw_ops.back());
     }
 
     /**
@@ -262,7 +383,9 @@ class ECCOpQueue {
             .z2 = 0,
             .mul_scalar_full = 0,
         });
+        num_transcript_rows += 1;
 
+        update_cached_msms(raw_ops.back());
         return expected;
     }
 
@@ -282,6 +405,9 @@ class ECCOpQueue {
             .z2 = 0,
             .mul_scalar_full = 0,
         });
+        num_transcript_rows += 1;
+
+        update_cached_msms(raw_ops.back());
     }
 
     /**

From c345b096e66e9d254ce5db0397f7386ca81f3ff9 Mon Sep 17 00:00:00 2001
From: zac-williamson <blorktronics@gmail.com>
Date: Thu, 14 Mar 2024 12:57:28 +0000
Subject: [PATCH 2/4] removed more eccvm redundant inverses, multithreaded
 eccvm table precomputation

---
 .../eccvm/eccvm_builder_types.hpp             |   3 +-
 .../eccvm/eccvm_circuit_builder.hpp           | 122 ++++++++++++-----
 .../eccvm/precomputed_tables_builder.hpp      | 123 +++++++++---------
 .../eccvm/transcript_builder.hpp              |  24 ++--
 4 files changed, 165 insertions(+), 107 deletions(-)

diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_builder_types.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_builder_types.hpp
index 96873b6fd92..9ba785657d9 100644
--- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_builder_types.hpp
+++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_builder_types.hpp
@@ -40,7 +40,8 @@ template <typename CycleGroup> struct ScalarMul {
     typename CycleGroup::affine_element base_point;
     std::array<int, NUM_WNAF_SLICES> wnaf_slices;
     bool wnaf_skew;
-    std::array<typename CycleGroup::affine_element, POINT_TABLE_SIZE> precomputed_table;
+    // size bumped by 1 to record base_point.dbl()
+    std::array<typename CycleGroup::affine_element, POINT_TABLE_SIZE + 1> precomputed_table;
 };
 
 template <typename CycleGroup> using MSM = std::vector<ScalarMul<CycleGroup>>;
diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp
index bb78f8a413d..f4084ca4416 100644
--- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp
+++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/eccvm_circuit_builder.hpp
@@ -57,7 +57,8 @@ template <typename Flavor> class ECCVMCircuitBuilder {
          */
         const auto compute_precomputed_table = [](const AffineElement& base_point) {
             const auto d2 = Element(base_point).dbl();
-            std::array<Element, POINT_TABLE_SIZE> table;
+            std::array<Element, POINT_TABLE_SIZE + 1> table;
+            table[POINT_TABLE_SIZE] = d2; // need this for later
             table[POINT_TABLE_SIZE / 2] = base_point;
             for (size_t i = 1; i < POINT_TABLE_SIZE / 2; ++i) {
                 table[i + POINT_TABLE_SIZE / 2] = Element(table[i + POINT_TABLE_SIZE / 2 - 1]) + d2;
@@ -66,10 +67,10 @@ template <typename Flavor> class ECCVMCircuitBuilder {
                 table[i] = -table[POINT_TABLE_SIZE - 1 - i];
             }
 
-            Element::batch_normalize(&table[0], POINT_TABLE_SIZE);
-            std::array<AffineElement, POINT_TABLE_SIZE> result;
-            for (size_t i = 0; i < POINT_TABLE_SIZE; ++i) {
-                result[i] = AffineElement{ .x = table[i].x, .y = table[i].y };
+            Element::batch_normalize(&table[0], POINT_TABLE_SIZE + 1);
+            std::array<AffineElement, POINT_TABLE_SIZE + 1> result;
+            for (size_t i = 0; i < POINT_TABLE_SIZE + 1; ++i) {
+                result[i] = AffineElement(table[i].x, table[i].y);
             }
             return result;
         };
@@ -111,9 +112,83 @@ template <typename Flavor> class ECCVMCircuitBuilder {
 
             return output;
         };
-        std::vector<MSM> msms;
-        std::vector<ScalarMul> active_msm;
 
+        // a vector of MSMs = a vector of a vector of scalar muls
+        // each mul op records (msm index, offset within that msm) so that the msms can be populated in parallel
+        size_t msm_count = 0;
+        size_t active_mul_count = 0;
+        std::vector<size_t> msm_opqueue_index;
+        std::vector<std::pair<size_t, size_t>> msm_mul_index;
+        std::vector<size_t> msm_sizes;
+
+        // std::vector<std::vector<size_t>> msm_indices;
+        // std::vector<size_t> active_msm_indices;
+        for (size_t i = 0; i < op_queue->raw_ops.size(); ++i) {
+            const auto& op = op_queue->raw_ops[i];
+            if (op.mul) {
+                if (op.z1 != 0 || op.z2 != 0) {
+                    msm_opqueue_index.push_back(i);
+                    msm_mul_index.emplace_back(msm_count, active_mul_count);
+                }
+                if (op.z1 != 0) {
+                    active_mul_count++;
+                }
+                if (op.z2 != 0) {
+                    active_mul_count++;
+                }
+            } else if (active_mul_count > 0) {
+                msm_sizes.push_back(active_mul_count);
+                msm_count++;
+                active_mul_count = 0;
+            }
+        }
+        // if last op is a mul we have not correctly computed the total number of msms
+        if (op_queue->raw_ops.back().mul) {
+            msm_sizes.push_back(active_mul_count);
+            msm_count++;
+        }
+        std::vector<MSM> msms_test(msm_count);
+        for (size_t i = 0; i < msm_count; ++i) {
+            auto& msm = msms_test[i];
+            msm.resize(msm_sizes[i]);
+        }
+
+        run_loop_in_parallel(msm_opqueue_index.size(), [&](size_t start, size_t end) {
+            for (size_t i = start; i < end; i++) {
+                //  for (size_t i = 0; i < msm_opqueue_index.size(); ++i) {
+                const size_t opqueue_index = msm_opqueue_index[i];
+                const auto& op = op_queue->raw_ops[opqueue_index];
+                auto [msm_index, mul_index] = msm_mul_index[i];
+                if (op.z1 != 0) {
+                    ASSERT(msms_test.size() > msm_index);
+                    ASSERT(msms_test[msm_index].size() > mul_index);
+                    msms_test[msm_index][mul_index] = (ScalarMul{
+                        .pc = 0,
+                        .scalar = op.z1,
+                        .base_point = op.base_point,
+                        .wnaf_slices = compute_wnaf_slices(op.z1),
+                        .wnaf_skew = (op.z1 & 1) == 0,
+                        .precomputed_table = compute_precomputed_table(op.base_point),
+                    });
+                    mul_index++;
+                }
+                if (op.z2 != 0) {
+                    ASSERT(msms_test.size() > msm_index);
+                    ASSERT(msms_test[msm_index].size() > mul_index);
+                    auto endo_point = AffineElement{ op.base_point.x * FF::cube_root_of_unity(), -op.base_point.y };
+                    msms_test[msm_index][mul_index] = (ScalarMul{
+                        .pc = 0,
+                        .scalar = op.z2,
+                        .base_point = endo_point,
+                        .wnaf_slices = compute_wnaf_slices(op.z2),
+                        .wnaf_skew = (op.z2 & 1) == 0,
+                        .precomputed_table = compute_precomputed_table(endo_point),
+                    });
+                }
+            }
+        });
+
+        // update pc. easier to do this serially but in theory could be optimised out
         // We start pc at `num_muls` and decrement for each mul processed.
         // This gives us two desired properties:
         // 1: the value of pc at the 1st row = number of muls (easy to check)
@@ -122,40 +197,15 @@ template <typename Flavor> class ECCVMCircuitBuilder {
         // sumcheck relations that involve pc (if we did the other way around, starting at 1 and ending at num_muls,
         // we create a discontinuity in pc values between the last transcript row and the following empty row)
         uint32_t pc = num_muls;
-
-        const auto process_mul = [&active_msm, &pc, &compute_wnaf_slices, &compute_precomputed_table](
-                                     const auto& scalar, const auto& base_point) {
-            if (scalar != 0) {
-                active_msm.push_back(ScalarMul{
-                    .pc = pc,
-                    .scalar = scalar,
-                    .base_point = base_point,
-                    .wnaf_slices = compute_wnaf_slices(scalar),
-                    .wnaf_skew = (scalar & 1) == 0,
-                    .precomputed_table = compute_precomputed_table(base_point),
-                });
+        for (auto& msm : msms_test) {
+            for (auto& mul : msm) {
+                mul.pc = pc;
                 pc--;
             }
-        };
-
-        for (auto& op : op_queue->raw_ops) {
-            if (op.mul) {
-                process_mul(op.z1, op.base_point);
-                process_mul(op.z2, AffineElement{ op.base_point.x * FF::cube_root_of_unity(), -op.base_point.y });
-
-            } else {
-                if (!active_msm.empty()) {
-                    msms.push_back(active_msm);
-                    active_msm = {};
-                }
-            }
-        }
-        if (!active_msm.empty()) {
-            msms.push_back(active_msm);
         }
 
         ASSERT(pc == 0);
-        return msms;
+        return msms_test;
     }
 
     static std::vector<ScalarMul> get_flattened_scalar_muls(const std::vector<MSM>& msms)
diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/precomputed_tables_builder.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/precomputed_tables_builder.hpp
index 1c7d2bb443e..8924edac6ca 100644
--- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/precomputed_tables_builder.hpp
+++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/precomputed_tables_builder.hpp
@@ -36,75 +36,76 @@ template <typename Flavor> class ECCVMPrecomputedTablesBuilder {
     static std::vector<PrecomputeState> compute_precompute_state(
         const std::vector<bb::eccvm::ScalarMul<CycleGroup>>& ecc_muls)
     {
-        std::vector<PrecomputeState> precompute_state;
+        static constexpr size_t num_rows_per_scalar = NUM_WNAF_SLICES / WNAF_SLICES_PER_ROW;
+        const size_t num_precompute_rows = num_rows_per_scalar * ecc_muls.size() + 1;
+        std::vector<PrecomputeState> precompute_state(num_precompute_rows);
 
         // start with empty row (shiftable polynomials must have 0 as first coefficient)
-        precompute_state.push_back(PrecomputeState{});
-        static constexpr size_t num_rows_per_scalar = NUM_WNAF_SLICES / WNAF_SLICES_PER_ROW;
+        precompute_state[0] = PrecomputeState{};
 
         // current impl doesn't work if not 4
         static_assert(WNAF_SLICES_PER_ROW == 4);
 
-        for (const auto& entry : ecc_muls) {
-            const auto& slices = entry.wnaf_slices;
-            uint256_t scalar_sum = 0;
-
-            const Element point = entry.base_point;
-            const Element d2 = point.dbl();
-
-            for (size_t i = 0; i < num_rows_per_scalar; ++i) {
-                PrecomputeState row;
-                const int slice0 = slices[i * WNAF_SLICES_PER_ROW];
-                const int slice1 = slices[i * WNAF_SLICES_PER_ROW + 1];
-                const int slice2 = slices[i * WNAF_SLICES_PER_ROW + 2];
-                const int slice3 = slices[i * WNAF_SLICES_PER_ROW + 3];
-
-                const int slice0base2 = (slice0 + 15) / 2;
-                const int slice1base2 = (slice1 + 15) / 2;
-                const int slice2base2 = (slice2 + 15) / 2;
-                const int slice3base2 = (slice3 + 15) / 2;
-
-                // convert into 2-bit chunks
-                row.s1 = slice0base2 >> 2;
-                row.s2 = slice0base2 & 3;
-                row.s3 = slice1base2 >> 2;
-                row.s4 = slice1base2 & 3;
-                row.s5 = slice2base2 >> 2;
-                row.s6 = slice2base2 & 3;
-                row.s7 = slice3base2 >> 2;
-                row.s8 = slice3base2 & 3;
-                bool last_row = (i == num_rows_per_scalar - 1);
-
-                row.skew = last_row ? entry.wnaf_skew : false;
-
-                row.scalar_sum = scalar_sum;
-
-                // N.B. we apply a constraint that requires slice1 to be positive for the 1st row of each scalar sum.
-                //      This ensures we do not have WNAF representations of negative values
-                const int row_chunk = slice3 + slice2 * (1 << 4) + slice1 * (1 << 8) + slice0 * (1 << 12);
-
-                bool chunk_negative = row_chunk < 0;
-
-                scalar_sum = scalar_sum << (WNAF_SLICE_BITS * WNAF_SLICES_PER_ROW);
-                if (chunk_negative) {
-                    scalar_sum -= static_cast<uint64_t>(-row_chunk);
-                } else {
-                    scalar_sum += static_cast<uint64_t>(row_chunk);
+        run_loop_in_parallel(ecc_muls.size(), [&](size_t start, size_t end) {
+            for (size_t j = start; j < end; j++) {
+                const auto& entry = ecc_muls[j];
+                const auto& slices = entry.wnaf_slices;
+                uint256_t scalar_sum = 0;
+
+                for (size_t i = 0; i < num_rows_per_scalar; ++i) {
+                    PrecomputeState row;
+                    const int slice0 = slices[i * WNAF_SLICES_PER_ROW];
+                    const int slice1 = slices[i * WNAF_SLICES_PER_ROW + 1];
+                    const int slice2 = slices[i * WNAF_SLICES_PER_ROW + 2];
+                    const int slice3 = slices[i * WNAF_SLICES_PER_ROW + 3];
+
+                    const int slice0base2 = (slice0 + 15) / 2;
+                    const int slice1base2 = (slice1 + 15) / 2;
+                    const int slice2base2 = (slice2 + 15) / 2;
+                    const int slice3base2 = (slice3 + 15) / 2;
+
+                    // convert into 2-bit chunks
+                    row.s1 = slice0base2 >> 2;
+                    row.s2 = slice0base2 & 3;
+                    row.s3 = slice1base2 >> 2;
+                    row.s4 = slice1base2 & 3;
+                    row.s5 = slice2base2 >> 2;
+                    row.s6 = slice2base2 & 3;
+                    row.s7 = slice3base2 >> 2;
+                    row.s8 = slice3base2 & 3;
+                    bool last_row = (i == num_rows_per_scalar - 1);
+
+                    row.skew = last_row ? entry.wnaf_skew : false;
+
+                    row.scalar_sum = scalar_sum;
+
+                    // N.B. we apply a constraint that requires slice1 to be positive for the 1st row of each scalar
+                    // sum. This ensures we do not have WNAF representations of negative values
+                    const int row_chunk = slice3 + slice2 * (1 << 4) + slice1 * (1 << 8) + slice0 * (1 << 12);
+
+                    bool chunk_negative = row_chunk < 0;
+
+                    scalar_sum = scalar_sum << (WNAF_SLICE_BITS * WNAF_SLICES_PER_ROW);
+                    if (chunk_negative) {
+                        scalar_sum -= static_cast<uint64_t>(-row_chunk);
+                    } else {
+                        scalar_sum += static_cast<uint64_t>(row_chunk);
+                    }
+                    row.round = static_cast<uint32_t>(i);
+                    row.point_transition = last_row;
+                    row.pc = entry.pc;
+
+                    if (last_row) {
+                        ASSERT(scalar_sum - entry.wnaf_skew == entry.scalar);
+                    }
+
+                    row.precompute_double = entry.precomputed_table[bb::eccvm::POINT_TABLE_SIZE];
+                    // fill accumulator in reverse order i.e. first row = 15[P], then 13[P], ..., 1[P]
+                    row.precompute_accumulator = entry.precomputed_table[bb::eccvm::POINT_TABLE_SIZE - 1 - i];
+                    precompute_state[j * num_rows_per_scalar + i + 1] = (row);
                 }
-                row.round = static_cast<uint32_t>(i);
-                row.point_transition = last_row;
-                row.pc = entry.pc;
-
-                if (last_row) {
-                    ASSERT(scalar_sum - entry.wnaf_skew == entry.scalar);
-                }
-
-                row.precompute_double = d2;
-                // fill accumulator in reverse order i.e. first row = 15[P], then 13[P], ..., 1[P]
-                row.precompute_accumulator = entry.precomputed_table[bb::eccvm::POINT_TABLE_SIZE - 1 - i];
-                precompute_state.emplace_back(row);
             }
-        }
+        });
         return precompute_state;
     }
 };
diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/transcript_builder.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/transcript_builder.hpp
index 1ff3d8b4cba..69ea505b242 100644
--- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/transcript_builder.hpp
+++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_builder/eccvm/transcript_builder.hpp
@@ -60,7 +60,10 @@ template <typename Flavor> class ECCVMTranscriptBuilder {
     static std::vector<TranscriptState> compute_transcript_state(
         const std::vector<bb::eccvm::VMOperation<CycleGroup>>& vm_operations, const uint32_t total_number_of_muls)
     {
-        std::vector<TranscriptState> transcript_state;
+        const size_t num_transcript_entries = vm_operations.size() + 2;
+
+        std::vector<TranscriptState> transcript_state(num_transcript_entries);
+        std::vector<FF> inverse_trace(num_transcript_entries - 2);
         VMState state{
             .pc = total_number_of_muls,
             .count = 0,
@@ -69,11 +72,10 @@ template <typename Flavor> class ECCVMTranscriptBuilder {
             .is_accumulator_empty = true,
         };
         VMState updated_state;
-
         // add an empty row. 1st row all zeroes because of our shiftable polynomials
-        transcript_state.emplace_back(TranscriptState{});
+        transcript_state[0] = (TranscriptState{});
         for (size_t i = 0; i < vm_operations.size(); ++i) {
-            TranscriptState row;
+            TranscriptState& row = transcript_state[i + 1];
             const bb::eccvm::VMOperation<CycleGroup>& entry = vm_operations[i];
 
             const bool is_mul = entry.mul;
@@ -158,11 +160,13 @@ template <typename Flavor> class ECCVMTranscriptBuilder {
                 ASSERT((row.msm_output_x != row.accumulator_x) &&
                        "eccvm: attempting msm. Result point x-coordinate matches accumulator x-coordinate.");
                 state.msm_accumulator = CycleGroup::affine_point_at_infinity;
-                row.collision_check = (row.msm_output_x - row.accumulator_x).invert();
+                inverse_trace[i] = (row.msm_output_x - row.accumulator_x);
             } else if (entry.add && !row.accumulator_empty) {
                 ASSERT((row.base_x != row.accumulator_x) &&
                        "eccvm: attempting to add points with matching x-coordinates");
-                row.collision_check = (row.base_x - row.accumulator_x).invert();
+                inverse_trace[i] = (row.base_x - row.accumulator_x);
+            } else {
+                inverse_trace[i] = (0);
             }
 
             state = updated_state;
@@ -170,16 +174,18 @@ template <typename Flavor> class ECCVMTranscriptBuilder {
             if (entry.mul && next_not_msm) {
                 state.msm_accumulator = CycleGroup::affine_point_at_infinity;
             }
-            transcript_state.emplace_back(row);
         }
 
-        TranscriptState final_row;
+        FF::batch_invert(&inverse_trace[0], inverse_trace.size());
+        for (size_t i = 0; i < inverse_trace.size(); ++i) {
+            transcript_state[i + 1].collision_check = inverse_trace[i];
+        }
+        TranscriptState& final_row = transcript_state.back();
         final_row.pc = updated_state.pc;
         final_row.accumulator_x = (updated_state.accumulator.is_point_at_infinity()) ? 0 : updated_state.accumulator.x;
         final_row.accumulator_y = (updated_state.accumulator.is_point_at_infinity()) ? 0 : updated_state.accumulator.y;
         final_row.accumulator_empty = updated_state.is_accumulator_empty;
 
-        transcript_state.push_back(final_row);
         return transcript_state;
     }
 };

From 59966725fa04b84d519d638896f25de004d15cca Mon Sep 17 00:00:00 2001
From: zac-williamson <blorktronics@gmail.com>
Date: Fri, 15 Mar 2024 17:47:59 +0000
Subject: [PATCH 3/4] fixed ecc op queue test

---
 .../proof_system/op_queue/ecc_op_queue.hpp    | 24 +++++++++++++++++++
 .../op_queue/ecc_op_queue.test.cpp            |  5 +++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp b/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp
index ee9430eedff..e4ebd9719ea 100644
--- a/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp
+++ b/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.hpp
@@ -389,6 +389,30 @@ class ECCOpQueue {
         return expected;
     }
 
+    /**
+     * @brief Write reset op to queue and set the internal accumulator to the point at infinity
+     *
+     * @details Also counts one transcript row and updates the cached msm counters; returns nothing.
+     */
+    void reset()
+    {
+        accumulator.self_set_infinity();
+
+        raw_ops.emplace_back(ECCVMOperation{
+            .add = false,
+            .mul = false,
+            .eq = false,
+            .reset = true,
+            .base_point = { 0, 0 },
+            .z1 = 0,
+            .z2 = 0,
+            .mul_scalar_full = 0,
+        });
+        num_transcript_rows += 1;
+
+        update_cached_msms(raw_ops.back());
+    }
+
     /**
      * @brief Write empty row to queue
      *
diff --git a/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.test.cpp b/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.test.cpp
index d7a69547f1b..265727f4dd0 100644
--- a/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.test.cpp
+++ b/barretenberg/cpp/src/barretenberg/proof_system/op_queue/ecc_op_queue.test.cpp
@@ -52,16 +52,18 @@ TEST(ECCOpQueueTest, PrependAndSwapTests)
     ECCOpQueue op_queue_a;
     op_queue_a.add_accumulate(P1 + P1);
     op_queue_a.mul_accumulate(P2, z + z);
-
+    op_queue_a.reset();
     // Add different operations to b
     ECCOpQueue op_queue_b;
     op_queue_b.mul_accumulate(P2, z);
     op_queue_b.add_accumulate(P1);
+    op_queue_b.reset();
 
     // Add same operations as to a
     ECCOpQueue op_queue_c;
     op_queue_c.add_accumulate(P1 + P1);
     op_queue_c.mul_accumulate(P2, z + z);
+    op_queue_c.reset();
 
     // Swap b with a
     std::swap(op_queue_b, op_queue_a);
@@ -77,6 +79,7 @@ TEST(ECCOpQueueTest, PrependAndSwapTests)
     // Append same operations as now in a to c
     op_queue_c.mul_accumulate(P2, z);
     op_queue_c.add_accumulate(P1);
+    op_queue_c.reset();
 
     // Check a==c
     for (size_t i = 0; i < op_queue_c.raw_ops.size(); i++) {

From 25d8a3c0ed3dc5ce045c3b2c61d6cd1aa9b5af6b Mon Sep 17 00:00:00 2001
From: codygunton <codygunton@gmail.com>
Date: Mon, 18 Mar 2024 11:05:49 +0000
Subject: [PATCH 4/4] Analysis no longer needed

---
 barretenberg/cpp/scripts/analyze_client_ivc_bench.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/barretenberg/cpp/scripts/analyze_client_ivc_bench.py b/barretenberg/cpp/scripts/analyze_client_ivc_bench.py
index 6cedf3509e7..af374d05a5d 100644
--- a/barretenberg/cpp/scripts/analyze_client_ivc_bench.py
+++ b/barretenberg/cpp/scripts/analyze_client_ivc_bench.py
@@ -49,13 +49,6 @@
     time_ms = bench[key]/1e6
     print(f"{key:<{max_label_length}}{time_ms:>8.0f} {time_ms/sum_of_kept_times_ms:>8.2%}")
 
-
-print('\nBreakdown of ECCVMProver::create_prover:')
-for key in ["ECCVMComposer::compute_witness(t)", "ECCVMComposer::create_proving_key(t)"]:
-    time_ms = bench[key]/1e6
-    total_time_ms = bench["ECCVMComposer::create_prover(t)"]/1e6
-    print(f"{key:<{max_label_length}}{time_ms:>8.0f}  {time_ms/total_time_ms:>8.2%}")
-
 print('\nBreakdown of ProtogalaxyProver::fold_instances:')
 protogalaxy_round_labels = [
     "ProtoGalaxyProver_::preparation_round(t)",