AztecProtocol · Rumata888 · Jan 11, 2024 · Jan 3, 2024 · Jan 8, 2024 · Jan 8, 2024
diff --git a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(ipa_bench)
 add_subdirectory(decrypt_bench)
 add_subdirectory(pippenger_bench)
 add_subdirectory(plonk_bench)

diff --git a/barretenberg/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Each source represents a separate benchmark suite
+set(BENCHMARK_SOURCES
+  ipa.bench.cpp
+)
+
+# Required libraries for benchmark suites
+set(LINKED_LIBRARIES
+  benchmark::benchmark
+  ultra_honk
+)
+
+# Add executable <name>_bench plus a run_<name> target per suite; COMMAND names the executable target so CMake runs it
+foreach(BENCHMARK_SOURCE ${BENCHMARK_SOURCES})
+  get_filename_component(BENCHMARK_NAME ${BENCHMARK_SOURCE} NAME_WE) # name without directory/extensions: ipa.bench.cpp -> ipa
+  add_executable(${BENCHMARK_NAME}_bench ${BENCHMARK_SOURCE})
+  target_link_libraries(${BENCHMARK_NAME}_bench ${LINKED_LIBRARIES})
+  add_custom_target(run_${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME}_bench WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+endforeach()
diff --git a/barretenberg/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp
@@ -0,0 +1,73 @@
+#include "barretenberg/commitment_schemes/ipa/ipa.hpp"
+#include <benchmark/benchmark.h>
+
+using namespace benchmark;
+using namespace barretenberg;
+using namespace proof_system;
+using namespace proof_system::honk::pcs::ipa;
+namespace {
+using Curve = curve::Grumpkin;
+using Fr = Curve::ScalarField;
+using IPA = IPA<Curve>;
+using OpeningPair = honk::pcs::OpeningPair<Curve>;
+using OpeningClaim = honk::pcs::OpeningClaim<Curve>;
+using Polynomial = Polynomial<Curve::ScalarField>;
+using CommitmentKey = honk::pcs::CommitmentKey<Curve>;
+using VerifierCommitmentKey = honk::pcs::VerifierCommitmentKey<Curve>;
+
+// Sizes 2^MIN..2^MAX are benchmarked; SRS size, key sizes and per-size slots (index = log2 size - MIN) follow from it.
+constexpr size_t MIN_POLYNOMIAL_DEGREE_LOG2 = 10;
+constexpr size_t MAX_POLYNOMIAL_DEGREE_LOG2 = 16;
+constexpr size_t MAX_POLYNOMIAL_SIZE = size_t{ 1 } << MAX_POLYNOMIAL_DEGREE_LOG2;
+constexpr size_t NUM_BENCHMARKED_SIZES = MAX_POLYNOMIAL_DEGREE_LOG2 - MIN_POLYNOMIAL_DEGREE_LOG2 + 1;
+std::shared_ptr<barretenberg::srs::factories::CrsFactory<curve::Grumpkin>> crs_factory(
+    new barretenberg::srs::factories::FileCrsFactory<curve::Grumpkin>("../srs_db/grumpkin", MAX_POLYNOMIAL_SIZE));
+auto ck = std::make_shared<CommitmentKey>(MAX_POLYNOMIAL_SIZE, crs_factory);
+auto vk = std::make_shared<VerifierCommitmentKey>(MAX_POLYNOMIAL_SIZE, crs_factory);
+std::vector<std::shared_ptr<honk::BaseTranscript>> prover_transcripts(NUM_BENCHMARKED_SIZES);
+std::vector<OpeningClaim> opening_claims(NUM_BENCHMARKED_SIZES);
+
+// Benchmarks IPA opening-proof construction for a random polynomial with 2^range(0) coefficients. Only
+// compute_opening_proof is timed; the claim and transcript are kept per size so ipa_verify can check them.
+void ipa_open(State& state) noexcept
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    const auto log_n = static_cast<size_t>(state.range(0));
+    const size_t n = size_t{ 1 } << log_n; // shift performed in size_t rather than int
+    const size_t slot = log_n - MIN_POLYNOMIAL_DEGREE_LOG2;
+    for (auto _ : state) {
+        state.PauseTiming();
+        // Random polynomial and random evaluation point
+        Polynomial poly(n);
+        for (size_t i = 0; i < n; ++i) {
+            poly[i] = Fr::random_element(&engine);
+        }
+        const auto x = Fr::random_element(&engine);
+        const OpeningPair opening_pair = { x, poly.evaluate(x) };
+        // Fill this size's slot during setup so no bookkeeping copies land inside the timed region
+        opening_claims[slot] = OpeningClaim{ opening_pair, ck->commit(poly) };
+        prover_transcripts[slot] = std::make_shared<honk::BaseTranscript>();
+        state.ResumeTiming();
+        // The slot holds the same transcript object, so whatever the prover sends is available to ipa_verify
+        IPA::compute_opening_proof(ck, opening_pair, poly, prover_transcripts[slot]);
+    }
+}
+// Benchmarks IPA::verify on the proof ipa_open stored for this size; skips with an error if there is none.
+void ipa_verify(State& state) noexcept
+{
+    const size_t slot = static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2;
+    if (!prover_transcripts[slot]) { // ipa_open has not run for this size (e.g. it was filtered out)
+        return state.SkipWithError("ipa_verify requires ipa_open to have run for this size first");
+    }
+    for (auto _ : state) {
+        state.PauseTiming(); // untimed setup: fresh verifier transcript from the stored proof data
+        auto verifier_transcript = std::make_shared<honk::BaseTranscript>(prover_transcripts[slot]->proof_data);
+        state.ResumeTiming();
+        auto result = IPA::verify(vk, opening_claims[slot], verifier_transcript);
+        ASSERT(result);
+    }
+}
+} // namespace
+BENCHMARK(ipa_open)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
+BENCHMARK(ipa_verify)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
+BENCHMARK_MAIN();
diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -52,46 +52,67 @@ template <typename Curve> class IPA {
         auto a_vec = polynomial;
         auto srs_elements = ck->srs->get_monomial_points();
         std::vector<Commitment> G_vec_local(poly_degree);
+
         // The SRS stored in the commitment key is the result after applying the pippenger point table so the
         // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
         // G_vec_local should use only the original SRS thus we extract only the even indices.
-        for (size_t i = 0; i < poly_degree * 2; i += 2) {
-            G_vec_local[i >> 1] = srs_elements[i];
-        }
+        run_loop_in_parallel(
+            poly_degree,
+            [&G_vec_local, srs_elements](size_t start, size_t end) {
+                for (size_t i = start * 2; i < end * 2; i += 2) {
+                    G_vec_local[i >> 1] = srs_elements[i];
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
+
         std::vector<Fr> b_vec(poly_degree);
-        Fr b_power = 1;
-        for (size_t i = 0; i < poly_degree; i++) {
-            b_vec[i] = b_power;
-            b_power *= opening_pair.challenge;
-        }
+        run_loop_in_parallel(
+            poly_degree,
+            [&b_vec, &opening_pair](size_t start, size_t end) {
+                Fr b_power = opening_pair.challenge.pow(start);
+                for (size_t i = start; i < end; i++) {
+                    b_vec[i] = b_power;
+                    b_power *= opening_pair.challenge;
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
+
         // Iterate for log(poly_degree) rounds to compute the round commitments.
         auto log_poly_degree = static_cast<size_t>(numeric::get_msb(poly_degree));
         std::vector<GroupElement> L_elements(log_poly_degree);
         std::vector<GroupElement> R_elements(log_poly_degree);
         std::size_t round_size = poly_degree;
 
-        // TODO(#479): restructure IPA so it can be integrated with the pthread alternative to work queue (or even the
-        // work queue itself). Investigate whether parallelising parts of each rounds of IPA rounds brings significant
-        // improvements and see if reducing the size of G_vec_local and b_vec by taking the first iteration out of the
-        // loop can also be integrated.
+        // Perform IPA rounds
         for (size_t i = 0; i < log_poly_degree; i++) {
             round_size >>= 1;
             // Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo >
+            std::mutex addition_lock;
             Fr inner_prod_L = Fr::zero();
             Fr inner_prod_R = Fr::zero();
-            for (size_t j = 0; j < round_size; j++) {
-                inner_prod_L += a_vec[j] * b_vec[round_size + j];
-                inner_prod_R += a_vec[round_size + j] * b_vec[j];
-            }
+            // Run scalar product in parallel
+            run_loop_in_parallel(
+                round_size,
+                [&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) {
+                    Fr current_inner_prod_L = Fr::zero();
+                    Fr current_inner_prod_R = Fr::zero();
+                    for (size_t j = start; j < end; j++) {
+                        current_inner_prod_L += a_vec[j] * b_vec[round_size + j];
+                        current_inner_prod_R += a_vec[round_size + j] * b_vec[j];
+                    }
+                    addition_lock.lock();
+                    inner_prod_L += current_inner_prod_L;
+                    inner_prod_R += current_inner_prod_R;
+                    addition_lock.unlock();
+                },
+                /*no_multhreading_if_less_or_equal=*/8);
+
             // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
-            L_elements[i] =
-                // TODO(#473)
-                barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
-                    &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
+            L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
+                &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
             L_elements[i] += aux_generator * inner_prod_L;
 
             // R_i = < a_vec_hi, G_vec_lo > + inner_prod_R * aux_generator
-            // TODO(#473)
             R_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
                 &a_vec[round_size], &G_vec_local[0], round_size, ck->pippenger_runtime_state);
             R_elements[i] += aux_generator * inner_prod_R;
@@ -104,23 +125,32 @@ template <typename Curve> class IPA {
             const Fr round_challenge = transcript->get_challenge("IPA:round_challenge_" + index);
             const Fr round_challenge_inv = round_challenge.invert();
 
-            std::vector<Commitment> G_lo(G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size));
-            std::vector<Commitment> G_hi(G_vec_local.begin() + static_cast<long>(round_size), G_vec_local.end());
-            G_lo = GroupElement::batch_mul_with_endomorphism(G_lo, round_challenge_inv);
-            G_hi = GroupElement::batch_mul_with_endomorphism(G_hi, round_challenge);
+            auto G_lo = GroupElement::batch_mul_with_endomorphism(
+                std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size) },
+                round_challenge_inv);
+            auto G_hi = GroupElement::batch_mul_with_endomorphism(
+                std::span{ G_vec_local.begin() + static_cast<long>(round_size),
+                           G_vec_local.begin() + static_cast<long>(round_size * 2) },
+                round_challenge);
 
             // Update the vectors a_vec, b_vec and G_vec.
             // a_vec_next = a_vec_lo * round_challenge + a_vec_hi * round_challenge_inv
             // b_vec_next = b_vec_lo * round_challenge_inv + b_vec_hi * round_challenge
             // G_vec_next = G_vec_lo * round_challenge_inv + G_vec_hi * round_challenge
-            for (size_t j = 0; j < round_size; j++) {
-                a_vec[j] *= round_challenge;
-                a_vec[j] += round_challenge_inv * a_vec[round_size + j];
-                b_vec[j] *= round_challenge_inv;
-                b_vec[j] += round_challenge * b_vec[round_size + j];
-
-                G_vec_local[j] = G_lo[j] + G_hi[j];
-            }
+            run_loop_in_parallel(
+                round_size,
+                [&a_vec, &b_vec, &G_vec_local, &G_lo, &G_hi, round_challenge, round_challenge_inv, round_size](
+                    size_t start, size_t end) {
+                    for (size_t j = start; j < end; j++) {
+                        a_vec[j] *= round_challenge;
+                        a_vec[j] += round_challenge_inv * a_vec[round_size + j];
+                        b_vec[j] *= round_challenge_inv;
+                        b_vec[j] += round_challenge * b_vec[round_size + j];
+
+                        G_vec_local[j] = G_lo[j] + G_hi[j];
+                    }
+                },
+                /*no_multhreading_if_less_or_equal=*/4);
         }
 
         transcript->send_to_verifier("IPA:a_0", a_vec[0]);
@@ -166,7 +196,7 @@ template <typename Curve> class IPA {
             msm_scalars[2 * i] = round_challenges[i].sqr();
             msm_scalars[2 * i + 1] = round_challenges_inv[i].sqr();
         }
-        // TODO(#473)
+
         GroupElement LR_sums = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
             &msm_scalars[0], &msm_elements[0], pippenger_size, vk->pippenger_runtime_state);
         GroupElement C_zero = C_prime + LR_sums;
@@ -188,29 +218,42 @@ template <typename Curve> class IPA {
         // Compute G_zero
         // First construct s_vec
         std::vector<Fr> s_vec(poly_degree);
-        for (size_t i = 0; i < poly_degree; i++) {
-            Fr s_vec_scalar = Fr::one();
-            for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
-                auto bit = (i >> j) & 1;
-                bool b = static_cast<bool>(bit);
-                if (b) {
-                    s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
-                } else {
-                    s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
+        run_loop_in_parallel(
+            poly_degree,
+            [&s_vec, &round_challenges, &round_challenges_inv, log_poly_degree](size_t start, size_t end) {
+                for (size_t i = start; i < end; i++) {
+                    Fr s_vec_scalar = Fr::one();
+                    for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
+                        auto bit = (i >> j) & 1;
+                        bool b = static_cast<bool>(bit);
+                        if (b) {
+                            s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
+                        } else {
+                            s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
+                        }
+                    }
+                    s_vec[i] = s_vec_scalar;
                 }
-            }
-            s_vec[i] = s_vec_scalar;
-        }
+            },
+            /*no_multhreading_if_less_or_equal=*/4);
+
         auto srs_elements = vk->srs->get_monomial_points();
+
         // Copy the G_vector to local memory.
         std::vector<Commitment> G_vec_local(poly_degree);
+
         // The SRS stored in the commitment key is the result after applying the pippenger point table so the
         // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
         // G_vec_local should use only the original SRS thus we extract only the even indices.
-        for (size_t i = 0; i < poly_degree * 2; i += 2) {
-            G_vec_local[i >> 1] = srs_elements[i];
-        }
-        // TODO(#473)
+        run_loop_in_parallel(
+            poly_degree,
+            [&G_vec_local, srs_elements](size_t start, size_t end) {
+                for (size_t i = start * 2; i < end * 2; i += 2) {
+                    G_vec_local[i >> 1] = srs_elements[i];
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
+
         auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
             &s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state);
 

diff --git a/barretenberg/cpp/src/barretenberg/common/thread.cpp b/barretenberg/cpp/src/barretenberg/common/thread.cpp
@@ -86,3 +86,44 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func
 #endif
 #endif
 }
+
+/**
+ * @brief Run a range-based work loop on several threads
+ *
+ * @details Cuts the index range [0, num_points) into contiguous chunks, one per CPU at most, and invokes func once
+ * for every non-empty chunk through parallel_for
+ * @param num_points Total number of elements
+ * @param func A function or lambda expression that loops over the half-open range it is given, for example:
+ * [](size_t start, size_t end){for (size_t i=start; i<end; i++){(void)i;}}
+ * @param no_multhreading_if_less_or_equal If num_points is less than or equal to this value, func(0, num_points) is
+ * called directly and nothing is parallelized
+ *
+ */
+void run_loop_in_parallel(size_t num_points,
+                          const std::function<void(size_t, size_t)>& func,
+                          size_t no_multhreading_if_less_or_equal)
+{
+    // Small workloads are processed with a single direct call
+    if (num_points <= no_multhreading_if_less_or_equal) {
+        func(0, num_points);
+        return;
+    }
+
+    // Chunk size is num_points / cpu_count rounded up, so cpu_count chunks always cover the whole range
+    const size_t cpu_count = get_num_cpus();
+    const bool has_remainder = (num_points % cpu_count) != 0;
+    const size_t points_per_chunk = (num_points / cpu_count) + (has_remainder ? 1 : 0);
+
+    parallel_for(cpu_count, [num_points, points_per_chunk, &func](size_t chunk_index) {
+        // First index owned by this worker
+        const size_t start = points_per_chunk * chunk_index;
+        // When there are few points, the trailing workers have no chunk left to process
+        if (start >= num_points) {
+            return;
+        }
+
+        // The last chunk is clipped to the end of the range
+        const size_t end = start + std::min(num_points - start, points_per_chunk);
+        func(start, end);
+    });
+}
diff --git a/barretenberg/cpp/src/barretenberg/common/thread.hpp b/barretenberg/cpp/src/barretenberg/common/thread.hpp
@@ -23,3 +23,6 @@ inline size_t get_num_cpus_pow2()
 }
 
 void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
+void run_loop_in_parallel(size_t num_points,
+                          const std::function<void(size_t, size_t)>& func,
+                          size_t no_multhreading_if_less_or_equal = 0);
diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
@@ -92,7 +92,7 @@ template <class Fq, class Fr, class Params> class alignas(32) element {
 
     static void batch_normalize(element* elements, size_t num_elements) noexcept;
     static std::vector<affine_element<Fq, Fr, Params>> batch_mul_with_endomorphism(
-        const std::vector<affine_element<Fq, Fr, Params>>& points, const Fr& exponent) noexcept;
+        const std::span<affine_element<Fq, Fr, Params>>& points, const Fr& exponent) noexcept;
 
     Fq x;
     Fq y;