Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Parallel IPA #3882

Merged
merged 7 commits into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
add_subdirectory(ipa_bench)
add_subdirectory(decrypt_bench)
add_subdirectory(pippenger_bench)
add_subdirectory(plonk_bench)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Each source represents a separate benchmark suite
set(BENCHMARK_SOURCES
ipa.bench.cpp
)

# Required libraries for benchmark suites
set(LINKED_LIBRARIES
benchmark::benchmark
ultra_honk
)

# Add executable and custom target for each suite, e.g. ultra_honk_bench
foreach(BENCHMARK_SOURCE ${BENCHMARK_SOURCES})
get_filename_component(BENCHMARK_NAME ${BENCHMARK_SOURCE} NAME_WE) # extract name without extension
add_executable(${BENCHMARK_NAME}_bench ${BENCHMARK_SOURCE})
target_link_libraries(${BENCHMARK_NAME}_bench ${LINKED_LIBRARIES})
add_custom_target(run_${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
endforeach()
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#include "barretenberg/commitment_schemes/ipa/ipa.hpp"
#include <benchmark/benchmark.h>

using namespace benchmark;
using namespace barretenberg;
using namespace proof_system;
using namespace proof_system::honk::pcs::ipa;
namespace {
using Curve = curve::Grumpkin;
using Fr = Curve::ScalarField;
using IPA = IPA<Curve>;
using OpeningPair = honk::pcs::OpeningPair<Curve>;
using OpeningClaim = honk::pcs::OpeningClaim<Curve>;
using Polynomial = Polynomial<Curve::ScalarField>;
using CommitmentKey = honk::pcs::CommitmentKey<Curve>;
using VerifierCommitmentKey = honk::pcs::VerifierCommitmentKey<Curve>;

constexpr size_t MIN_POLYNOMIAL_DEGREE_LOG2 = 10;
constexpr size_t MAX_POLYNOMIAL_DEGREE_LOG2 = 16;
std::shared_ptr<barretenberg::srs::factories::CrsFactory<curve::Grumpkin>> crs_factory(
new barretenberg::srs::factories::FileCrsFactory<curve::Grumpkin>("../srs_db/grumpkin", 1 << 16));

auto ck = std::make_shared<CommitmentKey>(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory);
auto vk = std::make_shared<VerifierCommitmentKey>(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory);

std::vector<std::shared_ptr<honk::BaseTranscript>> prover_transcripts(MAX_POLYNOMIAL_DEGREE_LOG2 -
MIN_POLYNOMIAL_DEGREE_LOG2 + 1);
std::vector<OpeningClaim> opening_claims(MAX_POLYNOMIAL_DEGREE_LOG2 - MIN_POLYNOMIAL_DEGREE_LOG2 + 1);

void ipa_open(State& state) noexcept
{
numeric::random::Engine& engine = numeric::random::get_debug_engine();
for (auto _ : state) {
state.PauseTiming();
size_t n = 1 << static_cast<size_t>(state.range(0));
// Construct the polynomial
Polynomial poly(n);
for (size_t i = 0; i < n; ++i) {
poly[i] = Fr::random_element(&engine);
}
auto x = Fr::random_element(&engine);
auto eval = poly.evaluate(x);
const OpeningPair opening_pair = { x, eval };
const OpeningClaim opening_claim{ opening_pair, ck->commit(poly) };
// initialize empty prover transcript
auto prover_transcript = std::make_shared<honk::BaseTranscript>();
state.ResumeTiming();
// Compute proof
IPA::compute_opening_proof(ck, opening_pair, poly, prover_transcript);
// Store info for verifier
prover_transcripts[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2] = prover_transcript;
opening_claims[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2] = opening_claim;
}
}
void ipa_verify(State& state) noexcept
{
for (auto _ : state) {
state.PauseTiming();
// Retrieve proofs
auto prover_transcript = prover_transcripts[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2];
auto opening_claim = opening_claims[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2];
// initialize verifier transcript from proof data
auto verifier_transcript = std::make_shared<honk::BaseTranscript>(prover_transcript->proof_data);

state.ResumeTiming();
auto result = IPA::verify(vk, opening_claim, verifier_transcript);
ASSERT(result);
}
}
} // namespace
BENCHMARK(ipa_open)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
BENCHMARK(ipa_verify)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
BENCHMARK_MAIN();
143 changes: 93 additions & 50 deletions barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,46 +52,67 @@ template <typename Curve> class IPA {
auto a_vec = polynomial;
auto srs_elements = ck->srs->get_monomial_points();
std::vector<Commitment> G_vec_local(poly_degree);

// The SRS stored in the commitment key is the result after applying the pippenger point table so the
// values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
// G_vec_local should use only the original SRS thus we extract only the even indices.
for (size_t i = 0; i < poly_degree * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
run_loop_in_parallel(
poly_degree,
[&G_vec_local, srs_elements](size_t start, size_t end) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
},
/*no_multhreading_if_less_or_equal=*/16);

std::vector<Fr> b_vec(poly_degree);
Fr b_power = 1;
for (size_t i = 0; i < poly_degree; i++) {
b_vec[i] = b_power;
b_power *= opening_pair.challenge;
}
run_loop_in_parallel(
poly_degree,
[&b_vec, &opening_pair](size_t start, size_t end) {
Fr b_power = opening_pair.challenge.pow(start);
for (size_t i = start; i < end; i++) {
b_vec[i] = b_power;
b_power *= opening_pair.challenge;
}
},
/*no_multhreading_if_less_or_equal=*/16);

// Iterate for log(poly_degree) rounds to compute the round commitments.
auto log_poly_degree = static_cast<size_t>(numeric::get_msb(poly_degree));
std::vector<GroupElement> L_elements(log_poly_degree);
std::vector<GroupElement> R_elements(log_poly_degree);
std::size_t round_size = poly_degree;

// TODO(#479): restructure IPA so it can be integrated with the pthread alternative to work queue (or even the
// work queue itself). Investigate whether parallelising parts of each rounds of IPA rounds brings significant
// improvements and see if reducing the size of G_vec_local and b_vec by taking the first iteration out of the
// loop can also be integrated.
// Perform IPA rounds
for (size_t i = 0; i < log_poly_degree; i++) {
round_size >>= 1;
// Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo >
std::mutex addition_lock;
Fr inner_prod_L = Fr::zero();
Fr inner_prod_R = Fr::zero();
for (size_t j = 0; j < round_size; j++) {
inner_prod_L += a_vec[j] * b_vec[round_size + j];
inner_prod_R += a_vec[round_size + j] * b_vec[j];
}
// Run scalar product in parallel
run_loop_in_parallel(
round_size,
[&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) {
Fr current_inner_prod_L = Fr::zero();
Fr current_inner_prod_R = Fr::zero();
for (size_t j = start; j < end; j++) {
current_inner_prod_L += a_vec[j] * b_vec[round_size + j];
current_inner_prod_R += a_vec[round_size + j] * b_vec[j];
}
addition_lock.lock();
inner_prod_L += current_inner_prod_L;
inner_prod_R += current_inner_prod_R;
addition_lock.unlock();
},
/*no_multhreading_if_less_or_equal=*/8);

// L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
L_elements[i] =
// TODO(#473)
barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
L_elements[i] += aux_generator * inner_prod_L;

// R_i = < a_vec_hi, G_vec_lo > + inner_prod_R * aux_generator
// TODO(#473)
R_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&a_vec[round_size], &G_vec_local[0], round_size, ck->pippenger_runtime_state);
R_elements[i] += aux_generator * inner_prod_R;
Expand All @@ -104,23 +125,32 @@ template <typename Curve> class IPA {
const Fr round_challenge = transcript->get_challenge("IPA:round_challenge_" + index);
const Fr round_challenge_inv = round_challenge.invert();

std::vector<Commitment> G_lo(G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size));
std::vector<Commitment> G_hi(G_vec_local.begin() + static_cast<long>(round_size), G_vec_local.end());
G_lo = GroupElement::batch_mul_with_endomorphism(G_lo, round_challenge_inv);
G_hi = GroupElement::batch_mul_with_endomorphism(G_hi, round_challenge);
auto G_lo = GroupElement::batch_mul_with_endomorphism(
std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size) },
round_challenge_inv);
auto G_hi = GroupElement::batch_mul_with_endomorphism(
std::span{ G_vec_local.begin() + static_cast<long>(round_size),
G_vec_local.begin() + static_cast<long>(round_size * 2) },
round_challenge);

// Update the vectors a_vec, b_vec and G_vec.
// a_vec_next = a_vec_lo * round_challenge + a_vec_hi * round_challenge_inv
// b_vec_next = b_vec_lo * round_challenge_inv + b_vec_hi * round_challenge
// G_vec_next = G_vec_lo * round_challenge_inv + G_vec_hi * round_challenge
for (size_t j = 0; j < round_size; j++) {
a_vec[j] *= round_challenge;
a_vec[j] += round_challenge_inv * a_vec[round_size + j];
b_vec[j] *= round_challenge_inv;
b_vec[j] += round_challenge * b_vec[round_size + j];

G_vec_local[j] = G_lo[j] + G_hi[j];
}
run_loop_in_parallel(
round_size,
[&a_vec, &b_vec, &G_vec_local, &G_lo, &G_hi, round_challenge, round_challenge_inv, round_size](
size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
a_vec[j] *= round_challenge;
a_vec[j] += round_challenge_inv * a_vec[round_size + j];
b_vec[j] *= round_challenge_inv;
b_vec[j] += round_challenge * b_vec[round_size + j];

G_vec_local[j] = G_lo[j] + G_hi[j];
}
},
/*no_multhreading_if_less_or_equal=*/4);
}

transcript->send_to_verifier("IPA:a_0", a_vec[0]);
Expand Down Expand Up @@ -166,7 +196,7 @@ template <typename Curve> class IPA {
msm_scalars[2 * i] = round_challenges[i].sqr();
msm_scalars[2 * i + 1] = round_challenges_inv[i].sqr();
}
// TODO(#473)

GroupElement LR_sums = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&msm_scalars[0], &msm_elements[0], pippenger_size, vk->pippenger_runtime_state);
GroupElement C_zero = C_prime + LR_sums;
Expand All @@ -188,29 +218,42 @@ template <typename Curve> class IPA {
// Compute G_zero
// First construct s_vec
std::vector<Fr> s_vec(poly_degree);
for (size_t i = 0; i < poly_degree; i++) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
} else {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
run_loop_in_parallel(
poly_degree,
[&s_vec, &round_challenges, &round_challenges_inv, log_poly_degree](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
} else {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
}
}
s_vec[i] = s_vec_scalar;
}
}
s_vec[i] = s_vec_scalar;
}
},
/*no_multhreading_if_less_or_equal=*/4);

auto srs_elements = vk->srs->get_monomial_points();

// Copy the G_vector to local memory.
std::vector<Commitment> G_vec_local(poly_degree);

// The SRS stored in the commitment key is the result after applying the pippenger point table so the
// values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
// G_vec_local should use only the original SRS thus we extract only the even indices.
for (size_t i = 0; i < poly_degree * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
// TODO(#473)
run_loop_in_parallel(
poly_degree,
[&G_vec_local, srs_elements](size_t start, size_t end) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
},
/*no_multhreading_if_less_or_equal=*/16);

auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state);

Expand Down
41 changes: 41 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,44 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func
#endif
#endif
}

/**
* @brief Split a loop into several loops running in parallel
*
* @details Splits the num_points into appropriate number of chunks to do parallel processing on and calls the function
* that should contain the work loop
* @param num_points Total number of elements
* @param func A function or lambda expression with a for loop inside, for example:
* [](size_t start, size_t end){for (size_t i=start; i<end; i++){(void)i;}}
* @param no_multhreading_if_less_or_equal If num points is less or equal to this value, run without parallelization
*
*/
void run_loop_in_parallel(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t no_multhreading_if_less_or_equal)
{
if (num_points <= no_multhreading_if_less_or_equal) {
func(0, num_points);
return;
}
// Get number of cpus we can split into
const size_t num_cpus = get_num_cpus();

// Compute the size of a single chunk
const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
// Parallelize over chunks
parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) {
// If num_points is small, sometimes we need fewer CPUs
if (chunk_size * chunk_index > num_points) {
return;
}
// Compute the current chunk size (can differ in case it's the last chunk)
size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size);
if (current_chunk_size == 0) {
return;
}
size_t start = chunk_index * chunk_size;
size_t end = chunk_index * chunk_size + current_chunk_size;
func(start, end);
});
};
3 changes: 3 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ inline size_t get_num_cpus_pow2()
}

void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
void run_loop_in_parallel(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t no_multhreading_if_less_or_equal = 0);
2 changes: 1 addition & 1 deletion barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ template <class Fq, class Fr, class Params> class alignas(32) element {

static void batch_normalize(element* elements, size_t num_elements) noexcept;
static std::vector<affine_element<Fq, Fr, Params>> batch_mul_with_endomorphism(
const std::vector<affine_element<Fq, Fr, Params>>& points, const Fr& exponent) noexcept;
const std::span<affine_element<Fq, Fr, Params>>& points, const Fr& exponent) noexcept;

Fq x;
Fq y;
Expand Down
Loading