From 9f44311a2e77db731470601943d28c8a3f1331b2 Mon Sep 17 00:00:00 2001 From: Rumata888 Date: Wed, 10 Jan 2024 13:19:18 +0000 Subject: [PATCH 1/6] wip --- .../src/barretenberg/benchmark/CMakeLists.txt | 4 +- .../benchmark/parallel_bench/CMakeLists.txt | 18 ++ .../benchmark/parallel_bench/analysis.py | 25 ++ .../parallel_bench/parallel.bench.cpp | 221 ++++++++++++++++++ 4 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/CMakeLists.txt create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analysis.py create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/parallel.bench.cpp diff --git a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt index 968acb82531..561b26af50b 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt +++ b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt @@ -1,9 +1,11 @@ -add_subdirectory(ipa_bench) add_subdirectory(decrypt_bench) +add_subdirectory(ipa_bench) add_subdirectory(pippenger_bench) add_subdirectory(plonk_bench) add_subdirectory(ultra_bench) add_subdirectory(goblin_bench) +add_subdirectory(honk_bench) +add_subdirectory(parallel_bench) add_subdirectory(relations_bench) add_subdirectory(widgets_bench) add_subdirectory(protogalaxy_bench) \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/CMakeLists.txt new file mode 100644 index 00000000000..0f9b1356967 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/CMakeLists.txt @@ -0,0 +1,18 @@ +# Each source represents a separate benchmark suite +set(BENCHMARK_SOURCES + parallel.bench.cpp +) + +# Required libraries for benchmark suites +set(LINKED_LIBRARIES + benchmark::benchmark + ecc +) + +# Add executable and custom target for each suite, e.g. 
ultra_honk_bench +foreach(BENCHMARK_SOURCE ${BENCHMARK_SOURCES}) + get_filename_component(BENCHMARK_NAME ${BENCHMARK_SOURCE} NAME_WE) # extract name without extension + add_executable(${BENCHMARK_NAME}_bench ${BENCHMARK_SOURCE}) + target_link_libraries(${BENCHMARK_NAME}_bench ${LINKED_LIBRARIES}) + add_custom_target(run_${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +endforeach() \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analysis.py b/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analysis.py new file mode 100644 index 00000000000..977cd417563 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analysis.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +import sys + +filename=sys.argv[1] +lines=[] +header_found=False +x_exponents=[] +with open (filename) as f: + for line in f: + if line.find("name,iterations,real_time,cpu_time,time_unit,bytes_per_second,items_per_second,label,error_occurred,error_message")!=-1: + header_found=True + lines.append(line) + continue + if header_found: + lines.append(line) + x_exponents.append(int(line.replace('"','').split(',')[0].split('/')[1])) +with open(filename,"w") as f: + f.writelines(lines) +import numpy as np +data=np.genfromtxt(filename,delimiter=",",usemask=True) +y=np.transpose(data[1:])[2] +x=np.array([1< + +using namespace benchmark; +using namespace barretenberg; +namespace { +using Curve = curve::BN254; +using Fr = Curve::ScalarField; +#define MAX_REPETITION_LOG 12 + +/** + * @brief Benchmark for evaluating the cost of starting parallel_for + * + * @details It seems parallel_for takes ~400 microseconds to start + * @param state + */ +void parallel_for_field_element_addition(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + size_t num_cpus = get_num_cpus(); + std::vector> copy_vector(num_cpus); + for (size_t i = 0; i < num_cpus; i++) { + for (size_t j = 0; j < 2; j++) { + copy_vector[i].emplace_back(Fr::random_element(&engine)); + copy_vector[i].emplace_back(Fr::random_element(&engine)); + } + } + for (auto _ : state) { + state.PauseTiming(); + size_t num_external_cycles = 1 << static_cast(state.range(0)); + size_t num_internal_cycles = 1 << (MAX_REPETITION_LOG - static_cast(state.range(0))); + state.ResumeTiming(); + for (size_t i = 0; i < num_external_cycles; i++) { + parallel_for(num_cpus, [num_internal_cycles, ©_vector](size_t index) { + for (size_t i = 0; i < num_internal_cycles; i++) { + copy_vector[index][i & 1] += copy_vector[index][1 - (i & 1)]; + } + }); + } + } +} + +/** + * @brief Evaluate how much finite addition costs (in cache) + * + *@details ~4 ns if we subtract i++ operation + * @param state + */ +void ff_addition(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + std::vector copy_vector(2); + for (size_t j = 0; j < 2; j++) { + copy_vector.emplace_back(Fr::random_element(&engine)); + copy_vector.emplace_back(Fr::random_element(&engine)); + } + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + copy_vector[i & 1] += copy_vector[1 - (i & 1)]; + } + } +} + +/** + * @brief Evaluate how much finite field multiplication costs (in cache) + * + *@details ~25 ns if we subtract i++ operation + * @param state + */ +void ff_multiplication(State& state) +{ + numeric::random::Engine& engine = 
numeric::random::get_debug_engine(); + std::vector copy_vector(2); + for (size_t j = 0; j < 2; j++) { + copy_vector.emplace_back(Fr::random_element(&engine)); + copy_vector.emplace_back(Fr::random_element(&engine)); + } + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + copy_vector[i & 1] *= copy_vector[1 - (i & 1)]; + } + } +} + +/** + * @brief Evaluate how much finite field squaring costs (in cache) + * + *@details ~19 ns if we subtract i++ operation + * @param state + */ +void ff_sqr(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + std::vector copy_vector(2); + for (size_t j = 0; j < 2; j++) { + copy_vector.emplace_back(Fr::random_element(&engine)); + copy_vector.emplace_back(Fr::random_element(&engine)); + } + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + copy_vector[0] = copy_vector[0].sqr(); + } + } +} + +/** + * @brief Evaluate how much projective point addition costs (in cache) + * + *@details ~350 ns if we subtract i++ operation + * @param state + */ +void projective_point_addition(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + std::vector copy_vector(2); + for (size_t j = 0; j < 2; j++) { + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + } + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + copy_vector[i & 1] += copy_vector[1 - (i & 1)]; + } + } +} + +/** + * @brief Evaluate how much projective point doubling costs when we trigger it through addition (in cache) + * + *@details ~354 ns if we subtract i++ operation + * @param state + */ +void projective_point_accidental_doubling(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + std::vector copy_vector(2); + for (size_t j = 0; j < 2; j++) { + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + } + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + copy_vector[0] += copy_vector[0]; + } + } +} + +/** + * @brief Evaluate how much projective point doubling costs (in cache) + * + *@details ~195 ns if we subtract i++ operation + * @param state + */ +void projective_point_doubling(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + std::vector copy_vector(2); + for (size_t j = 0; j < 2; j++) { + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + } + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + copy_vector[0] = copy_vector[0].dbl(); + } + } +} +/** + * @brief Evaluate how much running the loop costs in benchmarks + * + * @details 0.6~0.7 ns per cycle + * @param state + */ +void cycle_waste(State& state) +{ + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << 
static_cast(state.range(0)); + state.ResumeTiming(); + for (volatile size_t i = 0; i < num_cycles;) { + i = i + 1; + } + } +} +} // namespace + +BENCHMARK(parallel_for_field_element_addition)->Unit(kMicrosecond)->DenseRange(0, MAX_REPETITION_LOG); +BENCHMARK(ff_addition)->Unit(kMicrosecond)->DenseRange(12, 30); +BENCHMARK(ff_multiplication)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(ff_sqr)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(projective_point_addition)->Unit(kMicrosecond)->DenseRange(12, 22); +BENCHMARK(projective_point_accidental_doubling)->Unit(kMicrosecond)->DenseRange(12, 22); +BENCHMARK(projective_point_doubling)->Unit(kMicrosecond)->DenseRange(12, 22); +BENCHMARK(cycle_waste)->Unit(kMicrosecond)->DenseRange(20, 30); +BENCHMARK_MAIN(); \ No newline at end of file From 481560dfae709707c593de7617299e4f51a65454 Mon Sep 17 00:00:00 2001 From: Rumata888 Date: Fri, 12 Jan 2024 14:18:48 +0000 Subject: [PATCH 2/6] benchmark update --- .../parallel_bench/analyse_all_benchmarks.py | 71 ++++++++ .../benchmark/parallel_bench/analysis.py | 25 --- .../parallel_bench/parallel.bench.cpp | 163 +++++++++++++++++- .../single_benchmark_analysis.py | 37 ++++ 4 files changed, 269 insertions(+), 27 deletions(-) create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analyse_all_benchmarks.py delete mode 100644 barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analysis.py create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/single_benchmark_analysis.py diff --git a/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analyse_all_benchmarks.py b/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analyse_all_benchmarks.py new file mode 100644 index 00000000000..7b8270c1064 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analyse_all_benchmarks.py @@ -0,0 +1,71 @@ +import argparse +import subprocess +import tempfile +from single_benchmark_analysis import evaluate_benchmark_from_file +import os +filter_rules={ + "sequential_copy":"cycle_waste", + "cycle_waste":None, + "parallel_for_field_element_addition:":None, + "ff_addition":"cycle_waste", + "ff_multiplication":"cycle_waste", + "ff_sqr":"cycle_waste", + "ff_invert":"ff_addition", + "ff_to_montgomery":"cycle_waste", + "ff_from_montgomery":"cycle_waste", + "ff_reduce":"ff_addition", + "projective_point_addition":"cycle_waste", + "projective_point_accidental_doubling":"cycle_waste", + "projective_point_doubling":"cycle_waste", + "scalar_multiplication":"ff_addition", +} +def get_benchmarks(filename): + result=subprocess.run([filename,"--benchmark_list_tests"],capture_output=True) + result.check_returncode() + output_lines=result.stdout.splitlines() + benchmark_names=set([x.decode().split('/')[0] for x in output_lines]) + return sorted(list(benchmark_names)) + +def run_benchmarks(filename,bnames): + benchmark_results=dict() + for bname in bnames: + output_file=tempfile.mktemp() + result=subprocess.run([filename,f"--benchmark_filter={bname}.*",f"--benchmark_out={output_file}","--benchmark_out_format=csv"]) + result.check_returncode() + benchmark_result=evaluate_benchmark_from_file(output_file)*1000 + benchmark_results[bname]=benchmark_result + print (f"Benchmark {bname} unfiltered: {benchmark_result} ns") + os.remove(output_file) + + return benchmark_results + +def filter_benchmarks(benchmark_results): + global filter_rules + print ("Filtered benchmark results:") + max_len=0 + for bname in sorted(benchmark_results.keys()): + if 
len(bname)>max_len: + max_len=len(bname) + for bname in sorted(benchmark_results.keys()): + if bname not in filter_rules.keys() or filter_rules[bname]==None: + print(f"\t{bname}:{' '*(max_len-len(bname))}\t{benchmark_results[bname]:.1f}") + else: + print(f"\t{bname}:{' '*(max_len-len(bname))}\t{benchmark_results[bname]-benchmark_results[filter_rules[bname]]:.1f}") + +if __name__=="__main__": + parser=argparse.ArgumentParser(description='Run all the individual benchmarks',epilog='This expects a single file with a single type of benchmark /i') + parser.add_argument("-f","--file",dest="filename",required=True,help="run benchmark FILE", metavar="FILE") + args=parser.parse_args() + filename=args.filename + if filename==None: + parser.print_help() + exit() + benchmark_names=get_benchmarks(filename) + print("Will run the following benchmarks:") + for bname in benchmark_names: + print(f'\t{bname}') + unfiltered_results=run_benchmarks(filename,benchmark_names) + filter_benchmarks(unfiltered_results) + + + \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analysis.py b/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analysis.py deleted file mode 100644 index 977cd417563..00000000000 --- a/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analysis.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/python3 -import sys - -filename=sys.argv[1] -lines=[] -header_found=False -x_exponents=[] -with open (filename) as f: - for line in f: - if line.find("name,iterations,real_time,cpu_time,time_unit,bytes_per_second,items_per_second,label,error_occurred,error_message")!=-1: - header_found=True - lines.append(line) - continue - if header_found: - lines.append(line) - x_exponents.append(int(line.replace('"','').split(',')[0].split('/')[1])) -with open(filename,"w") as f: - f.writelines(lines) -import numpy as np -data=np.genfromtxt(filename,delimiter=",",usemask=True) -y=np.transpose(data[1:])[2] -x=np.array([1< @@ -69,7 +90,7 @@ void ff_addition(State& state) /** * @brief Evaluate how much finite field multiplication costs (in cache) * - *@details ~25 ns if we subtract i++ operation + *@details ~21 ns if we subtract i++ operation * @param state */ void ff_multiplication(State& state) @@ -94,7 +115,7 @@ void ff_multiplication(State& state) /** * @brief Evaluate how much finite field squaring costs (in cache) * - *@details ~19 ns if we subtract i++ operation + *@details ~18 ns if we subtract i++ operation * @param state */ void ff_sqr(State& state) @@ -116,6 +137,89 @@ void ff_sqr(State& state) } } +/** + * @brief Evaluate how much finite field inversion costs (in cache) + * + *@details ~7100 ns if we subtract addition and i++ operation + * @param state + */ +void ff_invert(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + auto element = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = (element + Fr::one()).invert(); + } + } +} + +/** + * @brief Evaluate how much conversion to montgomery costs (in cache) + * + *@details ~39 ns if we subtract i++ operation + * @param state + */ +void ff_to_montgomery(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + auto element = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + 
state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = element.to_montgomery_form(); + } + } +} +/** + * @brief Evaluate how much conversion from montgomery costs (in cache) + * + *@details ~19 ns if we subtract i++ operation + * @param state + */ +void ff_from_montgomery(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + auto element = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = element.from_montgomery_form(); + } + } +} + +/** + * @brief Evaluate how much reduction costs (in cache) + * + *@details ~5 ns if we subtract addition and i++ operation + * @param state + */ +void ff_reduce(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + auto element = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = (element + element).reduce_once(); + } + } +} + /** * @brief Evaluate how much projective point addition costs (in cache) * @@ -190,6 +294,29 @@ void projective_point_doubling(State& state) } } } + +/** + * @brief Evaluate how much scalar multiplication costs (in cache) + * + *@details ~50000 ns + * @param state + */ +void scalar_multiplication(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + Curve::Element element = Curve::Element::random_element(&engine); + Fr scalar = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = element * scalar; + scalar += scalar; + } + } +} /** * @brief Evaluate how much running the loop costs in benchmarks * @@ -208,14 +335,46 @@ void cycle_waste(State& state) } } } + +/** + * @brief Evaluate how much copying memory for large vectors costs + * + * @details 5 ns per cycle + * @param state + */ +void sequential_copy(State& state) +{ + + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + std::vector input(num_cycles); + for (size_t i = 0; i < num_cycles; i++) { + *(uint256_t*)&input[i] = engine.get_random_uint256(); + } + std::vector output(num_cycles); + + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + output[i] = input[i]; + } + } +} } // namespace BENCHMARK(parallel_for_field_element_addition)->Unit(kMicrosecond)->DenseRange(0, MAX_REPETITION_LOG); BENCHMARK(ff_addition)->Unit(kMicrosecond)->DenseRange(12, 30); BENCHMARK(ff_multiplication)->Unit(kMicrosecond)->DenseRange(12, 27); BENCHMARK(ff_sqr)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(ff_invert)->Unit(kMicrosecond)->DenseRange(12, 19); +BENCHMARK(ff_to_montgomery)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(ff_from_montgomery)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(ff_reduce)->Unit(kMicrosecond)->DenseRange(12, 29); BENCHMARK(projective_point_addition)->Unit(kMicrosecond)->DenseRange(12, 22); BENCHMARK(projective_point_accidental_doubling)->Unit(kMicrosecond)->DenseRange(12, 22); BENCHMARK(projective_point_doubling)->Unit(kMicrosecond)->DenseRange(12, 22); 
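+// scalar_multiplication costs ~50 us per operation, so its repetition range is
+// capped at 2^18 to keep its runtime comparable to the cheaper suites above.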
+BENCHMARK(scalar_multiplication)->Unit(kMicrosecond)->DenseRange(12, 18); BENCHMARK(cycle_waste)->Unit(kMicrosecond)->DenseRange(20, 30); +BENCHMARK(sequential_copy)->Unit(kMicrosecond)->DenseRange(20, 25); BENCHMARK_MAIN(); \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/single_benchmark_analysis.py b/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/single_benchmark_analysis.py new file mode 100644 index 00000000000..6c390043cf2 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/single_benchmark_analysis.py @@ -0,0 +1,37 @@ +#!/usr/bin/python3 + +import numpy as np +import argparse +from io import StringIO + +def evaluate_benchmark_from_file(filename): + lines=[] + header_found=False + x_exponents=[] + # Google benchmarks have a few extra lines at the start, so we need to skip them + with open (filename) as f: + for line in f: + if line.find("name,iterations,real_time,cpu_time,time_unit,bytes_per_second,items_per_second,label,error_occurred,error_message")!=-1: + header_found=True + lines.append(line) + continue + if header_found: + lines.append(line) + x_exponents.append(int(line.replace('"','').split(',')[0].split('/')[1])) + + data=np.genfromtxt(StringIO('\n'.join(lines)),delimiter=",",usemask=True) + y=np.transpose(data[1:])[2] + x=np.array([1< Date: Fri, 12 Jan 2024 14:19:24 +0000 Subject: [PATCH 3/6] Update IPA --- .../commitment_schemes/ipa/ipa.hpp | 43 +++++++--- .../cpp/src/barretenberg/common/thread.cpp | 73 +++++++++++++++++ .../cpp/src/barretenberg/common/thread.hpp | 11 ++- .../barretenberg/ecc/groups/element_impl.hpp | 80 ++++++++++++++----- 4 files changed, 175 insertions(+), 32 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index 460f5d6dc49..49ff8aa7a70 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -56,17 +56,23 @@ template class IPA { // The SRS stored in the commitment key is the result after applying the pippenger point table so the // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism // G_vec_local should use only the original SRS thus we extract only the even indices. - run_loop_in_parallel( + run_loop_in_parallel_if_effective( poly_degree, [&G_vec_local, srs_elements](size_t start, size_t end) { for (size_t i = start * 2; i < end * 2; i += 2) { G_vec_local[i >> 1] = srs_elements[i]; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); std::vector b_vec(poly_degree); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( poly_degree, [&b_vec, &opening_pair](size_t start, size_t end) { Fr b_power = opening_pair.challenge.pow(start); @@ -75,7 +81,8 @@ template class IPA { b_power *= opening_pair.challenge; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/1); // Iterate for log(poly_degree) rounds to compute the round commitments. 
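 // Each round halves a_vec, b_vec and G_vec; the parallel scalar product below
 // yields the cross terms that become the round commitments L_i and R_i.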
auto log_poly_degree = static_cast(numeric::get_msb(poly_degree)); @@ -91,7 +98,7 @@ template class IPA { Fr inner_prod_L = Fr::zero(); Fr inner_prod_R = Fr::zero(); // Run scalar product in parallel - run_loop_in_parallel( + run_loop_in_parallel_if_effective( round_size, [&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) { Fr current_inner_prod_L = Fr::zero(); @@ -105,7 +112,8 @@ template class IPA { inner_prod_R += current_inner_prod_R; addition_lock.unlock(); }, - /*no_multhreading_if_less_or_equal=*/8); + /*finite_field_additions_per_iteration=*/2, + /*finite_field_multiplications_per_iteration=*/2); // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( @@ -137,7 +145,7 @@ template class IPA { // a_vec_next = a_vec_lo * round_challenge + a_vec_hi * round_challenge_inv // b_vec_next = b_vec_lo * round_challenge_inv + b_vec_hi * round_challenge // G_vec_next = G_vec_lo * round_challenge_inv + G_vec_hi * round_challenge - run_loop_in_parallel( + run_loop_in_parallel_if_effective( round_size, [&a_vec, &b_vec, &G_vec_local, &G_lo, &G_hi, round_challenge, round_challenge_inv, round_size]( size_t start, size_t end) { @@ -146,11 +154,13 @@ template class IPA { a_vec[j] += round_challenge_inv * a_vec[round_size + j]; b_vec[j] *= round_challenge_inv; b_vec[j] += round_challenge * b_vec[round_size + j]; - + // TODO(kesha): This is affine and horrible, need to fix G_vec_local[j] = G_lo[j] + G_hi[j]; } }, - /*no_multhreading_if_less_or_equal=*/4); + /*finite_field_additions_per_iteration=*/4, + /*finite_field_multiplications_per_iteration=*/8, + /*finite_field_inversions_per_iteration=*/1); } transcript->send_to_verifier("IPA:a_0", a_vec[0]); @@ -218,7 +228,7 @@ template class IPA { // Compute G_zero // First construct s_vec std::vector s_vec(poly_degree); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( poly_degree, [&s_vec, &round_challenges, &round_challenges_inv, log_poly_degree](size_t start, size_t end) { for (size_t i = start; i < end; i++) { @@ -235,7 +245,8 @@ template class IPA { s_vec[i] = s_vec_scalar; } }, - /*no_multhreading_if_less_or_equal=*/4); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/log_poly_degree); auto srs_elements = vk->srs->get_monomial_points(); @@ -245,14 +256,20 @@ template class IPA { // The SRS stored in the commitment key is the result after applying the pippenger point table so the // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism // G_vec_local should use only the original SRS thus we extract only the even indices. 
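 // The loop body below is a pure sequential copy, so the only cost hint passed
 // to run_loop_in_parallel_if_effective is sequential_copy_ops_per_iteration.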
-        run_loop_in_parallel(
+        run_loop_in_parallel_if_effective(
             poly_degree,
             [&G_vec_local, srs_elements](size_t start, size_t end) {
                 for (size_t i = start * 2; i < end * 2; i += 2) {
                     G_vec_local[i >> 1] = srs_elements[i];
                 }
             },
-            /*no_multhreading_if_less_or_equal=*/16);
+            /*finite_field_additions_per_iteration=*/0,
+            /*finite_field_multiplications_per_iteration=*/0,
+            /*finite_field_inversions_per_iteration=*/0,
+            /*group_element_additions_per_iteration=*/0,
+            /*group_element_doublings_per_iteration=*/0,
+            /*scalar_multiplications_per_iteration=*/0,
+            /*sequential_copy_ops_per_iteration=*/1);
 
         auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points(
             &s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state);
diff --git a/barretenberg/cpp/src/barretenberg/common/thread.cpp b/barretenberg/cpp/src/barretenberg/common/thread.cpp
index 232c30be69d..3f6bd47cb3c 100644
--- a/barretenberg/cpp/src/barretenberg/common/thread.cpp
+++ b/barretenberg/cpp/src/barretenberg/common/thread.cpp
@@ -127,3 +127,76 @@ void run_loop_in_parallel(size_t num_points,
         func(start, end);
     });
 };
+
+/**
+ * @brief Split a loop into several loops running in parallel based on operations in 1 iteration
+ *
+ * @details Splits num_points into an appropriate number of chunks to do parallel processing on and calls the
+ * function that should contain the work loop, but only if it's worth it
+ * @param num_points Total number of elements
+ * @param func A function or lambda expression with a for loop inside, for example:
+ * [](size_t start, size_t end){for (size_t i=start; i<end; i++){(void)i;}}
+ * @param finite_field_additions_per_iteration
+ * @param finite_field_multiplications_per_iteration
+ * @param finite_field_inversions_per_iteration
+ * @param group_element_additions_per_iteration
+ * @param group_element_doublings_per_iteration
+ * @param scalar_multiplications_per_iteration
+ * @param sequential_copy_ops_per_iteration
+ */
+void run_loop_in_parallel_if_effective(size_t num_points,
+                                       const std::function<void(size_t, size_t)>& func,
+                                       size_t finite_field_additions_per_iteration,
+                                       size_t finite_field_multiplications_per_iteration,
+                                       size_t finite_field_inversions_per_iteration,
+                                       size_t group_element_additions_per_iteration,
+                                       size_t group_element_doublings_per_iteration,
+                                       size_t scalar_multiplications_per_iteration,
+                                       size_t sequential_copy_ops_per_iteration)
+{
+    // Rough cost of operations (in nanoseconds):
+    constexpr size_t FF_ADDITION_COST = 4;
+    constexpr size_t FF_MULTIPLICATION_COST = 21;
+    constexpr size_t FF_INVERSION_COST = 7000;
+    constexpr size_t GE_ADDITION_COST = 350;
+    constexpr size_t GE_DOUBLING_COST = 194;
+    constexpr size_t SM_COST = 50000;
+    constexpr size_t SEQ_COPY_COST = 3;
+    constexpr size_t PARALLEL_FOR_COST = 376000;
+    // Get number of cpus we can split into
+    const size_t num_cpus = get_num_cpus();
+
+    // Compute the size of a single chunk
+    const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
+
+    // Compute the cost of all operations done by other threads
+    const size_t offset_cost =
+        (num_points - chunk_size) *
+        (finite_field_additions_per_iteration * FF_ADDITION_COST +
+         finite_field_multiplications_per_iteration * FF_MULTIPLICATION_COST +
+         finite_field_inversions_per_iteration * FF_INVERSION_COST +
+         group_element_additions_per_iteration * GE_ADDITION_COST +
+         group_element_doublings_per_iteration * GE_DOUBLING_COST + scalar_multiplications_per_iteration * SM_COST +
+         sequential_copy_ops_per_iteration * SEQ_COPY_COST);
+
+    // If starting parallel for is longer than computing, just compute
+    if (offset_cost < PARALLEL_FOR_COST) {
+        func(0, num_points);
+        return;
+    }
+    // Parallelize over chunks
+    parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) {
+        // If num_points is small, sometimes we need fewer CPUs
+        if (chunk_size * chunk_index > num_points) {
+            return;
+        }
+        // Compute the current chunk size (can differ in case it's the last chunk)
+        size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size);
+        if (current_chunk_size == 0) {
+            return;
+        }
+        size_t start = chunk_index * chunk_size;
+        size_t end = chunk_index * chunk_size + current_chunk_size;
+        func(start, end);
+    });
+};
\ No newline at end of file
diff --git a/barretenberg/cpp/src/barretenberg/common/thread.hpp b/barretenberg/cpp/src/barretenberg/common/thread.hpp
index fe799e1d46f..787f0313d9e 100644
--- a/barretenberg/cpp/src/barretenberg/common/thread.hpp
+++ b/barretenberg/cpp/src/barretenberg/common/thread.hpp
@@ -25,4 +25,13 @@ inline size_t get_num_cpus_pow2()
 void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
 void run_loop_in_parallel(size_t num_points,
                           const std::function<void(size_t, size_t)>& func,
-                          size_t no_multhreading_if_less_or_equal = 0);
\ No newline at end of file
+                          size_t no_multhreading_if_less_or_equal = 0);
+void run_loop_in_parallel_if_effective(size_t num_points,
+                                       const std::function<void(size_t, size_t)>& func,
+                                       size_t finite_field_additions_per_iteration = 0,
+                                       size_t finite_field_multiplications_per_iteration = 0,
+                                       size_t finite_field_inversions_per_iteration = 0,
+                                       size_t group_element_additions_per_iteration = 0,
+                                       size_t group_element_doublings_per_iteration = 0,
+                                       size_t scalar_multiplications_per_iteration = 0,
+                                       size_t sequential_copy_ops_per_iteration = 0);
\ No newline at end of file
diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
index da98fb74f8f..afd666247d7 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
@@ -745,12 +745,13 @@ std::vector<affine_element<Fq, Fr, T>> element<Fq, Fr, T>::batch_mul_with_endomo
      */
     const auto batch_affine_add = [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs,
                                                                                           affine_element* rhs) {
-        run_loop_in_parallel(
+        run_loop_in_parallel_if_effective(
             num_points,
             [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) {
                 batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start);
             },
-            /*no_multhreading_if_less_or_equal=*/4);
+            /*finite_field_additions_per_iteration=*/6,
+            /*finite_field_multiplications_per_iteration=*/6);
     };
 
     /**
@@ -789,12 +790,13 @@ std::vector<affine_element<Fq, Fr, T>> element<Fq, Fr, T>::batch_mul_with_endomo
      *
      */
     const auto batch_affine_double = [num_points, &scratch_space, &batch_affine_double_chunked](affine_element* lhs) {
-        run_loop_in_parallel(
+        run_loop_in_parallel_if_effective(
             num_points,
[&lhs, &scratch_space, &batch_affine_double_chunked](size_t start, size_t end) { batch_affine_double_chunked(lhs + start, end - start, &scratch_space[0] + start); }, - /*no_multhreading_if_less_or_equal=*/4); + /*finite_field_additions_per_iteration=*/7, + /*finite_field_multiplications_per_iteration=*/6); }; // Compute wnaf for scalar const Fr converted_scalar = exponent.from_montgomery_form(); @@ -804,14 +806,20 @@ std::vector> element::batch_mul_with_endomo affine_element result{ Fq::zero(), Fq::zero() }; result.self_set_infinity(); std::vector results(num_points); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [&results, result](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { results[i] = result; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); return results; } @@ -824,7 +832,7 @@ std::vector> element::batch_mul_with_endomo } // Initialize first etnries in lookup table std::vector temp_point_vector(num_points); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [&temp_point_vector, &lookup_table, &points](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { @@ -832,19 +840,31 @@ std::vector> element::batch_mul_with_endomo lookup_table[0][i] = points[i]; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/2); // Construct lookup table batch_affine_double(&temp_point_vector[0]); for (size_t j = 1; j < lookup_size; ++j) { - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [j, &lookup_table](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { lookup_table[j][i] = lookup_table[j - 1][i]; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); batch_affine_add(&temp_point_vector[0], &lookup_table[j][0]); } @@ -873,7 +893,7 @@ std::vector> element::batch_mul_with_endomo index = wnaf_entry & 0x0fffffffU; sign = static_cast((wnaf_entry >> 31) & 1); const bool is_odd = ((j & 1) == 1); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [j, index, is_odd, sign, beta, &lookup_table, &work_elements, &temp_point_vector](size_t start, size_t end) { @@ -891,7 +911,13 @@ std::vector> element::batch_mul_with_endomo } } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/1, + /*finite_field_multiplications_per_iteration=*/is_odd ? 
1 : 0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); } // First cycle of addition batch_affine_add(&temp_point_vector[0], &work_elements[0]); @@ -906,7 +932,7 @@ std::vector> element::batch_mul_with_endomo batch_affine_double(&work_elements[0]); } } - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [index, is_odd, sign, beta, &lookup_table, &temp_point_vector](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { @@ -919,13 +945,19 @@ std::vector> element::batch_mul_with_endomo temp_point_vector[i] = to_add; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/1, + /*finite_field_multiplications_per_iteration=*/is_odd ? 1 : 0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); batch_affine_add(&temp_point_vector[0], &work_elements[0]); } // Apply skew for the first endo scalar if (skew) { - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [&lookup_table, &temp_point_vector](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { @@ -933,12 +965,18 @@ std::vector> element::batch_mul_with_endomo temp_point_vector[i] = -lookup_table[0][i]; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); batch_affine_add(&temp_point_vector[0], &work_elements[0]); } // Apply skew for the second endo scalar if (endo_skew) { - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [beta, &lookup_table, &temp_point_vector](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { @@ -947,7 +985,13 @@ std::vector> element::batch_mul_with_endomo temp_point_vector[i].x *= beta; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/1, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); batch_affine_add(&temp_point_vector[0], &work_elements[0]); } From 8451b461c06fc2364a7cb97e1a5dfffaddaae53d Mon Sep 17 00:00:00 2001 From: Rumata888 Date: Fri, 12 Jan 2024 15:45:08 +0000 Subject: [PATCH 4/6] Batch affine in IPA --- .../commitment_schemes/ipa/ipa.hpp | 6 +- .../cpp/src/barretenberg/common/thread.cpp | 6 +- .../src/barretenberg/ecc/groups/element.hpp | 3 + .../barretenberg/ecc/groups/element_impl.hpp | 118 +++++++++++++++--- 4 files changed, 109 insertions(+), 24 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index 49ff8aa7a70..6ef30e0d296 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -147,20 +147,18 @@ template class IPA { // 
G_vec_next = G_vec_lo * round_challenge_inv + G_vec_hi * round_challenge run_loop_in_parallel_if_effective( round_size, - [&a_vec, &b_vec, &G_vec_local, &G_lo, &G_hi, round_challenge, round_challenge_inv, round_size]( - size_t start, size_t end) { + [&a_vec, &b_vec, round_challenge, round_challenge_inv, round_size](size_t start, size_t end) { for (size_t j = start; j < end; j++) { a_vec[j] *= round_challenge; a_vec[j] += round_challenge_inv * a_vec[round_size + j]; b_vec[j] *= round_challenge_inv; b_vec[j] += round_challenge * b_vec[round_size + j]; - // TODO(kesha): This is affine and horrible, need to fix - G_vec_local[j] = G_lo[j] + G_hi[j]; } }, /*finite_field_additions_per_iteration=*/4, /*finite_field_multiplications_per_iteration=*/8, /*finite_field_inversions_per_iteration=*/1); + GroupElement::batch_affine_add(G_lo, G_hi, G_vec_local); } transcript->send_to_verifier("IPA:a_0", a_vec[0]); diff --git a/barretenberg/cpp/src/barretenberg/common/thread.cpp b/barretenberg/cpp/src/barretenberg/common/thread.cpp index 3f6bd47cb3c..7fdaa65db6d 100644 --- a/barretenberg/cpp/src/barretenberg/common/thread.cpp +++ b/barretenberg/cpp/src/barretenberg/common/thread.cpp @@ -139,10 +139,10 @@ void run_loop_in_parallel(size_t num_points, * @param finite_field_additions_per_iteration * @param finite_field_multiplications_per_iteration * @param finite_field_inversions_per_iteration - * @param group_element_additions_per_iteration - * @param group_element_doublings_per_iteration + * @param group_element_additions_per_iteration Projective addition number + * @param group_element_doublings_per_iteration Projective doubling number * @param scalar_multiplications_per_iteration - * @param sequential_copy_ops_per_iteration + * @param sequential_copy_ops_per_iteration Field element (16 byte) sequential copy number */ void run_loop_in_parallel_if_effective(size_t num_points, const std::function& func, diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp index 420b4649856..22477f8352e 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp @@ -91,6 +91,9 @@ template class alignas(32) element { BBERG_INLINE constexpr bool operator==(const element& other) const noexcept; static void batch_normalize(element* elements, size_t num_elements) noexcept; + static void batch_affine_add(const std::span>& first_group, + const std::span>& second_group, + const std::span>& results) noexcept; static std::vector> batch_mul_with_endomorphism( const std::span>& points, const Fr& exponent) noexcept; diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp index afd666247d7..a679fa6152c 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp @@ -688,6 +688,89 @@ element element::mul_with_endomorphism(const Fr& exponent) return work_element; } +/** + * @brief Pairwise affine add points in first and second group + * + * @param first_group + * @param second_group + * @param results + */ +template +void element::batch_affine_add(const std::span>& first_group, + const std::span>& second_group, + const std::span>& results) noexcept +{ + typedef affine_element affine_element; + const size_t num_points = first_group.size(); + ASSERT(second_group.size() == first_group.size()); + + // Space for temporary values + 
std::vector scratch_space(num_points); + + run_loop_in_parallel_if_effective( + num_points, + [&results, &first_group](size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + results[i] = first_group[i]; + } + }, + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/2); + + // TODO(#826): Same code as in batch mul + // we can mutate rhs but NOT lhs! + // output is stored in rhs + /** + * @brief Perform point addition rhs[i]=rhs[i]+lhs[i] with batch inversion + * + */ + const auto batch_affine_add_chunked = + [](const affine_element* lhs, affine_element* rhs, const size_t point_count, Fq* personal_scratch_space) { + Fq batch_inversion_accumulator = Fq::one(); + + for (size_t i = 0; i < point_count; i += 1) { + personal_scratch_space[i] = lhs[i].x + rhs[i].x; // x2 + x1 + rhs[i].x -= lhs[i].x; // x2 - x1 + rhs[i].y -= lhs[i].y; // y2 - y1 + rhs[i].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old + batch_inversion_accumulator *= (rhs[i].x); + } + batch_inversion_accumulator = batch_inversion_accumulator.invert(); + + for (size_t i = (point_count)-1; i < point_count; i -= 1) { + rhs[i].y *= batch_inversion_accumulator; // update accumulator + batch_inversion_accumulator *= rhs[i].x; + rhs[i].x = rhs[i].y.sqr(); + rhs[i].x = rhs[i].x - (personal_scratch_space[i]); // x3 = lambda_squared - x2 + // - x1 + personal_scratch_space[i] = lhs[i].x - rhs[i].x; + personal_scratch_space[i] *= rhs[i].y; + rhs[i].y = personal_scratch_space[i] - lhs[i].y; + } + }; + + /** + * @brief Perform batch affine addition in parallel + * + */ + const auto batch_affine_add_internal = + [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, affine_element* rhs) { + run_loop_in_parallel_if_effective( + num_points, + [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) { + batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start); + }, + /*finite_field_additions_per_iteration=*/6, + /*finite_field_multiplications_per_iteration=*/6); + }; + batch_affine_add_internal(&second_group[0], &results[0]); +} + /** * @brief Multiply each point by the same exponent * @@ -708,8 +791,9 @@ std::vector> element::batch_mul_with_endomo // Space for temporary values std::vector scratch_space(num_points); - // we can mutate rhs but NOT lhs! - // output is stored in rhs + // TODO(#826): Same code as in batch add + // we can mutate rhs but NOT lhs! 
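+    // (lhs must stay intact: the second, backward pass of the batch-inversion
+    // trick below reads lhs[i].x and lhs[i].y again)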
+ // output is stored in rhs /** * @brief Perform point addition rhs[i]=rhs[i]+lhs[i] with batch inversion * @@ -743,16 +827,16 @@ std::vector> element::batch_mul_with_endomo * @brief Perform batch affine addition in parallel * */ - const auto batch_affine_add = [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, - affine_element* rhs) { - run_loop_in_parallel_if_effective( - num_points, - [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) { - batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start); - }, - /*finite_field_additions_per_iteration=*/6, - /*finite_field_multiplications_per_iteration=*/6); - }; + const auto batch_affine_add_internal = + [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, affine_element* rhs) { + run_loop_in_parallel_if_effective( + num_points, + [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) { + batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start); + }, + /*finite_field_additions_per_iteration=*/6, + /*finite_field_multiplications_per_iteration=*/6); + }; /** * @brief Perform point doubling lhs[i]=lhs[i]+lhs[i] with batch inversion @@ -865,7 +949,7 @@ std::vector> element::batch_mul_with_endomo /*group_element_doublings_per_iteration=*/0, /*scalar_multiplications_per_iteration=*/0, /*sequential_copy_ops_per_iteration=*/1); - batch_affine_add(&temp_point_vector[0], &lookup_table[j][0]); + batch_affine_add_internal(&temp_point_vector[0], &lookup_table[j][0]); } uint64_t wnaf_table[num_rounds * 2]; @@ -920,7 +1004,7 @@ std::vector> element::batch_mul_with_endomo /*sequential_copy_ops_per_iteration=*/1); } // First cycle of addition - batch_affine_add(&temp_point_vector[0], &work_elements[0]); + batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); // Run through SM logic in wnaf form (excluding the skew) for (size_t j = 2; j < num_rounds * 2; ++j) { wnaf_entry = wnaf_table[j]; @@ -952,7 +1036,7 @@ std::vector> element::batch_mul_with_endomo /*group_element_doublings_per_iteration=*/0, /*scalar_multiplications_per_iteration=*/0, /*sequential_copy_ops_per_iteration=*/1); - batch_affine_add(&temp_point_vector[0], &work_elements[0]); + batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); } // Apply skew for the first endo scalar @@ -972,7 +1056,7 @@ std::vector> element::batch_mul_with_endomo /*group_element_doublings_per_iteration=*/0, /*scalar_multiplications_per_iteration=*/0, /*sequential_copy_ops_per_iteration=*/1); - batch_affine_add(&temp_point_vector[0], &work_elements[0]); + batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); } // Apply skew for the second endo scalar if (endo_skew) { @@ -992,7 +1076,7 @@ std::vector> element::batch_mul_with_endomo /*group_element_doublings_per_iteration=*/0, /*scalar_multiplications_per_iteration=*/0, /*sequential_copy_ops_per_iteration=*/1); - batch_affine_add(&temp_point_vector[0], &work_elements[0]); + batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); } return work_elements; From 8edeeb78da901b2e2adae6ce1f1a7c9343775d4c Mon Sep 17 00:00:00 2001 From: Rumata888 Date: Fri, 12 Jan 2024 20:02:17 +0000 Subject: [PATCH 5/6] Rename --- barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt | 3 +-- .../benchmark/{parallel_bench => basics_bench}/CMakeLists.txt | 2 +- .../{parallel_bench => basics_bench}/analyse_all_benchmarks.py | 0 .../parallel.bench.cpp => 
basics_bench/basics.bench.cpp} | 0 .../single_benchmark_analysis.py | 0 5 files changed, 2 insertions(+), 3 deletions(-) rename barretenberg/cpp/src/barretenberg/benchmark/{parallel_bench => basics_bench}/CMakeLists.txt (96%) rename barretenberg/cpp/src/barretenberg/benchmark/{parallel_bench => basics_bench}/analyse_all_benchmarks.py (100%) rename barretenberg/cpp/src/barretenberg/benchmark/{parallel_bench/parallel.bench.cpp => basics_bench/basics.bench.cpp} (100%) rename barretenberg/cpp/src/barretenberg/benchmark/{parallel_bench => basics_bench}/single_benchmark_analysis.py (100%) diff --git a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt index 561b26af50b..285f2bb5937 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt +++ b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt @@ -4,8 +4,7 @@ add_subdirectory(pippenger_bench) add_subdirectory(plonk_bench) add_subdirectory(ultra_bench) add_subdirectory(goblin_bench) -add_subdirectory(honk_bench) -add_subdirectory(parallel_bench) +add_subdirectory(basics_bench) add_subdirectory(relations_bench) add_subdirectory(widgets_bench) add_subdirectory(protogalaxy_bench) \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/CMakeLists.txt similarity index 96% rename from barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/CMakeLists.txt rename to barretenberg/cpp/src/barretenberg/benchmark/basics_bench/CMakeLists.txt index 0f9b1356967..d23c4f6597f 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/CMakeLists.txt +++ b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/CMakeLists.txt @@ -1,6 +1,6 @@ # Each source represents a separate benchmark suite set(BENCHMARK_SOURCES - parallel.bench.cpp + basics.bench.cpp ) # Required libraries for benchmark suites diff --git a/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analyse_all_benchmarks.py b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py similarity index 100% rename from barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/analyse_all_benchmarks.py rename to barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py diff --git a/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/parallel.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp similarity index 100% rename from barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/parallel.bench.cpp rename to barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp diff --git a/barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/single_benchmark_analysis.py b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py similarity index 100% rename from barretenberg/cpp/src/barretenberg/benchmark/parallel_bench/single_benchmark_analysis.py rename to barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py From d837bc894aa8e5b8e8f76f44872a3d848a970b52 Mon Sep 17 00:00:00 2001 From: Rumata888 Date: Sat, 13 Jan 2024 15:40:17 +0000 Subject: [PATCH 6/6] Address Luke's comments --- .../basics_bench/analyse_all_benchmarks.py | 17 +++++++++++++++++ .../benchmark/basics_bench/basics.bench.cpp | 6 ++++-- .../basics_bench/single_benchmark_analysis.py | 11 ++++++++++- .../cpp/src/barretenberg/common/thread.cpp | 12 
++++++++---- 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py index 7b8270c1064..5edebdb815c 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py +++ b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py @@ -1,8 +1,16 @@ +#!/usr/bin/python3 +""" +Tool for analysing several benchmarks from basics_bench to calculate operation timings +For example, in src directory: +python3 ../src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py -f bin/basics_bench +""" import argparse import subprocess import tempfile from single_benchmark_analysis import evaluate_benchmark_from_file import os + +# Some of the benchmarks use other operations to randomise the procedure, so we need to subtract the results filter_rules={ "sequential_copy":"cycle_waste", "cycle_waste":None, @@ -20,6 +28,9 @@ "scalar_multiplication":"ff_addition", } def get_benchmarks(filename): + """ + Get a list of benchmarks from the binary + """ result=subprocess.run([filename,"--benchmark_list_tests"],capture_output=True) result.check_returncode() output_lines=result.stdout.splitlines() @@ -27,6 +38,9 @@ def get_benchmarks(filename): return sorted(list(benchmark_names)) def run_benchmarks(filename,bnames): + """ + Run benchmarks for each type and collect results + """ benchmark_results=dict() for bname in bnames: output_file=tempfile.mktemp() @@ -40,6 +54,9 @@ def run_benchmarks(filename,bnames): return benchmark_results def filter_benchmarks(benchmark_results): + """ + Apply filtering rules and print the benchmarks + """ global filter_rules print ("Filtered benchmark results:") max_len=0 diff --git a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp index 3fa40f5e244..32f6082d3a8 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp @@ -11,7 +11,8 @@ * ff_reduce: 5.1 * ff_sqr: 17.9 * ff_to_montgomery: 39.1 - * parallel_for_field_element_addition: 376060.9 + * parallel_for_field_element_addition: 198000~388000 (The number is somewhat dependent on the number of cores + * used) * projective_point_accidental_doubling: 347.6 * projective_point_addition: 348.6 * projective_point_doubling: 194.2 @@ -33,7 +34,8 @@ using Fr = Curve::ScalarField; /** * @brief Benchmark for evaluating the cost of starting parallel_for * - * @details It seems parallel_for takes ~400 microseconds to start + * @details It seems parallel_for takes ~400 microseconds to start when we use all the cores. When it's just 1 it's 200 + * microseconds. 
The dependency is not exactly linear, so in code we use the largest value for convenience * @param state */ void parallel_for_field_element_addition(State& state) diff --git a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py index 6c390043cf2..3f039a21860 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py +++ b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py @@ -1,10 +1,18 @@ #!/usr/bin/python3 - +""" +Tool for analyzing a single benchmark file generated by basics_bench. Also used by "analyse_all_benchmarks.py" +For example, in src directory: +./bin/basics --benchmark_filter="par.*" --benchmark_out=parallel_for.csv --benchmark_out_format=csv +python3 ../src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py -f parallel_for.csv +""" import numpy as np import argparse from io import StringIO def evaluate_benchmark_from_file(filename): + """ + Take a benchmark file, remove waste and calculate the linear factor + """ lines=[] header_found=False x_exponents=[] @@ -20,6 +28,7 @@ def evaluate_benchmark_from_file(filename): x_exponents.append(int(line.replace('"','').split(',')[0].split('/')[1])) data=np.genfromtxt(StringIO('\n'.join(lines)),delimiter=",",usemask=True) + # Calculate the linear factor y=np.transpose(data[1:])[2] x=np.array([1<
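
As a self-contained illustration of the heuristic added in common/thread.cpp, the sketch below reproduces its decision rule in isolation. The constants are the nanosecond costs measured by basics_bench; worth_parallelising, the core count, and the two probe sizes are illustrative and not part of this patch:

    #include <cstddef>
    #include <cstdio>

    // Nanosecond cost estimates, matching the constants in common/thread.cpp.
    constexpr size_t FF_ADDITION_COST = 4;
    constexpr size_t FF_MULTIPLICATION_COST = 21;
    constexpr size_t PARALLEL_FOR_COST = 376000;

    // True when the work offloaded to other threads outweighs the ~376 us cost
    // of spinning up parallel_for (the same comparison thread.cpp performs).
    bool worth_parallelising(size_t num_points, size_t num_cpus, size_t adds_per_iter, size_t muls_per_iter)
    {
        const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
        const size_t offloaded_cost = (num_points - chunk_size) *
                                      (adds_per_iter * FF_ADDITION_COST + muls_per_iter * FF_MULTIPLICATION_COST);
        return offloaded_cost >= PARALLEL_FOR_COST;
    }

    int main()
    {
        // One field addition per iteration on 16 cores: parallelism only
        // starts paying off at roughly 10^5 points.
        std::printf("2^16 points: %d\n", worth_parallelising(1 << 16, 16, 1, 0)); // prints 0
        std::printf("2^17 points: %d\n", worth_parallelising(1 << 17, 16, 1, 0)); // prints 1
        return 0;
    }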