-
Notifications
You must be signed in to change notification settings - Fork 2
/
merklize.hpp
138 lines (115 loc) · 4.94 KB
/
merklize.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#pragma once
#include "blake3.hpp"
sycl::cl_ulong
merklize(sycl::queue& q,
sycl::uint* const __restrict leaf_nodes,
size_t i_size, // leaf nodes size in bytes
size_t leaf_cnt,
sycl::uint* const __restrict intermediates,
size_t o_size, // intermediate nodes size in bytes
size_t itmd_cnt,
size_t wg_size)
{
// A binary merkle tree with N -many leaf
// nodes should have (N - 1) -many intermediates
//
// Note N = power of 2
assert(leaf_cnt == itmd_cnt + 1);
// because each leaf node is 32 -bytes wide
assert(i_size == (leaf_cnt << 5));
// if a binary tree has N -many leaf nodes, where N = 2 ^ i | i = {1, 2, ...}
// then # -of intermediate nodes in that binary merkle tree should be =
// leaf_count - 1
assert(o_size == ((itmd_cnt + 1) << 5));
// both input and output allocation has same size
assert(i_size == o_size);
// only tree with power of 2 many leaf nodes
// can be merklized by this implementation
assert((leaf_cnt & (leaf_cnt - 1)) == 0);
// At first N -many leaf nodes to be merged into
// N/ 2 -many intermediate nodes, which are living
// just above leaf nodes
const size_t work_item_cnt = leaf_cnt >> 1;
// validate whether this work group size can be used for next kernel dispatch
assert(wg_size <= work_item_cnt);
// ensure all dispatched work groups have equal many
// active work-items
assert(work_item_cnt % wg_size == 0);
// # -of 32 -bit unsigned integers, which can be contiguously placed
// on output memory allocation
const size_t elm_cnt = o_size >> 2;
const size_t i_offset = 0;
const size_t o_offset = elm_cnt >> 1;
// computes all intermediate nodes which are living just above leaf nodes of
// binary merkle tree
sycl::event evt_0 = q.submit([&](sycl::handler& h) {
h.parallel_for<class kernelBinaryMerklizationUsingBLAKE3Phase0>(
sycl::nd_range<1>{ sycl::range<1>{ work_item_cnt },
sycl::range<1>{ wg_size } },
[=](sycl::nd_item<1> it) {
const size_t idx = it.get_global_linear_id();
// copy message words to array allocated on private memory
// because these message words are permuted, when `blake3::merge`
// function is invoked
sycl::uint msg_words[16];
#pragma unroll 16 // attempt to fully parallelize the loop !
for (size_t i = 0; i < 16; i++) {
msg_words[i] = *(leaf_nodes + i_offset + (idx << 4) + i);
}
blake3::v1::merge(msg_words, intermediates + o_offset + (idx << 3));
});
});
// these many kernel dispatch rounds still remaining
const size_t rounds =
static_cast<size_t>(sycl::log2(static_cast<double>(work_item_cnt)));
std::vector<sycl::event> evts_0;
// reserve enough space in vector so that all events obtained as result
// of enqueuing kernel execution commands, can be accomodated
evts_0.reserve(rounds + 1);
// storing event obtained as result of enqueuing first phase of kernel
// execution command, where leaf nodes are paired to generate all intermediate
// nodes living immediately above them
evts_0.push_back(evt_0);
for (size_t r = 0; r < rounds; r++) {
// multiple rounds of kernel dispatches, where intermediate nodes are being
// computed from already computed (in previous dispatch round) intermediate
// nodes
sycl::event evt_1 = q.submit([&](sycl::handler& h) {
// note dependency chain being built !
//
// each dispatch round depends on previously enqueued dispatch round
h.depends_on(evts_0.at(r));
const size_t work_item_cnt_ = work_item_cnt >> (r + 1);
const size_t wg_size_ =
wg_size <= work_item_cnt_ ? wg_size : work_item_cnt_;
const size_t i_offset_ = o_offset >> r;
const size_t o_offset_ = i_offset_ >> 1;
h.parallel_for<class kernelBinaryMerklizationUsingBLAKE3Phase1>(
sycl::nd_range<1>{ sycl::range<1>{ work_item_cnt_ },
sycl::range<1>{ wg_size_ } },
[=](sycl::nd_item<1> it) {
const size_t idx = it.get_global_linear_id();
// copy message words to array allocated on private memory
// because these message words are permuted, when `blake3::merge`
// function is invoked
sycl::uint msg_words[16];
#pragma unroll 16 // attempt to fully parallelize the loop !
for (size_t i = 0; i < 16; i++) {
msg_words[i] = *(intermediates + i_offset_ + (idx << 4) + i);
}
blake3::v1::merge(msg_words, intermediates + o_offset_ + (idx << 3));
});
});
evts_0.push_back(evt_1);
}
// wait for last kernel dispatch round, where root of binary merkle tree is
// computed !
evts_0.at(rounds).wait();
// time execution of all enqueued kernels with nanosecond level granularity
sycl::cl_ulong ts = 0;
for (size_t r = 0; r < rounds + 1; r++) {
ts += time_event(evts_0.at(r));
}
// return total kernel execution cost, in terms of nanosecond
return ts;
}