ttvecadd.cpp
#include "common/core_coord.h"
#include "tt_metal/host_api.hpp"
#include "common/bfloat16.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <string_view>
#include <vector>
using namespace tt::tt_metal;
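// Allocate an interleaved buffer on the device with the given total size and page size.
// @param device: The device to allocate the buffer on.
// @param size: Total size of the buffer in bytes.
// @param page_size: Size of each page in bytes.
// @param sram: If true, allocate the buffer on SRAM (L1), otherwise allocate it on DRAM.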
std::shared_ptr<Buffer> MakeBuffer(Device *device, uint32_t size, uint32_t page_size, bool sram)
{
    InterleavedBufferConfig config{
        .device = device,
        .size = size,
        .page_size = page_size,
        .buffer_type = (sram ? BufferType::L1 : BufferType::DRAM)
    };
    return CreateBuffer(config);
}
// Allocate a buffer on DRAM or SRAM. Assuming the buffer holds BFP16 data.
// A tile on Tenstorrent is 32x32 elements. Since we are using BFP16, each element takes 2 bytes,
// making the tile size 32x32x2 = 2048 bytes.
// @param device: The device to allocate the buffer on.
// @param n_tiles: The number of tiles to allocate.
// @param sram: If true, allocate the buffer on SRAM, otherwise allocate it on DRAM.
std::shared_ptr<Buffer> MakeBufferBFP16(Device *device, uint32_t n_tiles, bool sram)
{
    constexpr uint32_t tile_size = 2 * (32 * 32);
    // For simplicity, all DRAM buffers have page size = tile size.
    const uint32_t page_tiles = sram ? n_tiles : 1;
    return MakeBuffer(device, tile_size * n_tiles, page_tiles * tile_size, sram);
}
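// Create a circular buffer of the given total size and page size on a core.
// @param program: The program to create the circular buffer on.
// @param core: The core to create the circular buffer on.
// @param cb: Which circular buffer to create. This is just an ID.
// @param size: Total size of the circular buffer in bytes.
// @param page_size: Size of each page (one tile here) in bytes.
// @param format: The data format of the tiles stored in the circular buffer.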
CBHandle MakeCircularBuffer(Program& program, const CoreCoord& core, tt::CB cb, uint32_t size, uint32_t page_size, tt::DataFormat format)
{
    CircularBufferConfig cb_config = CircularBufferConfig(
        size,
        {{cb, format}}
    ).set_page_size(cb, page_size);
    return CreateCircularBuffer(program, core, cb_config);
}
// Circular buffers are Tenstorrent's way of communicating between the data movement and the compute kernels.
// Kernels push tiles into the circular buffer and pop them off when they are ready. The circular buffer is
// backed by SRAM. There can be multiple circular buffers on a single Tensix core.
// @param program: The program to create the circular buffer on.
// @param core: The core to create the circular buffer on.
// @param cb: Which circular buffer to create (c_in0, c_in1, c_out0, c_out1, etc.). This is just an ID.
// @param n_tiles: The number of tiles the circular buffer can hold.
CBHandle MakeCircularBufferBFP16(Program& program, const CoreCoord& core, tt::CB cb, uint32_t n_tiles)
{
    constexpr uint32_t tile_size = 2 * (32 * 32);
    return MakeCircularBuffer(program, core, cb, n_tiles * tile_size, tile_size, tt::DataFormat::Float16_b);
}
std::string next_arg(int& i, int argc, char **argv)
{
    if(i + 1 >= argc) {
        std::cerr << "Expected argument after " << argv[i] << std::endl;
        exit(1);
    }
    return argv[++i];
}
void help(std::string_view program_name)
{
std::cout << "Usage: " << program_name << " [options]\n";
std::cout << "Options:\n";
std::cout << " --device, -d <device_id> Specify the device to run the program on. Default is 0.\n";
std::cout << " --seed, -s <seed> Specify the seed for the random number generator. Default is random.\n";
exit(0);
}
int main(int argc, char **argv)
{
    int seed = std::random_device{}();
    int device_id = 0;
    // Quick and dirty argument parsing.
    for(int i = 1; i < argc; i++) {
        std::string_view arg = argv[i];
        if(arg == "--device" || arg == "-d") {
            device_id = std::stoi(next_arg(i, argc, argv));
        }
        else if(arg == "--seed" || arg == "-s") {
            seed = std::stoi(next_arg(i, argc, argv));
        }
        else if(arg == "--help" || arg == "-h") {
            help(argv[0]);
            return 0;
        }
        else {
            std::cout << "Unknown argument: " << arg << std::endl;
            help(argv[0]);
        }
    }
    Device *device = CreateDevice(device_id);
    Program program = CreateProgram();
    // This example program will only use 1 Tensix core. So we set the core to {0, 0}.
    constexpr CoreCoord core = {0, 0};
    CommandQueue& cq = device->command_queue();
    const uint32_t n_tiles = 64;
    const uint32_t tile_size = 32 * 32; // number of elements per tile
    // Create 3 buffers on DRAM. These will hold the input and output data. A and B are the input buffers, C is the output buffer.
    auto a = MakeBufferBFP16(device, n_tiles, false);
    auto b = MakeBufferBFP16(device, n_tiles, false);
    auto c = MakeBufferBFP16(device, n_tiles, false);
    std::mt19937 rng(seed);
    std::vector<uint32_t> a_data = create_random_vector_of_bfloat16(tile_size * n_tiles * 2, 10, rng());
    std::vector<uint32_t> b_data = create_random_vector_of_bfloat16(tile_size * n_tiles * 2, 10, rng());
    const uint32_t tiles_per_cb = 4;
    // Create 3 circular buffers. These will be used by the data movement kernels to stream data into the
    // compute cores and for the compute cores to stream data out.
    CBHandle cb_a = MakeCircularBufferBFP16(program, core, tt::CB::c_in0, tiles_per_cb);
    CBHandle cb_b = MakeCircularBufferBFP16(program, core, tt::CB::c_in1, tiles_per_cb);
    CBHandle cb_c = MakeCircularBufferBFP16(program, core, tt::CB::c_out0, tiles_per_cb);
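    // With tiles_per_cb = 4 each circular buffer holds a few tiles at a time, so the reader can run ahead of the
    // compute kernel (and compute ahead of the writer) instead of the three stages working in lock step.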
    EnqueueWriteBuffer(cq, a, a_data, false);
    EnqueueWriteBuffer(cq, b, b_data, false);
    // A Tensix core is made up of 5 processors: 2 data movement processors and 3 compute processors. The 2 data movement
    // processors act independently of each other, while the 3 compute processors act together (hence 1 kernel for compute).
    // There is no need to explicitly parallelize the compute kernels. Unlike traditional CPU/GPU style SPMD programming,
    // the 3 compute processors move data from SRAM into the FPU (tensor engine)/SFPU (SIMD engine), operate on the data, and
    // move it back to SRAM. The data movement processors move data from the NoC, or in our case the DRAM, into SRAM.
    //
    // The vector add example consists of 3 kernels. `interleaved_tile_read` reads tiles from the input buffers A and B
    // into 2 circular buffers. `add` reads tiles from the circular buffers, adds them together, and dumps the result into
    // a third circular buffer. `tile_write` reads tiles from the third circular buffer and writes them to the output buffer C.
    //
    // This also registers the kernels with the program. A program is a collection of kernels on different cores.
    auto reader = CreateKernel(
        program,
        "vecadd_kernels/interleaved_tile_read.cpp",
        core,
        DataMovementConfig {.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}
    );
    auto writer = CreateKernel(
        program,
        "vecadd_kernels/tile_write.cpp",
        core,
        DataMovementConfig {.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}
    );
    auto compute = CreateKernel(
        program,
        "vecadd_kernels/add.cpp",
        core,
        ComputeConfig{
            .math_approx_mode = false,
            .compile_args = {},
            .defines = {}
        }
    );
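    // For orientation, the core of a compute kernel like `add` typically follows the circular-buffer handshake
    // sketched below. This is only an illustration using the usual Metalium compute-kernel calls; the actual kernel
    // lives in vecadd_kernels/add.cpp and may differ in the exact calls it uses (destination-register
    // acquire/release is elided here):
    //
    //   add_tiles_init();
    //   for(uint32_t i = 0; i < n_tiles; i++) {
    //       cb_wait_front(tt::CB::c_in0, 1);                   // wait for one tile of A from the reader
    //       cb_wait_front(tt::CB::c_in1, 1);                   // wait for one tile of B from the reader
    //       add_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0);  // A + B into destination register 0
    //       cb_reserve_back(tt::CB::c_out0, 1);                // reserve space in the output circular buffer
    //       pack_tile(0, tt::CB::c_out0);                      // pack the result tile into the output CB
    //       cb_push_back(tt::CB::c_out0, 1);                   // hand the tile to the writer
    //       cb_pop_front(tt::CB::c_in0, 1);                    // release the consumed input tiles
    //       cb_pop_front(tt::CB::c_in1, 1);
    //   }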
    // Set the runtime arguments for the kernels.
    SetRuntimeArgs(program, reader, core, {
        a->address(),
        b->address(),
        n_tiles
    });
    SetRuntimeArgs(program, writer, core, {
        c->address(),
        n_tiles
    });
    SetRuntimeArgs(program, compute, core, {
        n_tiles
    });
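    // On the device side the kernels read these arguments positionally with get_arg_val. As a sketch (not the
    // actual kernel source), the reader kernel would start with something like:
    //
    //   uint32_t a_addr  = get_arg_val<uint32_t>(0);
    //   uint32_t b_addr  = get_arg_val<uint32_t>(1);
    //   uint32_t n_tiles = get_arg_val<uint32_t>(2);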
    // We have set up the program. Now we can enqueue it for execution.
    // The last argument to EnqueueProgram specifies whether we block until the
    // program finishes before returning. It is set to true here; alternatively,
    // you can pass false and call `Finish(cq)` later to wait for all enqueued
    // work. It shouldn't matter in this case, since we block on reading the
    // output buffer anyway.
    EnqueueProgram(cq, program, true);
    // Finish(cq);
    std::cout << "Kernel execution finished" << std::endl;
    // Read the output buffer.
    std::vector<uint32_t> c_data;
    EnqueueReadBuffer(cq, c, c_data, true);
    // Print partial results so we can see the output is correct (plus or minus some error due to BFP16 precision).
    std::cout << "Partial results: (note we are running under BFP16, so it is going to be less accurate)\n";
    size_t n = std::min((size_t)10, (size_t)tile_size * n_tiles);
    bfloat16* a_bf16 = reinterpret_cast<bfloat16*>(a_data.data());
    bfloat16* b_bf16 = reinterpret_cast<bfloat16*>(b_data.data());
    bfloat16* c_bf16 = reinterpret_cast<bfloat16*>(c_data.data());
    for(size_t i = 0; i < n; i++)
        std::cout << "  " << a_bf16[i].to_float() << " + " << b_bf16[i].to_float() << " = " << c_bf16[i].to_float() << "\n";
    std::cout << std::flush;
    // Finally, we close the device.
    CloseDevice(device);
    return 0;
}