From 5a59707571805308583440e3c49a0f5faa6a04c9 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sun, 11 Feb 2024 18:49:57 +0000 Subject: [PATCH 1/4] Initial commit --- .gitignore | 8 ++ FastLanes | 1 + README.md | 182 +++++++++++++++++++++++++++ build.zig | 43 +++++++ build.zig.zon | 15 +++ src/bench.zig | 50 ++++++++ src/bench_bitpacking.zig | 79 ++++++++++++ src/bench_delta.zig | 136 ++++++++++++++++++++ src/bench_ffor.zig | 39 ++++++ src/bitpacking.zig | 42 +++++++ src/bitpacking_u8_u3.zig | 58 +++++++++ src/delta.zig | 80 ++++++++++++ src/fastlanez.zig | 266 +++++++++++++++++++++++++++++++++++++++ src/ffor.zig | 42 +++++++ src/helper.zig | 9 ++ 15 files changed, 1050 insertions(+) create mode 100644 .gitignore create mode 160000 FastLanes create mode 100644 README.md create mode 100644 build.zig create mode 100644 build.zig.zon create mode 100644 src/bench.zig create mode 100644 src/bench_bitpacking.zig create mode 100644 src/bench_delta.zig create mode 100644 src/bench_ffor.zig create mode 100644 src/bitpacking.zig create mode 100644 src/bitpacking_u8_u3.zig create mode 100644 src/delta.zig create mode 100644 src/fastlanez.zig create mode 100644 src/ffor.zig create mode 100644 src/helper.zig diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b382d39 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +zig-cache/ +zig-out/ +/release/ +/debug/ +/build/ +/build-*/ +/docgen_tmp/ +.idea/ diff --git a/FastLanes b/FastLanes new file mode 160000 index 0000000..5feb61d --- /dev/null +++ b/FastLanes @@ -0,0 +1 @@ +Subproject commit 5feb61d3b5078e5499690b51b30bd9a94556ccf6 diff --git a/README.md b/README.md new file mode 100644 index 0000000..6a4fa9d --- /dev/null +++ b/README.md @@ -0,0 +1,182 @@ +# FastLanez + +A Zig implementation of the paper [Decoding >100 Billion Integers per Second with Scalar Code](https://www.vldb.org/pvldb/vol16/p2132-afroozeh.pdf). 
+ +Huge thanks to [**Azim Afroozeh**](https://www.cwi.nl/en/people/azim-afroozeh/) and [**Peter Boncz**](https://www.cwi.nl/en/people/peter-boncz/) for sharing this incredible work. + +Supported Codecs: +* **Bit-Packing** - packing T-bit integers into W-bit integers, where 0 <= W < T. +* **Delta** - taking the difference between adjacent values. +* **Fused Frame-of-Reference** - a fused kernel that subtracts a reference value before applying bit-packing. + +Requires Zig trunk >= 0.12.0-dev.2541 + +Benchmarks can be run with `zig build bench` + +## What is FastLanes? + +FastLanes describes a practical approach to developing SIMD-based lightweight compression codecs. With a clever +transposed layout of data, it enables efficient compression and decompression across CPUs with varying-width SIMD +registers, even for codecs with data dependencies such as Delta and RLE. + +FastLanes operates over vectors of 1024 elements, 1024 bits at a time. It is up to the caller to decide how to +handle padding. A typical FastLanes codec might look something like this: + +* Take an `FL.Vector` of 1024 elements of width `T`. +* Transpose the vector with `FL.transpose`. +* Apply a lightweight codec, such as `Delta`. +* Bit-pack into integers of width `W`. +* Return the `W * MM1024` words. + +Strictly speaking, the transpose is only required for codecs with data dependencies, i.e. where the output value +of the next element depends on the previous element. However, if all FastLanes compressed vectors are ordered the +same way, it's possible to perform comparisons between these vectors more efficiently without un-transposing the data. + +### Unified Transposed Layout + +The transposed layout works like this: +* Take a 1024-element vector. +* Split it into 8 blocks of 128 elements. +* Transpose the 128 elements from 8x16 to 16x8. +* Reorder the blocks according to `0, 4, 2, 6, 1, 5, 3, 7`. + +> Figure 6d in the paper is very helpful for visualizing this!
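The transposed layout is pure index arithmetic, so it can be sketched outside of Zig. Here is a hypothetical Python rendering of the mapping (mirroring the comptime `transpose_mask` construction in `src/fastlanez.zig`); it is for illustration only and not part of the library:

```python
# Unified transposed layout: output element j is taken from index
# mask[j] of the input 1024-element vector.
ORDER = [0, 4, 2, 6, 1, 5, 3, 7]

def transpose_mask():
    mask = []
    for row in range(8):         # row within each transposed block
        for o in ORDER:          # the 8 blocks, in reordered sequence
            for i in range(16):  # 16 cells per row
                mask.append(i * 64 + o * 8 + row)
    return mask

m = transpose_mask()
# Adjacent outputs sit 64 apart in the source vector, which is what
# makes them land in adjacent SIMD words for any register width.
assert (m[0], m[1], m[2], m[16]) == (0, 64, 128, 32)
assert sorted(m) == list(range(1024))  # a permutation of 0..1023
```

The asserted values match the `fastlanez transpose` test in `src/fastlanez.zig` below.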
+ +With this order it is possible to make adjacent cells appear in adjacent SIMD words, regardless of how wide the CPU's SIMD +register is. For example, a vector of u32s will be rearranged and then iterated such that adjacent words look like this: + +``` +u32 with 64-bit SIMD +{ 0, 64 } +{ 1, 65 } +{ 2, 66 } +... + +u32 with 128-bit SIMD +{ 0, 64, 128, 192 } +{ 1, 65, 129, 193 } +{ 2, 66, 130, 194 } +... + +u32 with 512-bit SIMD +{ 0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960 } +{ 1, 65, 129, 193, 257, 321, 385, 449, 513, 577, 641, 705, 769, 833, 897, 961 } +{ 2, 66, 130, 194, 258, 322, 386, 450, 514, 578, 642, 706, 770, 834, 898, 962 } +... +``` + +Note that for a codec like Delta, instead of taking the delta from a single starting element, we must start with 1024 bits' +worth of base values, i.e. 32 u32s. + +## Design of FastLanez + +FastLanez leverages Zig's SIMD abstraction to create a virtual 1024-bit word: `@Vector(1024 / @bitSizeOf(T), T)`. + +This allows us to implement codecs that are astonishingly close to the pseudo-code presented by the paper.
Here is +Listing 2 (unpacking 3-bit integers into 8-bit integers) ported to FastLanez with line-breaks adjusted to match the original: + +```zig +comptime var mask: [W + 1]E = undefined; +inline for (0..W + 1) |i| { + mask[i] = (1 << i) - 1; +} + +var r0: FL.MM1024 = undefined; +var r1: FL.MM1024 = undefined; + +r0 = FL.load(in, 0); +r1 = FL.and_rshift(r0, 0, mask[3]); FL.store(out, 0, r1); +r1 = FL.and_rshift(r0, 3, mask[3]); FL.store(out, 1, r1); +r1 = FL.and_rshift(r0, 6, mask[2]); +r0 = FL.load(in, 1); FL.store(out, 2, FL.or_(r1, FL.and_lshift(r0, 2, mask[1]))); +r1 = FL.and_rshift(r0, 1, mask[3]); FL.store(out, 3, r1); +r1 = FL.and_rshift(r0, 4, mask[3]); FL.store(out, 4, r1); +r1 = FL.and_rshift(r0, 7, mask[1]); +r0 = FL.load(in, 2); FL.store(out, 5, FL.or_(r1, FL.and_lshift(r0, 1, mask[2]))); +r1 = FL.and_rshift(r0, 2, mask[3]); FL.store(out, 6, r1); +r1 = FL.and_rshift(r0, 5, mask[3]); FL.store(out, 7, r1); +``` + +Zig's comptime feature allows us to wrap up this logic and generate **all** kernels at compile-time +without any runtime performance overhead: + +```zig +const FL = FastLanez(u8); + +pub fn unpack(comptime W: comptime_int, in: *const FL.PackedBytes(W), out: *FL.Vector) void { + comptime var unpacker = FL.bitunpacker(W); + var tmp: FL.MM1024 = undefined; + inline for (0..FL.T) |i| { + const next, tmp = unpacker.unpack(in, tmp); + FL.store(out, i, next); + } +} +``` + +### Loop Ordering + +There is a key difference between the implementation of this library and FastLanes: loop ordering. + +* FastLanes: SIMD word, tile, row. +* FastLanez: tile, row, SIMD word, where the SIMD word loop is internal to the Zig `@Vector`. + +We can see the difference more clearly with some pseudo-code: + +```zig +const a, const b = FL.load(input, 0), FL.load(input, 1); +FL.store(output, 0, FL.add(a, b)); +``` + +Unoptimized FastLanes assembly would look something like this: +```asm +LDR +ADD +STR +LDR +ADD +STR +...
+``` + +Whereas unoptimized FastLanez assembly would look like this: +```asm +LDR +LDR +... +ADD +ADD +... +STR +STR +... +``` + +Given the limited number of SIMD registers in a CPU, one would expect the FastLanes ordering to perform better. +In fact, our benchmarking suggests that the FastLanez ordering has a slight edge. However, I suspect this won't hold true +for more complex compression kernels, and it may become an issue in the future, requiring us to invert the loop ordering +of this library. + +Another possible advantage of the FastLanes loop ordering is that we can avoid unrolling the outer SIMD word loop, +resulting in potentially much smaller code size for minimal impact on performance. + + +## C Library + +TODO: this library will be made available as a C library. + +## Python Library + +TODO: this library will be made available as a Python library using [Ziggy Pydust](https://github.com/fulcrum-so/ziggy-pydust). + +## Benchmarks + +Benchmarks can be run with `zig build bench -Doptimize=ReleaseSafe` + +As with all benchmarks, take the results with a pinch of salt. + +> I found the performance of benchmarks varies greatly depending on whether the inputs and outputs are stack allocated or + heap allocated. I was surprised to find that often heap allocation was significantly faster than stack allocation. + If anyone happens to know why, please do let me know!
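For reference, the arithmetic that the bit-packing kernels perform can be sketched scalar-wise. The following is a hypothetical per-byte Python analogue of the same LSB-first W-bit layout; the real kernels apply these shifts and masks to whole 1024-bit words at a time, so the actual output interleaves these byte patterns across lanes:

```python
def pack(values, w, t=8):
    """Pack w-bit values into t-bit words, LSB-first."""
    out, acc, bits = [], 0, 0
    for v in values:
        acc |= (v & ((1 << w) - 1)) << bits
        bits += w
        while bits >= t:
            out.append(acc & ((1 << t) - 1))
            acc >>= t
            bits -= t
    if bits:
        out.append(acc)
    return out

def unpack(words, w, n, t=8):
    """Unpack n w-bit values from t-bit words, LSB-first."""
    acc, bits, out, it = 0, 0, [], iter(words)
    for _ in range(n):
        while bits < w:
            acc |= next(it) << bits
            bits += t
        out.append(acc & ((1 << w) - 1))
        acc >>= w
        bits -= w
    return out

# Eight 3-bit 2s produce the same three byte patterns that the
# `bitpack` test in src/bitpacking.zig expects (there, each pattern
# repeats 128 times because the layout is lane-wise).
packed = pack([2] * 8, w=3)
assert packed == [0b10010010, 0b00100100, 0b01001001]
assert unpack(packed, w=3, n=8) == [2] * 8
```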
+ +The following plot shows the performance vs the original FastLanes repository for all bit unpacking kernels on an M2 Mac: + diff --git a/build.zig b/build.zig new file mode 100644 index 0000000..98b185c --- /dev/null +++ b/build.zig @@ -0,0 +1,43 @@ +const std = @import("std"); + +pub fn build(b: *std.Build) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + + const cycleclock_dep = b.dependency("zig-cycleclock", .{ + .target = target, + .optimize = optimize, + }); + + const module = b.addModule("fastlanez", .{ + .root_source_file = .{ .path = "src/fastlanez.zig" }, + .imports = &.{ + .{ .name = "cycleclock", .module = cycleclock_dep.module("cycleclock") }, + }, + }); + + // Unit Tests + const unit_tests = b.addTest(.{ + .root_source_file = .{ .path = "src/fastlanez.zig" }, + .target = target, + .optimize = optimize, + }); + unit_tests.root_module.import_table = module.import_table; + const run_unit_tests = b.addRunArtifact(unit_tests); + run_unit_tests.step.dependOn(b.getInstallStep()); + const test_step = b.step("test", "Run unit tests"); + test_step.dependOn(&run_unit_tests.step); + + // Benchmarking + const bench = b.addTest(.{ + .root_source_file = .{ .path = "src/bench.zig" }, + .target = target, + .optimize = optimize, + .filter = "bench", + }); + bench.root_module.import_table = module.import_table; + const run_bench = b.addRunArtifact(bench); + run_bench.step.dependOn(b.getInstallStep()); + const bench_step = b.step("bench", "Run benchmarks"); + bench_step.dependOn(&run_bench.step); +} diff --git a/build.zig.zon b/build.zig.zon new file mode 100644 index 0000000..b6b7dc1 --- /dev/null +++ b/build.zig.zon @@ -0,0 +1,15 @@ +.{ + .name = "fastlanez", + .version = "0.0.0", + .minimum_zig_version = "0.11.0", + + // See `zig fetch --save ` for a command-line interface for adding dependencies. 
+ .dependencies = .{ + .@"zig-cycleclock" = .{ + .url = "https://github.com/fulcrum-so/zig-cycleclock/archive/refs/tags/0.0.3.tar.gz", + .hash = "1220a3ceb465eef08008c61b2b198336328e033a9e9b9d24cc169e4743abd1ac79bb", + }, + }, + + .paths = .{""}, +} diff --git a/src/bench.zig b/src/bench.zig new file mode 100644 index 0000000..e087e1e --- /dev/null +++ b/src/bench.zig @@ -0,0 +1,50 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const cycleclock = @import("cycleclock"); +const fl = @import("fastlanez.zig"); + +const dbg = builtin.mode == .Debug; + +pub const Options = struct { + warmup: comptime_int = if (dbg) 0 else 100_000, + iterations: comptime_int = if (dbg) 1 else 3_000_000, +}; + +pub fn Bench(comptime name: []const u8, comptime variant: []const u8, comptime options: Options) type { + return struct { + pub fn bench(comptime Unit: type) !void { + std.debug.print("{s},{s},{}", .{ name, variant, options.iterations }); + + const unit = if (@hasDecl(Unit, "setup")) try Unit.setup() else Unit{}; + defer if (@hasDecl(Unit, "deinit")) { + unit.deinit(); + }; + + for (0..options.warmup) |_| { + unit.run(); + } + + const start = cycleclock.now(); + for (0..options.iterations) |_| { + unit.run(); + } + const stop = cycleclock.now(); + const cycles = stop - start; + + if (cycles == 0) { + std.debug.print(",0 # failed to measure cycles\n", .{}); + return; + } else { + const cycles_per_tuple = @as(f64, @floatFromInt(cycles)) / @as(f64, @floatFromInt(1024 * options.iterations)); + std.debug.print(",{d:.4}", .{cycles_per_tuple}); + std.debug.print(",{}\n", .{1024 * options.iterations / cycles}); + } + } + }; +} + +comptime { + std.testing.refAllDecls(@import("bench_bitpacking.zig")); + std.testing.refAllDecls(@import("bench_delta.zig")); + std.testing.refAllDecls(@import("bench_ffor.zig")); +} diff --git a/src/bench_bitpacking.zig b/src/bench_bitpacking.zig new file mode 100644 index 0000000..542c74f --- /dev/null +++ b/src/bench_bitpacking.zig @@ -0,0 
+1,79 @@ +const std = @import("std"); +const fl = @import("./fastlanez.zig"); +const Bench = @import("bench.zig").Bench; +const BitPacking = @import("./bitpacking.zig").BitPacking; +const gpa = std.testing.allocator; + +test "bench bitpacking pack" { + inline for (.{ u8, u16, u32, u64 }) |E| { + const FL = fl.FastLanez(E); + inline for (1..@bitSizeOf(E)) |W| { + const dbg = @import("builtin").mode == .Debug; + if (comptime dbg and !std.math.isPowerOfTwo(W)) { + // Avoid too much code-gen in debug mode. + continue; + } + + try Bench("pack", @typeName(FL.E) ++ "_" ++ @typeName(std.meta.Int(.unsigned, W)), .{}).bench(struct { + ints: *const FL.Vector, + packed_bytes: *FL.PackedBytes(W), + + pub fn setup() !@This() { + const ints = try gpa.create(FL.Vector); + for (0..1024) |i| { + ints[i] = @intCast(i % W); + } + const packed_bytes = try gpa.create(FL.PackedBytes(W)); + return .{ .ints = ints, .packed_bytes = packed_bytes }; + } + + pub fn deinit(self: *const @This()) void { + gpa.destroy(self.ints); + gpa.destroy(self.packed_bytes); + } + + pub fn run(self: *const @This()) void { + BitPacking(FL).encode(W, self.ints, self.packed_bytes); + std.mem.doNotOptimizeAway(self.packed_bytes); + } + }); + } + } +} + +test "bench bitpacking unpack" { + inline for (.{ u8, u16, u32, u64 }) |E| { + const FL = fl.FastLanez(E); + inline for (1..@bitSizeOf(E)) |W| { + const dbg = @import("builtin").mode == .Debug; + if (comptime dbg and !std.math.isPowerOfTwo(W)) { + // Avoid too much code-gen in debug mode. + continue; + } + + try Bench("unpack", @typeName(FL.E) ++ "_" ++ @typeName(std.meta.Int(.unsigned, W)), .{}).bench(struct { + ints: *FL.Vector, + packed_bytes: *const FL.PackedBytes(W), + + pub fn setup() !@This() { + const ints = try gpa.create(FL.Vector); + const packed_bytes = try gpa.create(FL.PackedBytes(W)); + for (0..@sizeOf(FL.PackedBytes(W))) |i| { + packed_bytes[i] = 5; // Set every byte to 5... 
+ } + return .{ .ints = ints, .packed_bytes = packed_bytes }; + } + + pub fn deinit(self: *const @This()) void { + gpa.destroy(self.ints); + gpa.destroy(self.packed_bytes); + } + + pub fn run(self: *const @This()) void { + BitPacking(FL).decode(W, self.packed_bytes, self.ints); + std.mem.doNotOptimizeAway(self.ints); + } + }); + } + } +} diff --git a/src/bench_delta.zig b/src/bench_delta.zig new file mode 100644 index 0000000..c6cd1eb --- /dev/null +++ b/src/bench_delta.zig @@ -0,0 +1,136 @@ +const std = @import("std"); +const fl = @import("fastlanez.zig"); +const Bench = @import("bench.zig").Bench; +const Delta = @import("./delta.zig").Delta; +const arange = @import("helper.zig").arange; +const gpa = std.testing.allocator; + +test "bench delta encode" { + inline for (.{ u8, u16, u32, u64 }) |T| { + const FL = fl.FastLanez(T); + + try Bench("delta_encode", @typeName(T), .{}).bench(struct { + base: FL.BaseVector, + input: *const FL.Vector, + output: *FL.Vector, + + pub fn setup() !@This() { + const input = try gpa.create(FL.Vector); + input.* = FL.transpose(arange(T, 1024)); + const output = try gpa.create(FL.Vector); + return .{ + .base = [_]T{0} ** (1024 / @bitSizeOf(T)), + .input = input, + .output = output, + }; + } + + pub fn deinit(self: *const @This()) void { + gpa.free(self.input); + gpa.free(self.output); + } + + pub fn run(self: *const @This()) void { + @call(.never_inline, Delta(FL).encode, .{ &self.base, self.input, self.output }); + std.mem.doNotOptimizeAway(self.output); + } + }); + } +} + +test "bench delta decode" { + inline for (.{ u8, u16, u32, u64 }) |T| { + const FL = fl.FastLanez(T); + + try Bench("delta_decode", @typeName(T), .{}).bench(struct { + base: FL.BaseVector, + input: *const FL.Vector, + output: *FL.Vector, + + pub fn setup() !@This() { + const input = try gpa.create(FL.Vector); + input.* = .{1} ** 1024; + const output = try gpa.create(FL.Vector); + return .{ + .base = [_]T{0} ** (1024 / @bitSizeOf(T)), + .input = input, + .output = 
output, + }; + } + + pub fn deinit(self: *const @This()) void { + gpa.destroy(self.input); + gpa.destroy(self.output); + } + + pub fn run(self: @This()) void { + Delta(FL).decode(&self.base, self.input, self.output); + std.mem.doNotOptimizeAway(self.output); + } + }); + } +} + +test "bench delta pack" { + inline for (.{ u8, u16, u32, u64 }) |T| { + const FL = fl.FastLanez(T); + const W = 3; + + try Bench("delta_pack", @typeName(T) ++ "_" ++ @typeName(std.meta.Int(.unsigned, W)), .{}).bench(struct { + base: FL.BaseVector = [_]T{0} ** (1024 / @bitSizeOf(T)), + input: *const FL.Vector, + output: *FL.PackedBytes(W), + + pub fn setup() !@This() { + const input = try gpa.create(FL.Vector); + input.* = .{1} ** 1024; + return .{ .input = input, .output = try gpa.create(FL.PackedBytes(W)) }; + } + + pub fn deinit(self: *const @This()) void { + gpa.destroy(self.input); + gpa.destroy(self.output); + } + + pub fn run(self: @This()) void { + Delta(FL).pack(3, &self.base, self.input, self.output); + std.mem.doNotOptimizeAway(self.output); + } + }); + } +} + +test "bench delta unpack" { + inline for (.{ u8, u16, u32, u64 }) |T| { + const FL = fl.FastLanez(T); + const W = 3; + + try Bench("delta_unpack", @typeName(T) ++ "_" ++ @typeName(std.meta.Int(.unsigned, W)), .{}).bench(struct { + base: FL.BaseVector, + delta: *const FL.PackedBytes(W), + output: *FL.Vector, + + pub fn setup() !@This() { + const base = [_]T{0} ** (1024 / @bitSizeOf(T)); + const input: FL.Vector = .{1} ** 1024; + const delta: *FL.PackedBytes(W) = try gpa.create(FL.PackedBytes(W)); + Delta(FL).pack(3, &base, &input, delta); + return .{ + .base = base, + .delta = delta, + .output = try gpa.create(FL.Vector), + }; + } + + pub fn deinit(self: *const @This()) void { + gpa.destroy(self.delta); + gpa.destroy(self.output); + } + + pub fn run(self: *const @This()) void { + Delta(FL).unpack(3, &self.base, self.delta, self.output); + std.mem.doNotOptimizeAway(self.output); + } + }); + } +} diff --git a/src/bench_ffor.zig 
b/src/bench_ffor.zig new file mode 100644 index 0000000..37eff30 --- /dev/null +++ b/src/bench_ffor.zig @@ -0,0 +1,39 @@ +const std = @import("std"); +const fl = @import("./fastlanez.zig"); +const Bench = @import("bench.zig").Bench; +const FFoR = @import("./ffor.zig").FFoR; +const gpa = std.testing.allocator; + +test "bench ffor pack" { + inline for (.{ u8, u16, u32, u64 }) |E| { + const FL = fl.FastLanez(E); + inline for (1..@bitSizeOf(E)) |W| { + const dbg = @import("builtin").mode == .Debug; + if (comptime dbg and !std.math.isPowerOfTwo(W)) { + // Avoid too much code-gen in debug mode. + continue; + } + + try Bench("ffor_pack", @typeName(FL.E) ++ "_" ++ @typeName(std.meta.Int(.unsigned, W)), .{}).bench(struct { + input: *const FL.Vector, + output: *FL.PackedBytes(W), + + pub fn setup() !@This() { + const input = try gpa.create(FL.Vector); + input.* = .{5} ** 1024; + return .{ .input = input, .output = try gpa.create(FL.PackedBytes(W)) }; + } + + pub fn deinit(self: *const @This()) void { + gpa.destroy(self.input); + gpa.destroy(self.output); + } + + pub fn run(self: *const @This()) void { + FFoR(FL).encode(W, 2, self.input, self.output); + std.mem.doNotOptimizeAway(self.output); + } + }); + } + } +} diff --git a/src/bitpacking.zig b/src/bitpacking.zig new file mode 100644 index 0000000..8d0e4dc --- /dev/null +++ b/src/bitpacking.zig @@ -0,0 +1,42 @@ +pub fn BitPacking(comptime FL: type) type { + return struct { + pub fn encode(comptime W: comptime_int, in: *const FL.Vector, out: *FL.PackedBytes(W)) void { + comptime var packer = FL.bitpacker(W); + var tmp: FL.MM1024 = undefined; + // FIXME(ngates): this pack function doesn't work if the loop isn't inlined at comptime. + // We should either accept i as an argument to make it harder to misuse? Or avoid stateful packers? 
+ inline for (0..FL.T) |i| { + tmp = packer.pack(out, FL.load(in, i), tmp); + } + } + + pub fn decode(comptime W: comptime_int, in: *const FL.PackedBytes(W), out: *FL.Vector) void { + comptime var unpacker = FL.bitunpacker(W); + var tmp: FL.MM1024 = undefined; + inline for (0..FL.T) |i| { + const next, tmp = unpacker.unpack(in, tmp); + FL.store(out, i, next); + } + } + }; +} + +test "bitpack" { + const std = @import("std"); + const fl = @import("./fastlanez.zig"); + const BP = BitPacking(fl.FastLanez(u8)); + + const ints: [1024]u8 = .{2} ** 1024; + var packed_ints: [384]u8 = undefined; + BP.encode(3, &ints, &packed_ints); + + // Decimal 2 repeated as 3-bit integers in blocks of 1024 bits. + try std.testing.expectEqual( + .{0b10010010} ** 128 ++ .{0b00100100} ** 128 ++ .{0b01001001} ** 128, + packed_ints, + ); + + var output: [1024]u8 = undefined; + BP.decode(3, &packed_ints, &output); + try std.testing.expectEqual(.{2} ** 1024, output); +} diff --git a/src/bitpacking_u8_u3.zig b/src/bitpacking_u8_u3.zig new file mode 100644 index 0000000..d9473e4 --- /dev/null +++ b/src/bitpacking_u8_u3.zig @@ -0,0 +1,58 @@ +//! An implementation of the "unpack 3 -> 8" algorithm ported literally from the FastLanes paper. +const fl = @import("fastlanez.zig"); + +const E = u8; +const W = 3; +const FL = fl.FastLanez(E); + +/// Decode 3-bit ints into 8-bit ints. 
+pub fn decode(in: *const FL.PackedBytes(3), out: *FL.Vector) void { + comptime var mask: [W + 1]E = undefined; + inline for (0..W + 1) |i| { + mask[i] = (1 << i) - 1; + } + + var r0: FL.MM1024 = undefined; + var r1: FL.MM1024 = undefined; + + r0 = FL.load(in, 0); + r1 = FL.and_rshift(r0, 0, mask[3]); + FL.store(out, 0, r1); + + r1 = FL.and_rshift(r0, 3, mask[3]); + FL.store(out, 1, r1); + + r1 = FL.and_rshift(r0, 6, mask[2]); + r0 = FL.load(in, 1); + FL.store(out, 2, FL.or_(r1, FL.and_lshift(r0, 2, mask[1]))); + + r1 = FL.and_rshift(r0, 1, mask[3]); + FL.store(out, 3, r1); + + r1 = FL.and_rshift(r0, 4, mask[3]); + FL.store(out, 4, r1); + + r1 = FL.and_rshift(r0, 7, mask[1]); + r0 = FL.load(in, 2); + FL.store(out, 5, FL.or_(r1, FL.and_lshift(r0, 1, mask[2]))); + + r1 = FL.and_rshift(r0, 2, mask[3]); + FL.store(out, 6, r1); + + r1 = FL.and_rshift(r0, 5, mask[3]); + FL.store(out, 7, r1); +} + +test "fastlanez unpack 3 -> 8" { + const std = @import("std"); + + // Set up an input of all "1" bits. + const input: [384]u8 = .{255} ** 384; + + var output: [1024]u8 = undefined; + decode(&input, &output); + + // Ensure all outputs are "00000111" => 7.
+ const expected: [1024]u8 = .{7} ** 1024; + try std.testing.expectEqual(expected, output); +} diff --git a/src/delta.zig b/src/delta.zig new file mode 100644 index 0000000..c48d45c --- /dev/null +++ b/src/delta.zig @@ -0,0 +1,80 @@ +pub fn Delta(comptime FastLanes: type) type { + const FL = FastLanes; + + return struct { + const std = @import("std"); + + pub fn encode(base: *const FL.BaseVector, in: *const FL.Vector, out: *FL.Vector) void { + var prev: FL.MM1024 = FL.load(base, 0); + inline for (0..FL.T) |i| { + const next = FL.load_transposed(in, i); + const delta = FL.subtract(next, prev); + prev = next; + FL.store_transposed(out, i, delta); + } + } + + pub fn decode(base: *const FL.BaseVector, in: *const FL.Vector, out: *FL.Vector) void { + var prev = FL.load(base, 0); + inline for (0..FL.T) |i| { + const delta = FL.load_transposed(in, i); + const result = FL.add(prev, delta); + prev = result; + FL.store_transposed(out, i, result); + } + } + + pub fn pack(comptime W: comptime_int, base: *const FL.BaseVector, in: *const FL.Vector, out: *FL.PackedBytes(W)) void { + comptime var packer = FL.bitpacker(W); + var tmp: FL.MM1024 = undefined; + + var prev: FL.MM1024 = FL.load(base, 0); + inline for (0..FL.T) |i| { + const next = FL.load_transposed(in, i); + const result = FL.subtract(next, prev); + prev = next; + + tmp = packer.pack(out, result, tmp); + } + } + + pub fn unpack(comptime W: comptime_int, base: *const FL.BaseVector, in: *const FL.PackedBytes(W), out: *FL.Vector) void { + comptime var packer = FL.bitunpacker(W); + var tmp: FL.MM1024 = undefined; + + var prev: FL.MM1024 = FL.load(base, 0); + inline for (0..FL.T) |i| { + const next, tmp = packer.unpack(in, tmp); + const result = FL.add(prev, next); + FL.store_transposed(out, i, result); + prev = result; + } + } + }; +} + +test "fastlanez delta" { + const std = @import("std"); + const fl = @import("fastlanez.zig"); + const arange = @import("helper.zig").arange; + + const T = u16; + const FL = fl.FastLanez(T); 
+ + const base: FL.BaseVector = .{0} ** FL.S; + const input: FL.Vector = FL.transpose(arange(T, 1024)); + + var actual: [1024]T = undefined; + Delta(FL).encode(&base, &input, &actual); + + actual = FL.untranspose(actual); + + for (0..1024) |i| { + // Since fastlanes processes based on 16 blocks, we expect a zero delta every 1024 / 16 = 64 elements. + if (i % @bitSizeOf(T) == 0) { + try std.testing.expectEqual(i, actual[i]); + } else { + try std.testing.expectEqual(1, actual[i]); + } + } +} diff --git a/src/fastlanez.zig b/src/fastlanez.zig new file mode 100644 index 0000000..1301b1b --- /dev/null +++ b/src/fastlanez.zig @@ -0,0 +1,266 @@ +// This unified transpose layout allows us to operate efficiently using a variety of SIMD lane widths. +const ORDER: [8]u8 = .{ 0, 4, 2, 6, 1, 5, 3, 7 }; + +// Comptime compute the transpose and untranspose masks. +const transpose_mask: [1024]comptime_int = blk: { + @setEvalBranchQuota(4096); + var mask: [1024]comptime_int = undefined; + var mask_idx = 0; + for (0..8) |row| { + for (ORDER) |o| { + for (0..16) |i| { + mask[mask_idx] = (i * 64) + (o * 8) + row; + mask_idx += 1; + } + } + } + break :blk mask; +}; + +const untranspose_mask: [1024]comptime_int = blk: { + @setEvalBranchQuota(4096); + var mask: [1024]comptime_int = undefined; + for (0..1024) |i| { + mask[transpose_mask[i]] = i; + } + break :blk mask; +}; + +pub fn FastLanez(comptime Element: type) type { + return struct { + /// The type of the element. + pub const E = Element; + /// The bit size of the element type. + pub const T = @bitSizeOf(E); + /// The number of elements in a single MM1024 word. + pub const S = 1024 / T; + /// A vector of 1024 elements. + pub const Vector = [1024]E; + /// A vector of 1024 bits. + pub const BaseVector = [S]E; + /// Represents the fastlanes virtual 1024-bit SIMD word. + pub const MM1024 = @Vector(1024 / T, E); + + /// Offset required to iterate over 1024 bit vectors according to the unified transpose order. 
+ const offsets: [T]u8 = blk: { + var _offsets: [T]u8 = undefined; + var offset = 0; + for (0..T / 8) |order| { + for (0..8) |row| { + _offsets[offset] = order + ((T / 8) * row); + offset += 1; + } + } + break :blk _offsets; + }; + + /// Load the nth MM1024 from the input buffer. Respecting the unified transpose order. + pub inline fn load_transposed(ptr: anytype, n: usize) MM1024 { + return load(ptr, offsets[n]); + } + + /// Load the nth MM1024 from the input buffer. + pub inline fn load(ptr: anytype, n: usize) MM1024 { + const Array = @typeInfo(@TypeOf(ptr)).Pointer.child; + const bytes: *const [@sizeOf(Array)]u8 = @ptrCast(ptr); + return @bitCast(bytes[n * 128 ..][0..128].*); + } + + /// Store the nth MM1024 into the output buffer. Respecting the unified transpose order. + pub inline fn store_transposed(ptr: anytype, n: usize, vec: MM1024) void { + store(ptr, offsets[n], vec); + } + + /// Store the nth MM1024 into the output buffer. + pub inline fn store(ptr: anytype, n: usize, word: MM1024) void { + const Array = @typeInfo(@TypeOf(ptr)).Pointer.child; + const words: *[@sizeOf(Array) / 128][128]u8 = @ptrCast(ptr); + words[n] = @bitCast(word); + } + + /// Shuffle the input vector into the unified transpose order. + pub fn transpose(vec: Vector) Vector { + const V = @Vector(1024, E); + return @shuffle(E, @as(V, vec), @as(V, vec), transpose_mask); + } + + /// Unshuffle the input vector from the unified transpose order. + pub fn untranspose(vec: Vector) Vector { + const V = @Vector(1024, E); + return @shuffle(E, @as(V, vec), @as(V, vec), untranspose_mask); + } + + /// A type representing an array of packed bytes. + pub fn PackedBytes(comptime Width: comptime_int) type { + return [128 * Width]u8; + } + + /// Returns a comptime struct for packing bits into Width-bit integers. 
+ pub fn bitpacker(comptime Width: comptime_int) BitPacker(Width) { + return BitPacker(Width){}; + } + + fn BitPacker(comptime Width: comptime_int) type { + return struct { + const Self = @This(); + + /// The number of times store has been called. The position in the input vector, so to speak. + t: comptime_int = 0, + /// The position in the output that we're writing to. Will finish equal to Width. + out_idx: comptime_int = 0, + + shift_bits: comptime_int = 0, + mask_bits: comptime_int = Width, + + /// Invoke to store the next vector. + pub inline fn pack(comptime self: *Self, out: *PackedBytes(Width), word: MM1024, state: MM1024) MM1024 { + var tmp: MM1024 = undefined; + if (self.t == 0) { + tmp = @splat(0); + } else { + tmp = state; + } + + if (self.t > T) { + @compileError("BitPacker.pack called too many times"); + } + self.t += 1; + + // If we didn't take all W bits last time, then we load the remainder + if (self.mask_bits < Width) { + tmp = or_(tmp, and_rshift(word, self.mask_bits, bitmask(self.shift_bits))); + } + + // Update the number of mask bits + self.mask_bits = @min(T - self.shift_bits, Width); + + // Pull the masked bits into the tmp register + tmp = or_(tmp, and_lshift(word, self.shift_bits, bitmask(self.mask_bits))); + self.shift_bits += Width; + + if (self.shift_bits >= T) { + // If we have a full 1024 bits, then store it and reset the tmp register + store(out, self.out_idx, tmp); + tmp = @splat(0); + self.out_idx += 1; + self.shift_bits -= T; + } + + return tmp; + } + }; + } + + /// Returns a comptime struct for unpacking bits from Width-bit integers. 
+ pub fn bitunpacker(comptime Width: comptime_int) BitUnpacker(Width) { + return BitUnpacker(Width){}; + } + + fn BitUnpacker(comptime Width: comptime_int) type { + return struct { + const Self = @This(); + + t: comptime_int = 0, + + input_idx: comptime_int = 0, + shift_bits: comptime_int = 0, + + pub inline fn unpack(comptime self: *Self, input: *const PackedBytes(Width), state: MM1024) struct { MM1024, MM1024 } { + if (self.t > T) { + @compileError("BitUnpacker.unpack called too many times"); + } + self.t += 1; + + var tmp: MM1024 = undefined; + if (self.input_idx == 0) { + tmp = load(input, 0); + self.input_idx += 1; + } else { + tmp = state; + } + + const mask_bits = @min(T - self.shift_bits, Width); + + var next: MM1024 = undefined; + if (self.shift_bits == T) { + next = tmp; + } else { + next = and_rshift(tmp, self.shift_bits, bitmask(mask_bits)); + } + + if (mask_bits != Width) { + tmp = load(input, self.input_idx); + self.input_idx += 1; + + next = or_(next, and_lshift(tmp, mask_bits, bitmask(Width - mask_bits))); + + self.shift_bits = Width - mask_bits; + } else { + self.shift_bits += Width; + } + + return .{ next, tmp }; + } + }; + } + + // Create a mask of the first `bits` bits. 
+ inline fn bitmask(comptime bits: comptime_int) E { + return (1 << bits) - 1; + } + + pub inline fn add(a: MM1024, b: MM1024) MM1024 { + return a +% b; + } + + pub inline fn subtract(a: MM1024, b: MM1024) MM1024 { + return a -% b; + } + + pub inline fn and_(a: MM1024, b: MM1024) MM1024 { + return a & b; + } + + pub inline fn or_(a: MM1024, b: MM1024) MM1024 { + return a | b; + } + + pub inline fn and_lshift(lane: MM1024, comptime n: comptime_int, comptime mask: E) MM1024 { + const maskvec: MM1024 = @splat(mask); + const nvec: MM1024 = @splat(n); + return (lane & maskvec) << nvec; + } + + pub inline fn and_rshift(lane: MM1024, comptime n: comptime_int, comptime mask: E) MM1024 { + const maskvec: MM1024 = @splat(mask << n); + const nvec: MM1024 = @splat(n); + return (lane & maskvec) >> nvec; + } + }; +} + +test "fastlanez transpose" { + const std = @import("std"); + const arange = @import("helper.zig").arange; + const T = u32; + const FL = FastLanez(T); + + const input: FL.Vector = arange(T, 1024); + const transposed = FL.transpose(input); + + try std.testing.expectEqual(transposed[0], 0); + try std.testing.expectEqual(transposed[1], 64); + try std.testing.expectEqual(transposed[2], 128); + try std.testing.expectEqual(transposed[16], 32); + try std.testing.expectEqual(transposed[1017], 639); + try std.testing.expectEqual(transposed[1023], 1023); +} + +comptime { + const std = @import("std"); + + std.testing.refAllDecls(@import("bitpacking_u8_u3.zig")); + std.testing.refAllDecls(@import("bitpacking.zig")); + std.testing.refAllDecls(@import("delta.zig")); + std.testing.refAllDecls(@import("ffor.zig")); +} diff --git a/src/ffor.zig b/src/ffor.zig new file mode 100644 index 0000000..3d20cfd --- /dev/null +++ b/src/ffor.zig @@ -0,0 +1,42 @@ +/// Fused Frame of Reference (FFoR) codec. 
+pub fn FFoR(comptime FL: type) type { + return struct { + pub fn encode(comptime W: comptime_int, reference: FL.E, in: *const FL.Vector, out: *FL.PackedBytes(W)) void { + comptime var packer = FL.bitpacker(W); + var tmp: FL.MM1024 = undefined; + + inline for (0..FL.T) |t| { + const next = FL.subtract(FL.load(in, t), @splat(reference)); + tmp = packer.pack(out, next, tmp); + } + } + + pub fn decode(comptime W: comptime_int, reference: FL.E, in: *const FL.PackedBytes(W), out: *FL.Vector) void { + comptime var packer = FL.bitunpacker(W); + var tmp: FL.MM1024 = undefined; + + inline for (0..FL.T) |t| { + const next, tmp = packer.unpack(in, tmp); + FL.store(out, t, FL.add(next, @splat(reference))); + } + } + }; +} + +test "fastlanez ffor" { + const std = @import("std"); + const fl = @import("fastlanez.zig"); + + const E = u16; + const W = 3; + const FL = fl.FastLanez(E); + + const input: FL.Vector = .{5} ** 1024; + var output: FL.PackedBytes(W) = undefined; + FFoR(FL).encode(W, 3, &input, &output); + + var decoded: FL.Vector = undefined; + FFoR(FL).decode(W, 3, &output, &decoded); + + try std.testing.expectEqual(input, decoded); +} diff --git a/src/helper.zig b/src/helper.zig new file mode 100644 index 0000000..2de1fac --- /dev/null +++ b/src/helper.zig @@ -0,0 +1,9 @@ +pub fn arange(comptime T: type, comptime n: comptime_int) [n]T { + @setEvalBranchQuota(10_000); + const std = @import("std"); + var result: [n]T = undefined; + for (0..n) |i| { + result[i] = @intCast(i % std.math.maxInt(T)); + } + return result; +} From a0423f00cceee42e54b8cfc1d017d3d2c4dce089 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Thu, 29 Feb 2024 13:22:25 +0000 Subject: [PATCH 2/4] Remove submodule --- FastLanes | 1 - 1 file changed, 1 deletion(-) delete mode 160000 FastLanes diff --git a/FastLanes b/FastLanes deleted file mode 160000 index 5feb61d..0000000 --- a/FastLanes +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5feb61d3b5078e5499690b51b30bd9a94556ccf6 From 
256c68b1362a100f96ab89b678ad3de3e1a88667 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Thu, 29 Feb 2024 13:31:48 +0000 Subject: [PATCH 3/4] Add Apache 2.0 --- LICENSE | 202 +++++++++++++++++++++++++++++++++++++++ build.zig | 14 +++ src/bench.zig | 14 +++ src/bench_bitpacking.zig | 14 +++ src/bench_delta.zig | 14 +++ src/bench_ffor.zig | 14 +++ src/bitpacking.zig | 14 +++ src/bitpacking_u8_u3.zig | 15 +++ src/delta.zig | 14 +++ src/fastlanez.zig | 14 +++ src/ffor.zig | 14 +++ src/helper.zig | 14 +++ 12 files changed, 357 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7a4a3ea --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/build.zig b/build.zig index 98b185c..2f1ffcb 100644 --- a/build.zig +++ b/build.zig @@ -1,3 +1,17 @@ +// (c) Copyright 2024 Fulcrum Technologies, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + const std = @import("std"); pub fn build(b: *std.Build) void { diff --git a/src/bench.zig b/src/bench.zig index e087e1e..60a8644 100644 --- a/src/bench.zig +++ b/src/bench.zig @@ -1,3 +1,17 @@ +// (c) Copyright 2024 Fulcrum Technologies, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + const std = @import("std"); const builtin = @import("builtin"); const cycleclock = @import("cycleclock"); diff --git a/src/bench_bitpacking.zig b/src/bench_bitpacking.zig index 542c74f..827d2de 100644 --- a/src/bench_bitpacking.zig +++ b/src/bench_bitpacking.zig @@ -1,3 +1,17 @@ +// (c) Copyright 2024 Fulcrum Technologies, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + const std = @import("std"); const fl = @import("./fastlanez.zig"); const Bench = @import("bench.zig").Bench; diff --git a/src/bench_delta.zig b/src/bench_delta.zig index c6cd1eb..ba93747 100644 --- a/src/bench_delta.zig +++ b/src/bench_delta.zig @@ -1,3 +1,17 @@ +// (c) Copyright 2024 Fulcrum Technologies, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + const std = @import("std"); const fl = @import("fastlanez.zig"); const Bench = @import("bench.zig").Bench; diff --git a/src/bench_ffor.zig b/src/bench_ffor.zig index 37eff30..1434c71 100644 --- a/src/bench_ffor.zig +++ b/src/bench_ffor.zig @@ -1,3 +1,17 @@ +// (c) Copyright 2024 Fulcrum Technologies, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + const std = @import("std"); const fl = @import("./fastlanez.zig"); const Bench = @import("bench.zig").Bench; diff --git a/src/bitpacking.zig b/src/bitpacking.zig index 8d0e4dc..b782acd 100644 --- a/src/bitpacking.zig +++ b/src/bitpacking.zig @@ -1,3 +1,17 @@ +// (c) Copyright 2024 Fulcrum Technologies, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + pub fn BitPacking(comptime FL: type) type { return struct { pub fn encode(comptime W: comptime_int, in: *const FL.Vector, out: *FL.PackedBytes(W)) void { diff --git a/src/bitpacking_u8_u3.zig b/src/bitpacking_u8_u3.zig index d9473e4..57a2ae8 100644 --- a/src/bitpacking_u8_u3.zig +++ b/src/bitpacking_u8_u3.zig @@ -1,4 +1,19 @@ +// (c) Copyright 2024 Fulcrum Technologies, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + //! An implementation of the "unpack 3 -> 8" algorithm ported literally from the FastLanes paper. + const fl = @import("fastlanez.zig"); const E = u8; diff --git a/src/delta.zig b/src/delta.zig index c48d45c..228b12a 100644 --- a/src/delta.zig +++ b/src/delta.zig @@ -1,3 +1,17 @@ +// (c) Copyright 2024 Fulcrum Technologies, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + pub fn Delta(comptime FastLanes: type) type { const FL = FastLanes; diff --git a/src/fastlanez.zig b/src/fastlanez.zig index 1301b1b..4a8758e 100644 --- a/src/fastlanez.zig +++ b/src/fastlanez.zig @@ -1,3 +1,17 @@ +// (c) Copyright 2024 Fulcrum Technologies, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // This unified transpose layout allows us to operate efficiently using a variety of SIMD lane widths. const ORDER: [8]u8 = .{ 0, 4, 2, 6, 1, 5, 3, 7 }; diff --git a/src/ffor.zig b/src/ffor.zig index 3d20cfd..7979607 100644 --- a/src/ffor.zig +++ b/src/ffor.zig @@ -1,3 +1,17 @@ +// (c) Copyright 2024 Fulcrum Technologies, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + /// Fused Frame of Reference (FFoR) codec. pub fn FFoR(comptime FL: type) type { return struct { diff --git a/src/helper.zig b/src/helper.zig index 2de1fac..8c83140 100644 --- a/src/helper.zig +++ b/src/helper.zig @@ -1,3 +1,17 @@ +// (c) Copyright 2024 Fulcrum Technologies, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ pub fn arange(comptime T: type, comptime n: comptime_int) [n]T { @setEvalBranchQuota(10_000); const std = @import("std"); From 48b695b3cd9bf4908a4d1af97ff285c8753b16b5 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Thu, 29 Feb 2024 13:47:14 +0000 Subject: [PATCH 4/4] Add GitHub Actions --- .github/workflows/ci.yml | 30 ++++++++++++++++++++++++++++++ README.md | 9 +++------ 2 files changed, 33 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..c767c8c --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,30 @@ +name: CI + +on: + push: + branches: [ "develop" ] + pull_request: + branches: [ "develop" ] + workflow_dispatch: { } + +permissions: + contents: read + +jobs: + build: + name: 'test' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Zig Setup + uses: goto-bus-stop/setup-zig@v2 + with: + version: "0.12.0-dev.2541+894493549" + + - name: Zig Lint - Fmt + run: zig fmt --check . + + - name: Zig Build + shell: bash + run: zig build test diff --git a/README.md b/README.md index 6a4fa9d..7762ba4 100644 --- a/README.md +++ b/README.md @@ -46,19 +46,19 @@ With this order it is possible to make adjacent cells appear in adjacent SIMD wo register is. For example, a vector of u32s will be rearranged and then iterated such that adjacent words look like this: ``` -u32 with 64-bit SIMD +2xu32 64-bit SIMD { 0, 64 } { 1, 65 } { 2, 66 } ... -u32 with 128-bit SIMD +4xu32 128-bit SIMD { 0, 64, 128, 192 } { 1, 65, 129, 193 } { 2, 66, 130, 194 } ... -u32 with 512-bit SIMD +16xu32 512-bit SIMD { 0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960 } { 1, 65, 129, 193, 257, 321, 385, 449, 513, 577, 641, 705, 769, 833, 897, 961 } { 2, 66, 130, 194, 258, 322, 386, 450, 514, 578, 642, 706, 770, 834, 898, 962 } @@ -177,6 +177,3 @@ As with all benchmarks, take the results with a pinch of salt. 
> I found the performance of benchmarks varies greatly depending on whether the inputs and outputs are stack allocated or heap allocated. I was surprised to find that often heap allocation was significantly faster than stack allocation. If anyone happens to know why, please do let me know! - -The following plot shows the performance vs the original FastLanes repository for all bit unpacking kernels on an M2 Mac: -
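
---

Editor's note (not part of the patch series): the `fastlanez transpose` test in patch 1 pins down the unified transposed layout with a handful of spot checks. A small cross-language sketch of that index mapping can make the layout easier to reason about. Python is used here only because the Zig toolchain isn't assumed; the `ORDER` table is taken from `src/fastlanez.zig`, while the 64-row/16-lane decomposition and the `src` index formula are a reconstruction inferred from the README's description and the test's expected values, not code from this patch.

```python
# Reconstruction (editorial sketch) of the FastLanes unified transposed
# layout for a 1024-element vector. ORDER comes from src/fastlanez.zig;
# the row/lane decomposition is inferred from the "fastlanez transpose"
# test in patch 1 and is not itself part of the patch.
ORDER = [0, 4, 2, 6, 1, 5, 3, 7]

def transpose_1024(values):
    """Permute 1024 values into the (reconstructed) transposed order."""
    assert len(values) == 1024
    out = [None] * 1024
    for row in range(64):  # 64 rows of 16 base lanes
        # Which of the 64 "columns" this row draws from, after the
        # 8x16 -> 16x8 block transpose and the ORDER block shuffle.
        src_base = ORDER[row % 8] * 8 + row // 8
        for lane in range(16):
            out[16 * row + lane] = values[lane * 64 + src_base]
    return out

transposed = transpose_1024(list(range(1024)))
# Matches the spot checks in the patch's "fastlanez transpose" test:
assert transposed[0] == 0 and transposed[1] == 64 and transposed[2] == 128
assert transposed[16] == 32 and transposed[1017] == 639
assert transposed[1023] == 1023
```

Running this against `range(1024)` reproduces all six expectations from the Zig test (`transposed[0] == 0`, `[1] == 64`, `[2] == 128`, `[16] == 32`, `[1017] == 639`, `[1023] == 1023`), and the output is a permutation, which is a useful sanity check that the reconstructed formula at least agrees with the patch on those points.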