Skip to content

Commit

Permalink
Add version to KLL serialization and make sure backward compatilibity (
Browse files Browse the repository at this point in the history
…#3623)

Summary: Pull Request resolved: #3623

Reviewed By: kgpai

Differential Revision: D42315342

fbshipit-source-id: 2da7725ad6ecaf1911b4c0f1fa808cfa90de7407
  • Loading branch information
Yuhta authored and facebook-github-bot committed Jan 4, 2023
1 parent 011381e commit 12ea726
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 13 deletions.
13 changes: 12 additions & 1 deletion velox/functions/lib/KllSketch-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ namespace detail {

constexpr uint8_t kMaxLevel = 60;

// Current version number for the serialzation format. Everytime the
// serialization format changes, this needs to be increased and a new
// deserializer should be added. An adapter to convert serialization of
// previous version to serialization of current version should also be added.
constexpr int16_t kVersion = 1;

uint32_t computeTotalCapacity(uint32_t k, uint8_t numLevels);

uint32_t levelCapacity(uint32_t k, uint8_t numLevels, uint8_t height);
Expand Down Expand Up @@ -733,7 +739,8 @@ KllSketch<T, A, C> KllSketch<T, A, C>::fromView(

template <typename T, typename A, typename C>
size_t KllSketch<T, A, C>::serializedByteSize() const {
size_t ans = sizeof k_ + sizeof n_ + sizeof minValue_ + sizeof maxValue_;
size_t ans = sizeof detail::kVersion + sizeof k_ + sizeof n_;
ans += sizeof minValue_ + sizeof maxValue_;
ans += sizeof(size_t) + sizeof(T) * items_.size();
ans += sizeof(size_t) + sizeof(uint32_t) * levels_.size();
return ans;
Expand All @@ -742,6 +749,7 @@ size_t KllSketch<T, A, C>::serializedByteSize() const {
template <typename T, typename A, typename C>
void KllSketch<T, A, C>::serialize(char* out) const {
size_t i = 0;
detail::write(detail::kVersion, out, i);
detail::write(k_, out, i);
detail::write(n_, out, i);
detail::write(minValue_, out, i);
Expand All @@ -754,6 +762,9 @@ void KllSketch<T, A, C>::serialize(char* out) const {
template <typename T, typename A, typename C>
void KllSketch<T, A, C>::View::deserialize(const char* data) {
size_t i = 0;
int16_t version;
detail::read(data, i, version);
VELOX_CHECK_EQ(version, detail::kVersion, "Unsupported version: {}", version);
detail::read(data, i, k);
detail::read(data, i, n);
detail::read(data, i, minValue);
Expand Down
3 changes: 3 additions & 0 deletions velox/functions/lib/KllSketch.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ struct KllSketch {
/// Get frequencies of items being tracked. The result is sorted by item.
std::vector<std::pair<T, uint64_t>> getFrequencies() const;

/// Internal API, do not use outside Velox.
struct View {
uint32_t k;
size_t n;
Expand All @@ -159,8 +160,10 @@ struct KllSketch {
void deserialize(const char* FOLLY_NONNULL);
};

/// Internal API, do not use outside Velox.
void mergeViews(const folly::Range<const View*>& views);

/// Internal API, do not use outside Velox.
View toView() const;

private:
Expand Down
6 changes: 5 additions & 1 deletion velox/functions/lib/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ add_executable(
Re2FunctionsTest.cpp
ZetaDistributionTest.cpp)

add_test(velox_functions_lib_test velox_functions_lib_test)
add_test(
NAME velox_functions_lib_test
COMMAND velox_functions_lib_test
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})

target_link_libraries(
velox_functions_lib_test
Expand All @@ -33,6 +36,7 @@ target_link_libraries(
velox_exec_test_lib
velox_expression
velox_memory
velox_dwio_common_test_utils
gtest
gtest_main
gmock
Expand Down
67 changes: 56 additions & 11 deletions velox/functions/lib/tests/KllSketchTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
*/

#include <gtest/gtest.h>
#include <fstream>

#include "velox/common/memory/HashStringAllocator.h"
#include "velox/dwio/common/tests/utils/DataFiles.h"
#include "velox/functions/lib/KllSketch.h"

namespace facebook::velox::functions::kll::test {
Expand All @@ -36,6 +38,23 @@ std::vector<double> linspace(int len) {
return out;
}

std::string getDataFilePath(const std::string& name) {
return velox::test::getDataFilePath(
"velox/functions/lib/tests", fmt::format("data/{}", name));
}

void insertRandomData(int seed, int n, KllSketch<double>& kll, double* values) {
std::default_random_engine gen(seed);
std::normal_distribution<> dist;
for (int i = 0; i < n; ++i) {
auto v = dist(gen);
kll.insert(v);
if (values) {
values[i] = v;
}
}
}

TEST(KllSketchTest, oneItem) {
KllSketch<double> kll;
EXPECT_EQ(kll.totalCount(), 0);
Expand Down Expand Up @@ -88,13 +107,8 @@ TEST(KllSketchTest, randomInput) {
constexpr int N = 1e5;
constexpr int M = 1001;
KllSketch<double> kll(kDefaultK, {}, 0);
std::default_random_engine gen(0);
std::normal_distribution<> dist;
double values[N];
for (int i = 0; i < N; ++i) {
values[i] = dist(gen);
kll.insert(values[i]);
}
insertRandomData(0, N, kll, values);
EXPECT_EQ(kll.totalCount(), N);
kll.finish();
std::sort(std::begin(values), std::end(values));
Expand Down Expand Up @@ -207,20 +221,51 @@ TEST(KllSketchTest, kFromEpsilon) {
TEST(KllSketchTest, serialize) {
constexpr int N = 1e5;
constexpr int M = 1001;
KllSketch<double> kll;
for (int i = 0; i < N; ++i) {
kll.insert(i);
}
kll.finish();
KllSketch<double> kll(kDefaultK, {}, 0);
insertRandomData(0, N, kll, nullptr);
kll.compact();
std::vector<char> data(kll.serializedByteSize());
kll.serialize(data.data());
#if 0
// Enable this to write serialization to a file for backward compatibility
// test.
auto path = getDataFilePath(fmt::format("kll-ver-{}", detail::kVersion));
std::ofstream output(path);
output.write(data.data(), data.size());
ASSERT_FALSE(output.fail());
#endif
auto kll2 = KllSketch<double>::deserialize(data.data());
auto q = linspace(M);
auto v = kll.estimateQuantiles(folly::Range(q.begin(), q.end()));
auto v2 = kll2.estimateQuantiles(folly::Range(q.begin(), q.end()));
EXPECT_EQ(v, v2);
}

TEST(KllSketchTest, deserialize) {
constexpr int N = 1e5;
constexpr int M = 1001;
auto readFile = [](const std::string& path) {
std::ifstream input(path);
VELOX_CHECK(!input.fail());
std::stringstream buf;
buf << input.rdbuf();
auto data = buf.str();
return KllSketch<double>::deserialize(data.data());
};
auto currentVersion =
readFile(getDataFilePath(fmt::format("kll-ver-{}", detail::kVersion)));
for (int version = 1; version < detail::kVersion; ++version) {
auto path = getDataFilePath(fmt::format("kll-ver-{}", version));
SCOPED_TRACE(path);
auto oldVersion = readFile(path);
auto q = linspace(M);
auto v = oldVersion.estimateQuantiles(folly::Range(q.begin(), q.end()));
auto v2 =
currentVersion.estimateQuantiles(folly::Range(q.begin(), q.end()));
EXPECT_EQ(v, v2);
}
}

TEST(KllSketchTest, compact) {
constexpr int N = 1e5;
KllSketch<double> kll(kFromEpsilon(0.001));
Expand Down
Binary file added velox/functions/lib/tests/data/kll-ver-1
Binary file not shown.

0 comments on commit 12ea726

Please sign in to comment.