Skip to content

Commit

Permalink
PARQUET-494: Implement DictionaryEncoder and test dictionary decoding
Browse files Browse the repository at this point in the history
I incorporated quite a bit of code from Impala for this patch, but did a bunch of work to get everything working. In particular, I wasn't happy with the hash table implementation in `dict-encoder.h` and so have written a simple new one that we can benchmark and tune as necessary.

The simplest way to pull in the DictEncoder (PARQUET-493) was to also bring in the `MemPool` implementation, suitably trimmed down. We can continue to refactor this as needed for parquet-cpp.

I also did some light refactoring using `TYPED_TEST` in `plain-encoding-test` (now `encoding-test`).

Author: Wes McKinney <wesm@apache.org>

Closes apache#64 from wesm/PARQUET-494 and squashes the following commits:

c634abe [Wes McKinney] Refactor to create TestEncoderBase
a3a563a [Wes McKinney] Consolidate dictionary encoding code
2cc4ffe [Wes McKinney] Retrieve type_length() only once in PlainDecoder ctor
20ccd9e [Wes McKinney] Remove DictionaryEncoder shim layer for now
dcfc0aa [Wes McKinney] Remove redundant Int96 comparison
d98a2c0 [Wes McKinney] Dictionary encoding for booleans throws exception
05414f0 [Wes McKinney] Test dictionary encoding more types
9a5b1a4 [Wes McKinney] Enable include_order linting per PARQUET-539
f3f0efc [Wes McKinney] IWYU cleaning
d4191c6 [Wes McKinney] Add header installs, fix clang warning
1347b13 [Wes McKinney] Rename plain-encoding-test to encoding-test
09bf0fa [Wes McKinney] Fix bugs and add dictionary repeats
2e6af48 [Wes McKinney] Fix some bugs. FixedLenByteArray remains to get working.
69b5b69 [Wes McKinney] Refactor test fixtures to be less coupled to state. Progress on getting dict encoding working
6b23716 [Wes McKinney] Create reusable DataType structs for test fixtures and other compile-time type resolution matters
67883fd [Wes McKinney] Bunch of combined work for dict encoding support:

Change-Id: I0fe7d47373b9da106e700381bee6538199af8a69
  • Loading branch information
wesm authored and julienledem committed Feb 26, 2016
1 parent f0f40f6 commit ae1bb51
Show file tree
Hide file tree
Showing 34 changed files with 1,839 additions and 364 deletions.
1 change: 1 addition & 0 deletions cpp/src/parquet/column/column-reader-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <memory>
#include <string>
#include <vector>
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/parquet/column/levels-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,12 @@
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest.h>
#include <cstdint>
#include <memory>
#include <vector>
#include <string>

#include <gtest/gtest.h>

#include "parquet/column/levels.h"
#include "parquet/types.h"

Expand Down
5 changes: 3 additions & 2 deletions cpp/src/parquet/column/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,9 @@ void TypedColumnReader<TYPE>::ConfigureDictionary(const DictionaryPage* page) {
//
// TODO(wesm): investigate whether this all-or-nothing decoding of the
// dictionary makes sense and whether performance can be improved
std::shared_ptr<DecoderType> decoder(
new DictionaryDecoder<TYPE>(descr_, &dictionary));

auto decoder = std::make_shared<DictionaryDecoder<TYPE> >(descr_);
decoder->SetDict(&dictionary);

decoders_[encoding] = decoder;
current_decoder_ = decoders_[encoding].get();
Expand Down
54 changes: 15 additions & 39 deletions cpp/src/parquet/column/scanner-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,33 +40,17 @@ namespace parquet_cpp {

using schema::NodePtr;

bool operator==(const Int96& a, const Int96& b) {
return a.value[0] == b.value[0] &&
a.value[1] == b.value[1] &&
a.value[2] == b.value[2];
}

bool operator==(const ByteArray& a, const ByteArray& b) {
return a.len == b.len && 0 == memcmp(a.ptr, b.ptr, a.len);
}

static int FLBA_LENGTH = 12;
bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) {
return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH);
}

namespace test {

template <int N> class TypeValue {
public:
static const int value = N;
};
template <int N> const int TypeValue<N>::value;

template <typename TYPE>
template <typename Type>
class TestFlatScanner : public ::testing::Test {
public:
typedef typename type_traits<TYPE::value>::value_type T;
typedef typename Type::c_type T;

void InitValues() {
random_numbers(num_values_, 0, std::numeric_limits<T>::min(),
Expand Down Expand Up @@ -106,7 +90,7 @@ class TestFlatScanner : public ::testing::Test {
// Create values
values_.resize(num_values_);
InitValues();
Paginate<TYPE::value>(d, values_, def_levels_, max_def_level,
Paginate<Type::type_num>(d, values_, def_levels_, max_def_level,
rep_levels_, max_rep_level, levels_per_page, values_per_page, pages_);
}

Expand All @@ -116,8 +100,8 @@ class TestFlatScanner : public ::testing::Test {
}

void CheckResults(int batch_size, const ColumnDescriptor *d) {
TypedScanner<TYPE::value>* scanner =
reinterpret_cast<TypedScanner<TYPE::value>* >(scanner_.get());
TypedScanner<Type::type_num>* scanner =
reinterpret_cast<TypedScanner<Type::type_num>* >(scanner_.get());
T val;
bool is_null;
int16_t def_level;
Expand Down Expand Up @@ -158,14 +142,11 @@ class TestFlatScanner : public ::testing::Test {
void InitDescriptors(std::shared_ptr<ColumnDescriptor>& d1,
std::shared_ptr<ColumnDescriptor>& d2, std::shared_ptr<ColumnDescriptor>& d3) {
NodePtr type;
type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED,
static_cast<Type::type>(TYPE::value));
type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::type_num);
d1.reset(new ColumnDescriptor(type, 0, 0));
type = schema::PrimitiveNode::Make("c2", Repetition::OPTIONAL,
static_cast<Type::type>(TYPE::value));
type = schema::PrimitiveNode::Make("c2", Repetition::OPTIONAL, Type::type_num);
d2.reset(new ColumnDescriptor(type, 4, 0));
type = schema::PrimitiveNode::Make("c3", Repetition::REPEATED,
static_cast<Type::type>(TYPE::value));
type = schema::PrimitiveNode::Make("c3", Repetition::REPEATED, Type::type_num);
d3.reset(new ColumnDescriptor(type, 4, 2));
}

Expand Down Expand Up @@ -194,18 +175,18 @@ class TestFlatScanner : public ::testing::Test {
};

template<>
void TestFlatScanner<TypeValue<Type::BOOLEAN> >::InitValues() {
void TestFlatScanner<BooleanType>::InitValues() {
values_ = flip_coins(num_values_, 0);
}

template<>
void TestFlatScanner<TypeValue<Type::INT96> >::InitValues() {
void TestFlatScanner<Int96Type>::InitValues() {
random_Int96_numbers(num_values_, 0, std::numeric_limits<int32_t>::min(),
std::numeric_limits<int32_t>::max(), values_.data());
}

template<>
void TestFlatScanner<TypeValue<Type::BYTE_ARRAY> >::InitValues() {
void TestFlatScanner<ByteArrayType>::InitValues() {
int max_byte_array_len = 12;
int num_bytes = max_byte_array_len + sizeof(uint32_t);
size_t nbytes = num_values_ * num_bytes;
Expand All @@ -215,15 +196,15 @@ void TestFlatScanner<TypeValue<Type::BYTE_ARRAY> >::InitValues() {
}

template<>
void TestFlatScanner<TypeValue<Type::FIXED_LEN_BYTE_ARRAY> >::InitValues() {
void TestFlatScanner<FLBAType>::InitValues() {
size_t nbytes = num_values_ * FLBA_LENGTH;
data_buffer_.resize(nbytes);
random_fixed_byte_array(num_values_, 0, data_buffer_.data(), FLBA_LENGTH,
values_.data());
}

template<>
void TestFlatScanner<TypeValue<Type::FIXED_LEN_BYTE_ARRAY> >::InitDescriptors(
void TestFlatScanner<FLBAType>::InitDescriptors(
std::shared_ptr<ColumnDescriptor>& d1, std::shared_ptr<ColumnDescriptor>& d2,
std::shared_ptr<ColumnDescriptor>& d3) {
NodePtr type = schema::PrimitiveNode::MakeFLBA("c1", Repetition::REQUIRED,
Expand All @@ -237,18 +218,13 @@ void TestFlatScanner<TypeValue<Type::FIXED_LEN_BYTE_ARRAY> >::InitDescriptors(
d3.reset(new ColumnDescriptor(type, 4, 2));
}

typedef TestFlatScanner<TypeValue<Type::FIXED_LEN_BYTE_ARRAY>> TestFlatFLBAScanner;
typedef TestFlatScanner<FLBAType> TestFlatFLBAScanner;

static int num_levels_per_page = 100;
static int num_pages = 20;
static int batch_size = 32;

typedef ::testing::Types<TypeValue<Type::BOOLEAN>, TypeValue<Type::INT32>,
TypeValue<Type::INT64>, TypeValue<Type::INT96>, TypeValue<Type::FLOAT>,
TypeValue<Type::DOUBLE>, TypeValue<Type::BYTE_ARRAY>,
TypeValue<Type::FIXED_LEN_BYTE_ARRAY> > Primitives;

TYPED_TEST_CASE(TestFlatScanner, Primitives);
TYPED_TEST_CASE(TestFlatScanner, ParquetTypes);

TYPED_TEST(TestFlatScanner, TestScanner) {
this->ExecuteAll(num_pages, num_levels_per_page, batch_size);
Expand Down
5 changes: 2 additions & 3 deletions cpp/src/parquet/compression/codec-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,13 @@
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest.h>
#include <cstdint>
#include <string>
#include <vector>

#include <gtest/gtest.h>
#include "parquet/util/test-common.h"

#include "parquet/compression/codec.h"
#include "parquet/util/test-common.h"

using std::string;
using std::vector;
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/parquet/compression/codec.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
#ifndef PARQUET_COMPRESSION_CODEC_H
#define PARQUET_COMPRESSION_CODEC_H

#include <zlib.h>

#include <cstdint>
#include <memory>

#include <zlib.h>

#include "parquet/exception.h"
#include "parquet/types.h"

Expand Down
3 changes: 1 addition & 2 deletions cpp/src/parquet/compression/lz4-codec.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,10 @@
// specific language governing permissions and limitations
// under the License.

#include "parquet/compression/codec.h"

#include <lz4.h>
#include <cstdint>

#include "parquet/compression/codec.h"
#include "parquet/exception.h"

namespace parquet_cpp {
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/parquet/compression/snappy-codec.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@
// specific language governing permissions and limitations
// under the License.

#include "parquet/compression/codec.h"

#include <snappy.h>
#include <cstdint>
#include <cstdlib>

#include "parquet/compression/codec.h"
#include "parquet/exception.h"

namespace parquet_cpp {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/encodings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ install(FILES
plain-encoding.h
DESTINATION include/parquet/encodings)

ADD_PARQUET_TEST(plain-encoding-test)
ADD_PARQUET_TEST(encoding-test)
Loading

0 comments on commit ae1bb51

Please sign in to comment.