PARQUET-494: Implement DictionaryEncoder and test dictionary decoding

I incorporated quite a bit of code from Impala for this patch, but did a bunch of work to get everything working. In particular, I wasn't happy with the hash table implementation in `dict-encoder.h` and so have written a simple new one that we can benchmark and tune as necessary. The simplest way to pull in the DictEncoder (PARQUET-493) was to also bring in the `MemPool` implementation, suitably trimmed down. We can continue to refactor this as needed for parquet-cpp. I also did some light refactoring using `TYPED_TEST` in `plain-encoding-test` (now `encoding-test`). Author: Wes McKinney <wesm@apache.org> Closes apache#64 from wesm/PARQUET-494 and squashes the following commits: c634abe [Wes McKinney] Refactor to create TestEncoderBase a3a563a [Wes McKinney] Consolidate dictionary encoding code 2cc4ffe [Wes McKinney] Retrieve type_length() only once in PlainDecoder ctor 20ccd9e [Wes McKinney] Remove DictionaryEncoder shim layer for now dcfc0aa [Wes McKinney] Remove redundant Int96 comparison d98a2c0 [Wes McKinney] Dictionary encoding for booleans throws exception 05414f0 [Wes McKinney] Test dictionary encoding more types 9a5b1a4 [Wes McKinney] Enable include_order linting per PARQUET-539 f3f0efc [Wes McKinney] IWYU cleaning d4191c6 [Wes McKinney] Add header installs, fix clang warning 1347b13 [Wes McKinney] Rename plain-encoding-test to encoding-test 09bf0fa [Wes McKinney] Fix bugs and add dictionary repeats 2e6af48 [Wes McKinney] Fix some bugs. FixedLenByteArray remains to get working. 69b5b69 [Wes McKinney] Refactor test fixtures to be less coupled to state. Progress on getting dict encoding working 6b23716 [Wes McKinney] Create reusable DataType structs for test fixtures and other compile-time type resolution matters 67883fd [Wes McKinney] Bunch of combined work for dict encoding support: Change-Id: I0fe7d47373b9da106e700381bee6538199af8a69
wesm · Feb 26, 2016 · ae1bb51 · ae1bb51
1 parent f0f40f6
commit ae1bb51
Show file tree

Hide file tree

Showing 34 changed files with 1,839 additions and 364 deletions.
diff --git a/cpp/src/parquet/column/column-reader-test.cc b/cpp/src/parquet/column/column-reader-test.cc
@@ -20,6 +20,7 @@
 #include <algorithm>
 #include <cstdint>
 #include <cstdlib>
+#include <limits>
 #include <memory>
 #include <string>
 #include <vector>

diff --git a/cpp/src/parquet/column/levels-test.cc b/cpp/src/parquet/column/levels-test.cc
@@ -15,13 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <gtest/gtest.h>
 #include <cstdint>
 #include <memory>
 #include <vector>
 #include <string>
 
-#include <gtest/gtest.h>
-
 #include "parquet/column/levels.h"
 #include "parquet/types.h"
 

diff --git a/cpp/src/parquet/column/reader.cc b/cpp/src/parquet/column/reader.cc
@@ -52,8 +52,9 @@ void TypedColumnReader<TYPE>::ConfigureDictionary(const DictionaryPage* page) {
   //
   // TODO(wesm): investigate whether this all-or-nothing decoding of the
   // dictionary makes sense and whether performance can be improved
-  std::shared_ptr<DecoderType> decoder(
-      new DictionaryDecoder<TYPE>(descr_, &dictionary));
+
+  auto decoder = std::make_shared<DictionaryDecoder<TYPE> >(descr_);
+  decoder->SetDict(&dictionary);
 
   decoders_[encoding] = decoder;
   current_decoder_ = decoders_[encoding].get();

diff --git a/cpp/src/parquet/column/scanner-test.cc b/cpp/src/parquet/column/scanner-test.cc
@@ -40,33 +40,17 @@ namespace parquet_cpp {
 
 using schema::NodePtr;
 
-bool operator==(const Int96& a, const Int96& b) {
-  return a.value[0] == b.value[0] &&
-    a.value[1] == b.value[1] &&
-    a.value[2] == b.value[2];
-}
-
-bool operator==(const ByteArray& a, const ByteArray& b) {
-  return a.len == b.len && 0 == memcmp(a.ptr, b.ptr, a.len);
-}
-
 static int FLBA_LENGTH = 12;
 bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) {
   return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH);
 }
 
 namespace test {
 
-template <int N> class TypeValue {
- public:
-  static const int value = N;
-};
-template <int N> const int TypeValue<N>::value;
-
-template <typename TYPE>
+template <typename Type>
 class TestFlatScanner : public ::testing::Test {
  public:
-  typedef typename type_traits<TYPE::value>::value_type T;
+  typedef typename Type::c_type T;
 
   void InitValues() {
     random_numbers(num_values_, 0, std::numeric_limits<T>::min(),
@@ -106,7 +90,7 @@ class TestFlatScanner : public ::testing::Test {
     // Create values
     values_.resize(num_values_);
     InitValues();
-    Paginate<TYPE::value>(d, values_, def_levels_, max_def_level,
+    Paginate<Type::type_num>(d, values_, def_levels_, max_def_level,
         rep_levels_, max_rep_level, levels_per_page, values_per_page, pages_);
   }
 
@@ -116,8 +100,8 @@ class TestFlatScanner : public ::testing::Test {
   }
 
   void CheckResults(int batch_size, const ColumnDescriptor *d) {
-    TypedScanner<TYPE::value>* scanner =
-      reinterpret_cast<TypedScanner<TYPE::value>* >(scanner_.get());
+    TypedScanner<Type::type_num>* scanner =
+      reinterpret_cast<TypedScanner<Type::type_num>* >(scanner_.get());
     T val;
     bool is_null;
     int16_t def_level;
@@ -158,14 +142,11 @@ class TestFlatScanner : public ::testing::Test {
   void InitDescriptors(std::shared_ptr<ColumnDescriptor>& d1,
       std::shared_ptr<ColumnDescriptor>& d2, std::shared_ptr<ColumnDescriptor>& d3) {
     NodePtr type;
-    type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED,
-        static_cast<Type::type>(TYPE::value));
+    type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::type_num);
     d1.reset(new ColumnDescriptor(type, 0, 0));
-    type = schema::PrimitiveNode::Make("c2", Repetition::OPTIONAL,
-        static_cast<Type::type>(TYPE::value));
+    type = schema::PrimitiveNode::Make("c2", Repetition::OPTIONAL, Type::type_num);
     d2.reset(new ColumnDescriptor(type, 4, 0));
-    type = schema::PrimitiveNode::Make("c3", Repetition::REPEATED,
-        static_cast<Type::type>(TYPE::value));
+    type = schema::PrimitiveNode::Make("c3", Repetition::REPEATED, Type::type_num);
     d3.reset(new ColumnDescriptor(type, 4, 2));
   }
 
@@ -194,18 +175,18 @@ class TestFlatScanner : public ::testing::Test {
 };
 
 template<>
-void TestFlatScanner<TypeValue<Type::BOOLEAN> >::InitValues() {
+void TestFlatScanner<BooleanType>::InitValues() {
   values_ = flip_coins(num_values_, 0);
 }
 
 template<>
-void TestFlatScanner<TypeValue<Type::INT96> >::InitValues() {
+void TestFlatScanner<Int96Type>::InitValues() {
   random_Int96_numbers(num_values_, 0, std::numeric_limits<int32_t>::min(),
       std::numeric_limits<int32_t>::max(), values_.data());
 }
 
 template<>
-void TestFlatScanner<TypeValue<Type::BYTE_ARRAY> >::InitValues() {
+void TestFlatScanner<ByteArrayType>::InitValues() {
   int max_byte_array_len = 12;
   int num_bytes = max_byte_array_len + sizeof(uint32_t);
   size_t nbytes = num_values_ * num_bytes;
@@ -215,15 +196,15 @@ void TestFlatScanner<TypeValue<Type::BYTE_ARRAY> >::InitValues() {
 }
 
 template<>
-void TestFlatScanner<TypeValue<Type::FIXED_LEN_BYTE_ARRAY> >::InitValues() {
+void TestFlatScanner<FLBAType>::InitValues() {
   size_t nbytes = num_values_ * FLBA_LENGTH;
   data_buffer_.resize(nbytes);
   random_fixed_byte_array(num_values_, 0, data_buffer_.data(), FLBA_LENGTH,
       values_.data());
 }
 
 template<>
-void TestFlatScanner<TypeValue<Type::FIXED_LEN_BYTE_ARRAY> >::InitDescriptors(
+void TestFlatScanner<FLBAType>::InitDescriptors(
     std::shared_ptr<ColumnDescriptor>& d1, std::shared_ptr<ColumnDescriptor>& d2,
     std::shared_ptr<ColumnDescriptor>& d3) {
   NodePtr type = schema::PrimitiveNode::MakeFLBA("c1", Repetition::REQUIRED,
@@ -237,18 +218,13 @@ void TestFlatScanner<TypeValue<Type::FIXED_LEN_BYTE_ARRAY> >::InitDescriptors(
   d3.reset(new ColumnDescriptor(type, 4, 2));
 }
 
-typedef TestFlatScanner<TypeValue<Type::FIXED_LEN_BYTE_ARRAY>> TestFlatFLBAScanner;
+typedef TestFlatScanner<FLBAType> TestFlatFLBAScanner;
 
 static int num_levels_per_page = 100;
 static int num_pages = 20;
 static int batch_size = 32;
 
-typedef ::testing::Types<TypeValue<Type::BOOLEAN>, TypeValue<Type::INT32>,
-    TypeValue<Type::INT64>, TypeValue<Type::INT96>, TypeValue<Type::FLOAT>,
-    TypeValue<Type::DOUBLE>, TypeValue<Type::BYTE_ARRAY>,
-    TypeValue<Type::FIXED_LEN_BYTE_ARRAY> > Primitives;
-
-TYPED_TEST_CASE(TestFlatScanner, Primitives);
+TYPED_TEST_CASE(TestFlatScanner, ParquetTypes);
 
 TYPED_TEST(TestFlatScanner, TestScanner) {
   this->ExecuteAll(num_pages, num_levels_per_page, batch_size);

diff --git a/cpp/src/parquet/compression/codec-test.cc b/cpp/src/parquet/compression/codec-test.cc
@@ -15,14 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <gtest/gtest.h>
 #include <cstdint>
 #include <string>
 #include <vector>
 
-#include <gtest/gtest.h>
-#include "parquet/util/test-common.h"
-
 #include "parquet/compression/codec.h"
+#include "parquet/util/test-common.h"
 
 using std::string;
 using std::vector;

diff --git a/cpp/src/parquet/compression/codec.h b/cpp/src/parquet/compression/codec.h
@@ -18,11 +18,11 @@
 #ifndef PARQUET_COMPRESSION_CODEC_H
 #define PARQUET_COMPRESSION_CODEC_H
 
+#include <zlib.h>
+
 #include <cstdint>
 #include <memory>
 
-#include <zlib.h>
-
 #include "parquet/exception.h"
 #include "parquet/types.h"
 

diff --git a/cpp/src/parquet/compression/lz4-codec.cc b/cpp/src/parquet/compression/lz4-codec.cc
@@ -15,11 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "parquet/compression/codec.h"
-
 #include <lz4.h>
 #include <cstdint>
 
+#include "parquet/compression/codec.h"
 #include "parquet/exception.h"
 
 namespace parquet_cpp {

diff --git a/cpp/src/parquet/compression/snappy-codec.cc b/cpp/src/parquet/compression/snappy-codec.cc
@@ -15,12 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "parquet/compression/codec.h"
-
 #include <snappy.h>
 #include <cstdint>
 #include <cstdlib>
 
+#include "parquet/compression/codec.h"
 #include "parquet/exception.h"
 
 namespace parquet_cpp {

diff --git a/cpp/src/parquet/encodings/CMakeLists.txt b/cpp/src/parquet/encodings/CMakeLists.txt
@@ -26,4 +26,4 @@ install(FILES
   plain-encoding.h
   DESTINATION include/parquet/encodings)
 
-ADD_PARQUET_TEST(plain-encoding-test)
+ADD_PARQUET_TEST(encoding-test)