Skip to content

Commit d868ad2

Browse files
authored
[DUCKDB] Native duckdb lance reader (lancedb#347)
1 parent d3edb17 commit d868ad2

File tree

9 files changed

+532
-5
lines changed

9 files changed

+532
-5
lines changed

.clang-format

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ BasedOnStyle: Google
22
ColumnLimit: 100
33
BinPackArguments: false
44
BinPackParameters: false
5+
ReferenceAlignment: Left
56
---
67
Language: Proto
78
BasedOnStyle: Google

.github/workflows/duckdb.yml

+16
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,20 @@ jobs:
1414
defaults:
1515
run:
1616
working-directory: ./integration/duckdb
17+
env:
18+
ArrowVersion: 10.0.1-1
1719
steps:
1820
- uses: actions/checkout@v2
1921
- name: ccache
2022
uses: hendrikmuhs/ccache-action@v1
23+
- name: Install dependencies
24+
run: |
25+
sudo apt update
26+
sudo apt install -y -V ca-certificates lsb-release wget
27+
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
28+
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
29+
sudo apt update
30+
sudo apt install -y -V libarrow-dev=${ArrowVersion} libarrow-dataset-dev=${ArrowVersion} libparquet-dev=${ArrowVersion}
2131
- name: Cmake
2232
run: cmake -B build
2333
- name: Build
@@ -30,6 +40,12 @@ jobs:
3040
working-directory: ./integration/duckdb
3141
steps:
3242
- uses: actions/checkout@v2
43+
- name: Install dependencies
44+
run: |
45+
brew update
46+
cd $(brew --repository)
47+
git checkout 3.6.8 # Arrow 10.0
48+
brew install apache-arrow
3349
- name: Cmake
3450
run: cmake -B build
3551
- name: Build

integration/duckdb/CMakeLists.txt

+16-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ if(POLICY CMP0135)
44
cmake_policy(SET CMP0135 NEW)
55
endif()
66

7-
add_compile_options(-mf16c) # opencv
7+
#add_compile_options(-mf16c) # opencv
88

99
project(lance_duckdb CXX)
1010
option(LANCE_BUILD_PYTORCH "Build with PyTorch" TRUE)
@@ -88,7 +88,7 @@ endif()
8888

8989
FetchContent_MakeAvailable(${available_contents})
9090

91-
set(CMAKE_CXX_STANDARD 17)
91+
set(CMAKE_CXX_STANDARD 20)
9292
set(CMAKE_CXX_STANDARD_REQUIRED True)
9393

9494
include_directories(${duckdb_SOURCE_DIR}/src/include)
@@ -109,12 +109,24 @@ if(LANCE_BUILD_PYTORCH)
109109
include_directories(${OpenCV_INCLUDE_DIRS})
110110
endif()
111111

112+
# Add lance core as dependency
113+
find_package(Arrow REQUIRED)
114+
find_package(ArrowDataset REQUIRED)
115+
include_directories(${CMAKE_BINARY_DIR}/lance/src ../../cpp/include ../../cpp/src)
116+
add_subdirectory(../../cpp lance)
117+
112118
include_directories(src)
113119

114120
set(LANCE_EXT_SOURCE_COMMON
121+
src/lance/duckdb/lance_reader.cc
122+
src/lance/duckdb/lance_reader.h
115123
src/lance/duckdb/lance-extension.cc
124+
src/lance/duckdb/lance.cc
125+
src/lance/duckdb/lance.h
116126
src/lance/duckdb/list_functions.cc
117-
src/lance/duckdb/vector_functions.cc)
127+
src/lance/duckdb/list_functions.h
128+
src/lance/duckdb/vector_functions.cc
129+
)
118130

119131
set(LANCE_EXT_SOURCE_ML
120132
src/lance/duckdb/ml/catalog.cc
@@ -132,6 +144,7 @@ endif()
132144
# add_library(lance_extension STATIC ${LANCE_EXT_SOURCES})
133145
set(PARAMETERS "-warnings")
134146
build_loadable_extension(lance ${PARAMETERS} ${LANCE_EXT_SOURCES})
147+
target_link_libraries(lance_loadable_extension lance ArrowDataset::arrow_dataset_shared fmt::fmt)
135148

136149
if(LANCE_BUILD_PYTORCH)
137150
target_link_libraries(lance_loadable_extension "${TORCH_LIBRARIES}"

integration/duckdb/src/lance/duckdb/lance-extension.cc

+10-2
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@
1818

1919
#include <duckdb.hpp>
2020

21+
#include "lance/duckdb/lance_reader.h"
2122
#include "lance/duckdb/list_functions.h"
22-
#include "lance/duckdb/vector_functions.h"
2323
#include "lance/duckdb/ml/functions.h"
24+
#include "lance/duckdb/vector_functions.h"
2425

2526
namespace duckdb {
2627

@@ -29,6 +30,7 @@ void LanceExtension::Load(::duckdb::DuckDB &db) {
2930
con.BeginTransaction();
3031
auto &context = *con.context;
3132
auto &catalog = ::duckdb::Catalog::GetCatalog(context);
33+
auto &config = DBConfig::GetConfig(*db.instance);
3234

3335
for (auto &func : lance::duckdb::GetListFunctions()) {
3436
catalog.CreateFunction(context, func.get());
@@ -46,11 +48,17 @@ void LanceExtension::Load(::duckdb::DuckDB &db) {
4648
catalog.CreateTableFunction(context, func.get());
4749
}
4850

51+
auto scan_func = lance::duckdb::GetLanceReaderFunction();
52+
::duckdb::CreateTableFunctionInfo scan(scan_func);
53+
catalog.CreateTableFunction(context, &scan);
54+
55+
config.replacement_scans.emplace_back(lance::duckdb::LanceScanReplacement);
56+
4957
con.Commit();
5058
}
5159

5260
std::string LanceExtension::Name() { return {"lance"}; }
53-
};
61+
}; // namespace duckdb
5462

5563
extern "C" {
5664

integration/duckdb/src/lance/duckdb/lance-extension.h

+2
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@ namespace duckdb {
2222

2323
class LanceExtension : public Extension {
2424
public:
25+
2526
void Load(DuckDB &db) override;
27+
2628
std::string Name() override;
2729
};
2830

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
// Copyright 2022 Lance Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
//
15+
16+
#include "lance/duckdb/lance.h"
17+
18+
#include <arrow/type.h>
19+
20+
#include <duckdb/common/exception.hpp>
21+
#include <vector>
22+
23+
namespace lance::duckdb {
24+
25+
namespace {
26+
27+
/// Convert a dictionary-encoded Arrow type: the resulting DuckDB type is the
/// one that corresponds to the dictionary's value type.
inline ::duckdb::LogicalType ToLogicalType(const ::arrow::DictionaryType& dtype) {
  const auto values = dtype.value_type();
  return lance::duckdb::ToLogicalType(*values);
}
30+
31+
/// Convert an Arrow struct type into a DuckDB STRUCT, converting every child
/// field's type and keeping the child names and their order.
inline ::duckdb::LogicalType ToLogicalType(const ::arrow::StructType& struct_type) {
  const auto& arrow_fields = struct_type.fields();
  ::duckdb::child_list_t<::duckdb::LogicalType> members;
  members.reserve(arrow_fields.size());
  for (const auto& field : arrow_fields) {
    // Build the (name, type) entry in place.
    members.emplace_back(field->name(), lance::duckdb::ToLogicalType(*field->type()));
  }
  return ::duckdb::LogicalType::STRUCT(members);
}
39+
40+
template <typename L>
41+
inline ::duckdb::LogicalType ToLogicalType(const ::arrow::DataType& dtype) {
42+
auto& list_type = dynamic_cast<const L&>(dtype);
43+
auto child_type = lance::duckdb::ToLogicalType(*list_type.value_type());
44+
return ::duckdb::LogicalType::LIST(child_type);
45+
}
46+
47+
} // namespace
48+
49+
/// \brief Convert an Arrow data type into the corresponding DuckDB logical type.
///
/// Primitive types map one-to-one by signedness and width. Dictionary types
/// map to the type of their values, structs map to a DuckDB STRUCT with
/// converted children, and list / fixed-size-list types map to a DuckDB LIST
/// of the converted value type.
///
/// \param arrow_type the Arrow type to convert.
/// \return the DuckDB logical type used to represent `arrow_type`.
/// \throws ::duckdb::InvalidInputException if the Arrow type is not handled.
::duckdb::LogicalType ToLogicalType(const ::arrow::DataType& arrow_type) {
  switch (arrow_type.id()) {
    case ::arrow::Type::BOOL:
      return ::duckdb::LogicalType::BOOLEAN;
    case ::arrow::Type::INT8:
      return ::duckdb::LogicalType::TINYINT;
    case ::arrow::Type::UINT8:
      return ::duckdb::LogicalType::UTINYINT;
    case ::arrow::Type::INT16:
      return ::duckdb::LogicalType::SMALLINT;
    case ::arrow::Type::UINT16:
      return ::duckdb::LogicalType::USMALLINT;
    case ::arrow::Type::INT32:
      return ::duckdb::LogicalType::INTEGER;
    case ::arrow::Type::UINT32:
      return ::duckdb::LogicalType::UINTEGER;
    case ::arrow::Type::INT64:
      return ::duckdb::LogicalType::BIGINT;
    case ::arrow::Type::UINT64:
      // 64-bit unsigned values need UBIGINT; UINTEGER is only 32 bits wide.
      return ::duckdb::LogicalType::UBIGINT;
    case ::arrow::Type::FLOAT:
    case ::arrow::Type::HALF_FLOAT:
      // Half floats are exposed as single-precision FLOAT.
      return ::duckdb::LogicalType::FLOAT;
    case ::arrow::Type::DOUBLE:
      return ::duckdb::LogicalType::DOUBLE;
    case ::arrow::Type::STRING:
    case ::arrow::Type::LARGE_STRING:
      return ::duckdb::LogicalType::VARCHAR;
    case ::arrow::Type::BINARY:
    case ::arrow::Type::LARGE_BINARY:
      return ::duckdb::LogicalType::BLOB;
    case ::arrow::Type::TIME32:
    case ::arrow::Type::TIME64:
      return ::duckdb::LogicalType::TIME;
    case ::arrow::Type::TIMESTAMP:
      return ::duckdb::LogicalType::TIMESTAMP;
    case ::arrow::Type::DATE32:
    case ::arrow::Type::DATE64:
      return ::duckdb::LogicalType::DATE;
    case ::arrow::Type::DICTIONARY:
      return ToLogicalType(dynamic_cast<const ::arrow::DictionaryType&>(arrow_type));
    case ::arrow::Type::STRUCT:
      return ToLogicalType(dynamic_cast<const ::arrow::StructType&>(arrow_type));
    case ::arrow::Type::LIST:
      return ToLogicalType<::arrow::ListType>(arrow_type);
    case ::arrow::Type::FIXED_SIZE_LIST:
      return ToLogicalType<::arrow::FixedSizeListType>(arrow_type);
    default:
      throw ::duckdb::InvalidInputException("Does not support type: %s",
                                            arrow_type.ToString().c_str());
  }
}
97+
98+
} // namespace lance::duckdb
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
// Copyright 2022 Lance Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
//
15+
16+
#pragma once
17+
18+
/// \brief Lance Core Adaptors and utilities
19+
20+
#include <arrow/result.h>
21+
#include <arrow/status.h>
22+
#include <arrow/type_fwd.h>
23+
24+
#include <duckdb/common/exception.hpp>
25+
#include <duckdb/common/types.hpp>
26+
27+
namespace lance::duckdb {
28+
29+
template <typename T, typename E = ::duckdb::IOException>
30+
T GetResult(::arrow::Result<T>&& result) {
31+
if (result.ok()) {
32+
return std::move(result.ValueOrDie());
33+
}
34+
throw E(result.status().message());
35+
}
36+
37+
template <typename E = ::duckdb::IOException>
38+
void CheckStatus(const ::arrow::Status& status) {
39+
if (!status.ok()) {
40+
throw E(status.message());
41+
}
42+
}
43+
44+
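/// \brief Convert an Arrow data type (as used by Lance) into the equivalent DuckDB logical type.
/// \throws ::duckdb::InvalidInputException if the Arrow type has no supported mapping.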
45+
::duckdb::LogicalType ToLogicalType(const ::arrow::DataType& arrow_type);
46+
47+
} // namespace lance::duckdb

0 commit comments

Comments
 (0)