Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-222: Prototyping an IO interface for Arrow, with initial HDFS (libhdfs) client wrapper #94

Merged
merged 1 commit into from
Jun 24, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
sudo: required
dist: precise
dist: trusty
addons:
apt:
sources:
Expand All @@ -12,6 +12,9 @@ addons:
- ccache
- cmake
- valgrind
- libboost-dev
- libboost-filesystem-dev
- libboost-system-dev

matrix:
fast_finish: true
Expand Down
9 changes: 9 additions & 0 deletions NOTICE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Apache Arrow
Copyright 2016 The Apache Software Foundation

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

This product includes software from the SFrame project (BSD, 3-clause).
* Copyright (C) 2015 Dato, Inc.
* Copyright (c) 2009 Carnegie Mellon University.
15 changes: 12 additions & 3 deletions ci/travis_before_script_cpp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,21 @@ echo $GTEST_HOME

: ${ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install}

CMAKE_COMMON_FLAGS="-DARROW_BUILD_BENCHMARKS=ON -DARROW_PARQUET=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL"
CMAKE_COMMON_FLAGS="\
-DARROW_BUILD_BENCHMARKS=ON \
-DARROW_PARQUET=ON \
-DARROW_HDFS=on \
-DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL"

if [ $TRAVIS_OS_NAME == "linux" ]; then
cmake -DARROW_TEST_MEMCHECK=on $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR
cmake -DARROW_TEST_MEMCHECK=on \
$CMAKE_COMMON_FLAGS \
-DCMAKE_CXX_FLAGS="-Werror" \
$CPP_DIR
else
cmake $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR
cmake $CMAKE_COMMON_FLAGS \
-DCMAKE_CXX_FLAGS="-Werror" \
$CPP_DIR
fi

make -j4
Expand Down
60 changes: 59 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
"Build the Arrow IPC extensions"
ON)

option(ARROW_HDFS
"Build the Arrow IO extensions for the Hadoop file system"
OFF)

option(ARROW_SSE3
"Build Arrow with SSE3"
ON)
Expand Down Expand Up @@ -454,6 +458,47 @@ if ("$ENV{GBENCHMARK_HOME}" STREQUAL "")
set(GBENCHMARK_HOME ${THIRDPARTY_DIR}/installed)
endif()

# ----------------------------------------------------------------------
# Add Boost dependencies (code adapted from Apache Kudu (incubating))

# Find Boost static libraries.
set(Boost_DEBUG TRUE)
set(Boost_USE_MULTITHREADED ON)
set(Boost_USE_STATIC_LIBS ON)
find_package(Boost COMPONENTS system filesystem REQUIRED)
include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
set(BOOST_STATIC_LIBS ${Boost_LIBRARIES})
list(LENGTH BOOST_STATIC_LIBS BOOST_STATIC_LIBS_LEN)
# Sort BOTH lists so the index-paired loop below matches each component's
# static library with the shared library of the same component.
list(SORT BOOST_STATIC_LIBS)

# Find Boost shared libraries.
set(Boost_USE_STATIC_LIBS OFF)
find_package(Boost COMPONENTS system filesystem REQUIRED)
set(BOOST_SHARED_LIBS ${Boost_LIBRARIES})
list(LENGTH BOOST_SHARED_LIBS BOOST_SHARED_LIBS_LEN)
list(SORT BOOST_SHARED_LIBS)

# Sanity check: the pairing below assumes both passes found the same components.
if(NOT (${BOOST_STATIC_LIBS_LEN} EQUAL ${BOOST_SHARED_LIBS_LEN}))
  message(FATAL_ERROR "Boost static/shared library lists differ in length")
endif()

message(STATUS "Boost include dir: " ${Boost_INCLUDE_DIRS})
message(STATUS "Boost libraries: " ${Boost_LIBRARIES})

# Register each Boost component with both its static and shared variants.
math(EXPR LAST_IDX "${BOOST_STATIC_LIBS_LEN} - 1")
foreach(IDX RANGE ${LAST_IDX})
  list(GET BOOST_STATIC_LIBS ${IDX} BOOST_STATIC_LIB)
  list(GET BOOST_SHARED_LIBS ${IDX} BOOST_SHARED_LIB)

  # Remove the prefix/suffix from the library name.
  #
  # e.g. libboost_system-mt --> boost_system
  get_filename_component(LIB_NAME ${BOOST_STATIC_LIB} NAME_WE)
  string(REGEX REPLACE "lib([^-]*)(-mt)?" "\\1" LIB_NAME_NO_PREFIX_SUFFIX ${LIB_NAME})
  ADD_THIRDPARTY_LIB(${LIB_NAME_NO_PREFIX_SUFFIX}
    STATIC_LIB "${BOOST_STATIC_LIB}"
    SHARED_LIB "${BOOST_SHARED_LIB}")
  list(APPEND ARROW_BOOST_LIBS ${LIB_NAME_NO_PREFIX_SUFFIX})
endforeach()

# ----------------------------------------------------------------------
# Enable / disable tests and benchmarks

if(ARROW_BUILD_TESTS)
add_custom_target(unittest ctest -L unittest)
Expand Down Expand Up @@ -529,12 +574,24 @@ endif (UNIX)
# "make lint" target
############################################################
if (UNIX)

  # Gather all C++ sources and headers for linting.
  file(GLOB_RECURSE LINT_FILES
    "${CMAKE_CURRENT_SOURCE_DIR}/src/*.h"
    "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc"
    )

  # Exclude generated headers from linting.
  foreach(item ${LINT_FILES})
    if(NOT (item MATCHES "_generated.h"))
      list(APPEND FILTERED_LINT_FILES ${item})
    endif()
  endforeach()

  # Full lint: run cpplint over the filtered file list.
  # (The legacy backtick `find ... | sed` argument was removed: it duplicated
  # the file list and its sed expression `/_generated/g` had no sed command.)
  add_custom_target(lint ${BUILD_SUPPORT_DIR}/cpplint.py
    --verbose=2
    --linelength=90
    --filter=-whitespace/comments,-readability/todo,-build/header_guard,-build/c++11,-runtime/references
    ${FILTERED_LINT_FILES})
endif (UNIX)


Expand Down Expand Up @@ -624,6 +681,7 @@ set_target_properties(arrow
target_link_libraries(arrow ${LIBARROW_LINK_LIBS})

add_subdirectory(src/arrow)
add_subdirectory(src/arrow/io)
add_subdirectory(src/arrow/util)
add_subdirectory(src/arrow/types)

Expand Down
39 changes: 39 additions & 0 deletions cpp/doc/HDFS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
## Using Arrow's HDFS (Apache Hadoop Distributed File System) interface

### Build requirements

To build the integration, pass the following option to CMake

```shell
-DARROW_HDFS=on
```

For convenience, we have bundled `hdfs.h` for libhdfs from Apache Hadoop in
Arrow's thirdparty. If you wish to build against the `hdfs.h` in your installed
Hadoop distribution, set the `$HADOOP_HOME` environment variable.

### Runtime requirements

By default, the HDFS client C++ class in `libarrow_io` uses the libhdfs JNI
interface to the Java Hadoop client. This library is loaded **at runtime**
(rather than at link / library load time, since the library may not be in your
LD_LIBRARY_PATH), and relies on some environment variables.

* `HADOOP_HOME`: the root of your installed Hadoop distribution. If you are
  unsure whether you have the right directory, check that its `lib/native`
  subdirectory contains `libhdfs.so`.
* `JAVA_HOME`: the location of your Java SDK installation
* `CLASSPATH`: must contain the Hadoop jars. You can set it using:

```shell
export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
```

#### Setting $JAVA_HOME automatically on OS X

The installed location of Java on OS X can vary, however the following snippet
will set it automatically for you:

```shell
export JAVA_HOME=$(/usr/libexec/java_home)
```
97 changes: 97 additions & 0 deletions cpp/src/arrow/io/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# ----------------------------------------------------------------------
# arrow_io : Arrow IO interfaces

set(ARROW_IO_LINK_LIBS
  arrow
)

# Boost is an implementation detail of arrow_io; keep it out of the
# public link interface so consumers do not inherit it.
set(ARROW_IO_PRIVATE_LINK_LIBS
  boost_system
  boost_filesystem
)

set(ARROW_IO_TEST_LINK_LIBS
  arrow_io
  ${ARROW_IO_PRIVATE_LINK_LIBS})

# Populated below depending on which IO components are enabled.
set(ARROW_IO_SRCS
)

if(ARROW_HDFS)
  if(NOT THIRDPARTY_DIR)
    message(FATAL_ERROR "THIRDPARTY_DIR not set")
  endif()

  # Prefer the user's Hadoop distribution for hdfs.h; fall back to the
  # copy bundled in thirdparty.
  if (DEFINED ENV{HADOOP_HOME})
    set(HADOOP_HOME $ENV{HADOOP_HOME})
  else()
    set(HADOOP_HOME "${THIRDPARTY_DIR}/hadoop")
  endif()

  set(HDFS_H_PATH "${HADOOP_HOME}/include/hdfs.h")
  if (NOT EXISTS ${HDFS_H_PATH})
    message(FATAL_ERROR "Did not find hdfs.h at ${HDFS_H_PATH}")
  endif()
  message(STATUS "Found hdfs.h at: " ${HDFS_H_PATH})
  message(STATUS "Building libhdfs shim component")

  include_directories(SYSTEM "${HADOOP_HOME}/include")

  set(ARROW_HDFS_SRCS
    hdfs.cc
    libhdfs_shim.cc)

  # HAS_HADOOP gates the HDFS code paths in these sources only. Use the
  # COMPILE_DEFINITIONS source property rather than string-appending
  # "-DHAS_HADOOP" to COMPILE_FLAGS.
  set_property(SOURCE ${ARROW_HDFS_SRCS}
    APPEND PROPERTY
    COMPILE_DEFINITIONS HAS_HADOOP)

  set(ARROW_IO_SRCS
    ${ARROW_HDFS_SRCS}
    ${ARROW_IO_SRCS})

  ADD_ARROW_TEST(hdfs-io-test)
  ARROW_TEST_LINK_LIBRARIES(hdfs-io-test
    ${ARROW_IO_TEST_LINK_LIBS})
endif()

add_library(arrow_io SHARED
  ${ARROW_IO_SRCS}
)
target_link_libraries(arrow_io LINK_PUBLIC ${ARROW_IO_LINK_LIBS})
target_link_libraries(arrow_io LINK_PRIVATE ${ARROW_IO_PRIVATE_LINK_LIBS})

# ARROW_IO_SRCS may be empty (all components disabled), so the linker
# language cannot always be inferred from the sources.
set_target_properties(arrow_io PROPERTIES LINKER_LANGUAGE CXX)

if (APPLE)
  set_target_properties(arrow_io
    PROPERTIES
    BUILD_WITH_INSTALL_RPATH ON
    INSTALL_NAME_DIR "@rpath")
endif()

# Headers: top level
install(FILES
  hdfs.h
  interfaces.h
  DESTINATION include/arrow/io)

install(TARGETS arrow_io
  LIBRARY DESTINATION lib
  ARCHIVE DESTINATION lib)
Loading