Skip to content

Commit

Permalink
GH-13: Set up JNI build (dataset, etc.)
Browse files Browse the repository at this point in the history
Fixes #13.
  • Loading branch information
lidavidm committed Dec 5, 2024
1 parent d650aa0 commit ac6bbb9
Show file tree
Hide file tree
Showing 5 changed files with 342 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ ARCH_SHORT=amd64

# Default repository to pull and push images from
REPO=ghcr.io/apache/arrow-java-dev
# Image repository used by docker-compose.yml to form the base image
# reference (build arg `base`) of the vcpkg-jni service
ARROW_REPO=apache/arrow-dev

# The setup attempts to generate coredumps by default. To disable coredump
# generation, set ULIMIT_CORE to 0.
Expand All @@ -48,3 +49,9 @@ ULIMIT_CORE=-1
# Default versions for various dependencies
JDK=11
MAVEN=3.9.9

# Versions for various dependencies used to build artifacts
# Keep in sync with apache/arrow
# Path of the apache/arrow checkout; docker-compose.yml mounts it at /arrow
# in the vcpkg-jni service
ARROW_REPO_ROOT=./arrow
# PYTHON and VCPKG are interpolated into the tag of the base image used by
# the vcpkg-jni service (see docker-compose.yml)
PYTHON=3.9
VCPKG="943c5ef1c8f6b5e6ced092b242c8299caae2ff01" # 2024.04.26 Release
81 changes: 81 additions & 0 deletions .github/workflows/test_jni.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

name: Test (JNI)

# Run on every branch push except Dependabot branches, on every tag push,
# and on pull requests.
on:
  push:
    branches:
      - '**'
      - '!dependabot/**'
    tags:
      - '**'
  pull_request:

# One active run per repository/ref/workflow; a newer run cancels the older one.
concurrency:
  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
  cancel-in-progress: true

permissions:
  contents: read

env:
  # Prefix for the named cache volumes declared in docker-compose.yml
  DOCKER_VOLUME_PREFIX: ".docker/"

jobs:
  cpp-ubuntu:
    name: Build C++ libraries ${{ matrix.platform.runs_on }} ${{ matrix.platform.arch }}
    runs-on: ${{ matrix.platform.runs_on }}
    strategy:
      fail-fast: false
      matrix:
        platform:
          - runs_on: ubuntu-latest
            arch: "x86_64"
            archery_arch: "amd64"
            archery_arch_alias: "x86_64"
            archery_arch_short: "amd64"
    env:
      # architecture name used for archery build
      ARCH: ${{ matrix.platform.archery_arch }}
      ARCH_ALIAS: ${{ matrix.platform.archery_arch_alias }}
      ARCH_SHORT: ${{ matrix.platform.archery_arch_short }}
    steps:
      - name: Checkout apache/arrow-java
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          fetch-depth: 0
          submodules: recursive
      # Checked out into ./arrow, which is the default ARROW_REPO_ROOT in .env
      - name: Checkout apache/arrow
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          repository: apache/arrow
          fetch-depth: 0
          path: arrow
          submodules: recursive
      - name: Build C++ libraries
        env:
          VCPKG_BINARY_SOURCES: "clear;nuget,GitHub,readwrite"
        # Use the Compose v2 plugin ("docker compose"); the standalone
        # "docker-compose" v1 binary is no longer installed on ubuntu-latest.
        run: |
          docker compose run vcpkg-jni
      - name: Compress into single artifact to keep directory structure
        run: tar -cvzf arrow-shared-libs-linux-${{ matrix.platform.arch }}.tar.gz dist/
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: ubuntu-shared-lib-${{ matrix.platform.arch }}
          path: arrow-shared-libs-linux-${{ matrix.platform.arch }}.tar.gz
44 changes: 44 additions & 0 deletions ci/docker/vcpkg-jni.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Image to build on top of; passed in as a build argument (no default).
ARG base
FROM ${base}

# Install the vcpkg packages for the manifest features listed below,
# including the libraries Gandiva needs at run time.
# llvm[enable-rtti] must be enabled in vcpkg.json to avoid link problems in
# Gandiva.
RUN vcpkg install \
        --clean-after-build \
        --x-install-root=${VCPKG_ROOT}/installed \
        --x-manifest-root=/arrow/ci/vcpkg \
        --x-feature=dev \
        --x-feature=flight \
        --x-feature=gcs \
        --x-feature=json \
        --x-feature=parquet \
        --x-feature=gandiva \
        --x-feature=s3

# Install a JDK: the JNI headers are required, but Maven is never invoked
# in this build.
ARG java=11
RUN yum install -y java-${java}-openjdk-devel && \
    yum clean all

# Settings read by ci/scripts/{cpp,java}_*.sh
ENV ARROW_HOME=/tmp/local \
    ARROW_JAVA_CDATA=ON \
    ARROW_JAVA_JNI=ON \
    ARROW_USE_CCACHE=ON
178 changes: 178 additions & 0 deletions ci/scripts/java_jni_manylinux_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# This script is like java_jni_build.sh, but is meant for release artifacts
# and hardcodes assumptions about the environment it is being run in.

# Abort on the first failing command, including failures inside a pipeline.
set -eo pipefail

# Fail early with a clear message instead of an obscure error further down.
if [ "$#" -lt 3 ]; then
  echo "Usage: $0 <arrow_dir> <build_dir> <dist_dir>" >&2
  exit 1
fi

# Source tree that provides dev/archery, cpp/ and ci/scripts/java_jni_build.sh
arrow_dir=${1}
# Scratch directory for the build trees; wiped below and recreated later
build_dir=${2}
# Architecture name used for the per-arch directories checked at the end of
# this script; aarch64 is spelled "aarch_64" there.
normalized_arch=$(arch)
case ${normalized_arch} in
  aarch64)
    normalized_arch=aarch_64
    ;;
esac
# The directory where the final binaries will be stored when scripts finish
dist_dir=${3}

echo "=== Install Archery ==="
pip install -e "${arrow_dir}/dev/archery[all]"

echo "=== Clear output directories and leftovers ==="
# Quote both paths so a value containing whitespace or glob characters can
# never expand into unrelated arguments to rm -rf.
rm -rf "${build_dir}"
rm -rf "${dist_dir}"

echo "=== Building Arrow C++ libraries ==="

# Leading digits of the installed devtoolset GCC package version, used to
# locate the matching libstdc++ header directory.
devtoolset_version=$(
  rpm -qa "devtoolset-*-gcc" --queryformat '%{VERSION}' | grep -o '^[0-9]*'
)
devtoolset_include_cpp="/opt/rh/devtoolset-${devtoolset_version}/root/usr/include/c++/${devtoolset_version}"

# Build settings: each one keeps a value already present in the environment
# and only falls back to the default given here.
: "${ARROW_ACERO:=ON}"
: "${ARROW_BUILD_TESTS:=ON}"
: "${ARROW_DATASET:=ON}"
: "${ARROW_GANDIVA:=ON}"
: "${ARROW_GCS:=ON}"
: "${ARROW_JEMALLOC:=ON}"
: "${ARROW_RPATH_ORIGIN:=ON}"
: "${ARROW_ORC:=ON}"
: "${ARROW_PARQUET:=ON}"
: "${ARROW_S3:=ON}"
: "${ARROW_USE_CCACHE:=OFF}"
: "${CMAKE_BUILD_TYPE:=release}"
: "${CMAKE_UNITY_BUILD:=ON}"
: "${VCPKG_ROOT:=/opt/vcpkg}"
: "${VCPKG_FEATURE_FLAGS:=-manifests}"
: "${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-linux-static-${CMAKE_BUILD_TYPE}}}"
: "${GANDIVA_CXX_FLAGS:=-isystem;${devtoolset_include_cpp};-isystem;${devtoolset_include_cpp}/x86_64-redhat-linux;-lpthread}"

# These four are exported so child processes started by this script
# (presumably java_jni_build.sh -- confirm there) see the same values.
export ARROW_ACERO ARROW_DATASET ARROW_GANDIVA ARROW_ORC

# Print the ccache counters before building; try the verbose form first and
# fall back to the plain summary if that invocation fails.
if [ "${ARROW_USE_CCACHE}" = "ON" ]; then
  echo "=== ccache statistics before build ==="
  ccache -sv 2>/dev/null || ccache -s
fi

# Test data locations inside the source tree, plus a switch that disables
# AWS EC2 metadata lookups.
export \
  ARROW_TEST_DATA="${arrow_dir}/testing/data" \
  PARQUET_TEST_DATA="${arrow_dir}/cpp/submodules/parquet-testing/data" \
  AWS_EC2_METADATA_DISABLED=TRUE

# Configure, build and install the static Arrow C++ libraries with Ninja.
# The build tree is ${build_dir}/cpp; the matching popd comes after the
# test step further down.
mkdir -p "${build_dir}/cpp"
pushd "${build_dir}/cpp"

# ARROW_BUILD_TESTS follows the caller's setting (default ON) so that
# ARROW_BUILD_TESTS=OFF skips compiling the tests as well as running them.
# CSV and Substrait are switched together with the dataset module.
# Path-valued arguments are quoted so whitespace cannot split them.
cmake \
  -DARROW_ACERO=${ARROW_ACERO} \
  -DARROW_BUILD_SHARED=OFF \
  -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS:-ON} \
  -DARROW_CSV=${ARROW_DATASET} \
  -DARROW_DATASET=${ARROW_DATASET} \
  -DARROW_SUBSTRAIT=${ARROW_DATASET} \
  -DARROW_DEPENDENCY_SOURCE="VCPKG" \
  -DARROW_DEPENDENCY_USE_SHARED=OFF \
  -DARROW_GANDIVA_PC_CXX_FLAGS="${GANDIVA_CXX_FLAGS}" \
  -DARROW_GANDIVA=${ARROW_GANDIVA} \
  -DARROW_GCS=${ARROW_GCS} \
  -DARROW_JEMALLOC=${ARROW_JEMALLOC} \
  -DARROW_ORC=${ARROW_ORC} \
  -DARROW_PARQUET=${ARROW_PARQUET} \
  -DARROW_RPATH_ORIGIN=${ARROW_RPATH_ORIGIN} \
  -DARROW_S3=${ARROW_S3} \
  -DARROW_USE_CCACHE=${ARROW_USE_CCACHE} \
  -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
  -DCMAKE_INSTALL_PREFIX="${ARROW_HOME}" \
  -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \
  -DGTest_SOURCE=BUNDLED \
  -DORC_SOURCE=BUNDLED \
  -DORC_PROTOBUF_EXECUTABLE="${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc" \
  -DPARQUET_BUILD_EXAMPLES=OFF \
  -DPARQUET_BUILD_EXECUTABLES=OFF \
  -DPARQUET_REQUIRE_ENCRYPTION=OFF \
  -DVCPKG_MANIFEST_MODE=OFF \
  -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \
  -GNinja \
  "${arrow_dir}/cpp"
ninja install

# Run the C++ unit tests from the build tree unless ARROW_BUILD_TESTS is
# turned off. Excluded tests are collected in an array and joined with "|"
# into the regex passed to ctest.
if [ "${ARROW_BUILD_TESTS}" = "ON" ]; then
  # arrow-s3fs-test: MinIO is required
  excluded=("arrow-s3fs-test")
  if [ "$(arch)" = "aarch64" ]; then
    # GCS testbench is crashed on aarch64:
    # ImportError: ../grpc/_cython/cygrpc.cpython-38-aarch64-linux-gnu.so:
    # undefined symbol: vtable for std::__cxx11::basic_ostringstream<
    # char, std::char_traits<char>, std::allocator<char> >
    excluded+=("arrow-gcsfs-test")
  fi
  excluded+=(
    # unstable
    "arrow-acero-asof-join-node-test"
    "arrow-acero-hash-join-node-test"
    # external dependency
    "arrow-gcsfs-test"
    # strptime
    "arrow-utility-test"
  )
  exclude_tests=$(IFS='|'; echo "${excluded[*]}")
  ctest \
    --exclude-regex "${exclude_tests}" \
    --label-regex unittest \
    --output-on-failure \
    --parallel $(nproc) \
    --timeout 300
fi

# Leave the C++ build directory entered before the cmake step.
popd


# Extra arguments for the JNI build: point it at the vcpkg toolchain file and
# the triplet used above. The variable is exported for java_jni_build.sh; it
# is a space-separated string (presumably word-split by that script -- confirm).
JAVA_JNI_CMAKE_ARGS=""
JAVA_JNI_CMAKE_ARGS="${JAVA_JNI_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake"
JAVA_JNI_CMAKE_ARGS="${JAVA_JNI_CMAKE_ARGS} -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET}"
export JAVA_JNI_CMAKE_ARGS

# Each positional argument is quoted so that an empty value or a path with
# whitespace cannot shift or split the argument list seen by the script.
"${arrow_dir}/ci/scripts/java_jni_build.sh" \
  "${arrow_dir}" \
  "${ARROW_HOME}" \
  "${build_dir}" \
  "${dist_dir}"

# Print the ccache counters again so they can be compared with the values
# printed before the build.
if [ "${ARROW_USE_CCACHE}" == "ON" ]; then
  echo "=== ccache statistics after build ==="
  ccache -sv 2>/dev/null || ccache -s
fi


echo "=== Checking shared dependencies for libraries ==="
# Run archery's linking check on every JNI library produced in the dist
# directory, passing the list of shared libraries that are allowed.
# Paths are quoted, consistent with the quoted "${dist_dir}" used when the
# directory is cleared at the top of the script.
pushd "${dist_dir}"
archery linking check-dependencies \
  --allow ld-linux-aarch64 \
  --allow ld-linux-x86-64 \
  --allow libc \
  --allow libdl \
  --allow libgcc_s \
  --allow libm \
  --allow libpthread \
  --allow librt \
  --allow libstdc++ \
  --allow libz \
  --allow linux-vdso \
  "arrow_cdata_jni/${normalized_arch}/libarrow_cdata_jni.so" \
  "arrow_dataset_jni/${normalized_arch}/libarrow_dataset_jni.so" \
  "arrow_orc_jni/${normalized_arch}/libarrow_orc_jni.so" \
  "gandiva_jni/${normalized_arch}/libgandiva_jni.so"
popd
32 changes: 32 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ volumes:

services:
ubuntu:
# Build and test arrow-java on Ubuntu.
#
# Usage:
# docker compose build ubuntu
# docker compose run ubuntu
Expand All @@ -47,6 +49,10 @@ services:
/arrow-java/ci/scripts/java_test.sh /arrow-java /build"

conda-jni-cdata:
# Builds and tests just the C Data Interface JNI library and JARs.
# (No dependencies on arrow-cpp.)
# This build isn't meant for distribution. It's for testing only.
#
# Usage:
# docker compose build conda-jni-cdata
# docker compose run conda-jni-cdata
Expand Down Expand Up @@ -75,3 +81,29 @@ services:
/arrow-java/ci/scripts/java_jni_build.sh /arrow-java /build/jni /build /jni &&
/arrow-java/ci/scripts/java_build.sh /arrow-java /build /jni &&
/arrow-java/ci/scripts/java_test.sh /arrow-java /build /jni"

vcpkg-jni:
  # Builds all the JNI libraries, but not the JARs.
  # (Requires arrow-cpp.)
  # The artifacts from this build are meant to be used for packaging.
  #
  # Usage:
  #   docker compose build vcpkg-jni
  #   docker compose run vcpkg-jni
  image: ${REPO}:${ARCH}-vcpkg-jni
  build:
    context: .
    dockerfile: ci/docker/vcpkg-jni.dockerfile
    cache_from:
      - ${REPO}:${ARCH}-vcpkg-jni
    args:
      # Base image reference; ARROW_REPO, PYTHON and VCPKG come from .env
      base: ${ARROW_REPO}:${ARCH}-python-${PYTHON}-wheel-manylinux-2014-vcpkg-${VCPKG}
  volumes:
    # This repository
    - .:/arrow-java:delegated
    # The apache/arrow checkout (ARROW_REPO_ROOT from .env)
    - ${ARROW_REPO_ROOT}:/arrow:delegated
    - ${DOCKER_VOLUME_PREFIX}maven-cache:/root/.m2:delegated
  environment:
    ARROW_JAVA_CDATA: "ON"
  # Mark the mounted repository as a safe git directory, then build the JNI
  # libraries from /arrow into /arrow-java/dist using /build as scratch space.
  command:
    ["git config --global --add safe.directory /arrow-java && \
      /arrow-java/ci/scripts/java_jni_manylinux_build.sh /arrow /build /arrow-java/dist"]

0 comments on commit ac6bbb9

Please sign in to comment.