From 2a7353a74e7b7c86ef9f759b4b039e5c42c9e36e Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sat, 18 Jun 2022 17:13:03 +0200 Subject: [PATCH 01/43] Basic wiring for GCS in R --- r/R/filesystem.R | 85 +++++++++++++++++++++++++++++++++++++++----- r/configure | 5 +++ r/data-raw/codegen.R | 63 +++++++++++++++++--------------- r/src/filesystem.cpp | 79 ++++++++++++++++++++++++++++++++++++++++ r/tools/autobrew | 1 + 5 files changed, 196 insertions(+), 37 deletions(-) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index b035430ff6589..6d130da4f17f9 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -269,7 +269,20 @@ FileSystem <- R6Class("FileSystem", } ), active = list( - type_name = function() fs___FileSystem__type_name(self) + type_name = function() fs___FileSystem__type_name(self), + url_scheme = function() { + fs_type_name <- self$type_name + if (identical(fs_type_name, "subtree")) { + # Recurse + return(self$base_fs$url_scheme) + } + # Some type_names are the url scheme but others aren't + type_map <- list( + local = "file", + gcs = "gs" + ) + type_map[[fs_type_name]] %||% fs_type_name + } ) ) FileSystem$from_uri <- function(uri) { @@ -435,6 +448,64 @@ s3_bucket <- function(bucket, ...) { SubTreeFileSystem$create(fs_and_path$path, fs) } +#' @usage NULL +#' @format NULL +#' @rdname FileSystem +#' @importFrom utils modifyList +#' @export +GcsFileSystem <- R6Class("GcsFileSystem", + inherit = FileSystem +) +GcsFileSystem$create <- function(anonymous = FALSE, ...) { + options <- list(...) + + # Validate options + if (isTRUE(anonymous)) { + invalid_args <- intersect( + c("access_token", "expiration", "json_credentials"), + names(options) + ) + if (length(invalid_args)) { + stop( + "Cannot specify ", + oxford_paste(invalid_args), + " when anonymous = TRUE", + call. 
= FALSE + ) + } + } else if (!is.null(options[["access_token"]])) { + # access_token string requires expiration timestamp + if (is.null(options[["expiration"]])) { + stop("access_token auth requires specifying 'expiration'", call. = FALSE) + } + # those are mutually exclusive with json_credentials + if (!is.null(options[["json_credentials"]])) { + stop("Cannot provide json_credentials with access_token", call. = FALSE) + } + } else if (!is.null(options[["json_credentials"]])) { + if (is.null(options[["access_token"]]) || is.null(options[["expiration"]])) { + stop("Cannot provide access_token with json_credentials", call. = FALSE) + } + } + + valid_opts <- c( + "access_token", "expiration", "json_credentials", "endpoint_override", + "scheme", "default_bucket_location", "retry_limit_seconds", + "default_metadata" + ) + + invalid_opts <- setdiff(names(options), valid_opts) + if (length(invalid_opts)) { + stop( + "Invalid options for GcsFileSystem: ", + oxford_paste(invalid_opts), + call. = FALSE + ) + } + + fs___GcsFileSystem__Make(anonymous, options) +} + #' @usage NULL #' @format NULL #' @rdname FileSystem @@ -443,13 +514,11 @@ SubTreeFileSystem <- R6Class("SubTreeFileSystem", inherit = FileSystem, public = list( print = function(...) { - if (inherits(self$base_fs, "LocalFileSystem")) { - cat("SubTreeFileSystem: ", "file://", self$base_path, "\n", sep = "") - } else if (inherits(self$base_fs, "S3FileSystem")) { - cat("SubTreeFileSystem: ", "s3://", self$base_path, "\n", sep = "") - } else { - cat("SubTreeFileSystem", "\n", sep = "") - } + cat( + "SubTreeFileSystem: ", + self$url_scheme, "://", self$base_path, "\n", + sep = "" + ) invisible(self) } ), diff --git a/r/configure b/r/configure index 30fa4bff1e12b..7e47f0af2f1ee 100755 --- a/r/configure +++ b/r/configure @@ -262,6 +262,11 @@ if [ $? -eq 0 ]; then BUNDLED_LIBS="$BUNDLED_LIBS -lssl -lcrypto -lcurl" fi fi + # Check for GCS + grep -i 'set(ARROW_GCS "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 + if [ $? 
-eq 0 ]; then + PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_GCS" + fi # Check for JSON grep -i 'set(ARROW_JSON "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 if [ $? -eq 0 ]; then diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index fd1781c49fac8..92a4267153bef 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -30,7 +30,7 @@ # Ensure that all machines are sorting the same way invisible(Sys.setlocale("LC_COLLATE", "C")) -features <- c("dataset", "substrait", "parquet", "s3", "json") +features <- c("dataset", "substrait", "parquet", "s3", "gcs", "json") suppressPackageStartupMessages({ library(decor) @@ -44,7 +44,9 @@ get_exported_functions <- function(decorations, export_tag) { out <- decorations %>% filter(decoration %in% paste0(export_tag, "::export")) %>% mutate(functions = map(context, decor:::parse_cpp_function)) %>% - { vec_cbind(., vec_rbind(!!!pull(., functions))) } %>% + { + vec_cbind(., vec_rbind(!!!pull(., functions))) + } %>% select(-functions) %>% mutate(decoration = sub("::export", "", decoration)) message(glue("*** > {n} functions decorated with [[{tags}::export]]", n = nrow(out), tags = paste0(export_tag, collapse = "|"))) @@ -58,7 +60,7 @@ glue_collapse_data <- function(data, ..., sep = ", ", last = "") { } wrap_call <- function(name, return_type, args) { - call <- glue::glue('{name}({list_params})', list_params = glue_collapse_data(args, "{name}")) + call <- glue::glue("{name}({list_params})", list_params = glue_collapse_data(args, "{name}")) if (return_type == "void") { glue::glue("\t{call};\n\treturn R_NilValue;", .trim = FALSE) } else { @@ -68,7 +70,7 @@ wrap_call <- function(name, return_type, args) { feature_available <- function(feat) { glue::glue( -'extern "C" SEXP _{feat}_available() {{ + 'extern "C" SEXP _{feat}_available() {{ return Rf_ScalarLogical( #if defined(ARROW_R_WITH_{toupper(feat)}) TRUE @@ -77,11 +79,12 @@ return Rf_ScalarLogical( #endif ); }} -') +' + ) } write_if_modified <- function(code, file) { - old <- 
try(readLines(file), silent=TRUE) + old <- try(readLines(file), silent = TRUE) new <- unclass(unlist(strsplit(code, "\n"))) # We don't care about changes in empty lines if (!identical(old[nzchar(old)], new[nzchar(new)])) { @@ -124,7 +127,7 @@ cpp_functions_definitions <- arrow_exports %>% select(name, return_type, args, file, line, decoration) %>% pmap_chr(function(name, return_type, args, file, line, decoration) { sexp_params <- glue_collapse_data(args, "SEXP {name}_sexp") - sexp_signature <- glue('_arrow_{name}({sexp_params})') + sexp_signature <- glue("_arrow_{name}({sexp_params})") cpp11_wrapped <- glue(' {return_type} {name}({real_params}); extern "C" SEXP {sexp_signature}{{ @@ -135,12 +138,13 @@ cpp_functions_definitions <- arrow_exports %>% sep = "\n", real_params = glue_collapse_data(args, "{type} {name}"), input_params = glue_collapse_data(args, "\tarrow::r::Input<{type}>::type {name}({name}_sexp);", sep = "\n"), - return_line = if (nrow(args)) "\n" else "") + return_line = if (nrow(args)) "\n" else "" + ) - glue::glue(' + glue::glue(" // {basename(file)} {ifdef_wrap(cpp11_wrapped, name, sexp_signature, decoration)} - ', + ", sep = "\n", ) }) %>% @@ -161,25 +165,25 @@ cpp_file_header <- '// Generated by using data-raw/codegen.R -> do not edit by h ' arrow_exports_cpp <- paste0( -glue::glue(' + glue::glue(" {cpp_file_header} {cpp_functions_definitions} -\n'), -glue::glue_collapse(glue::glue(' +\n"), + glue::glue_collapse(glue::glue(" {feature_available({features})} -'), sep = '\n'), -' +"), sep = "\n"), + " static const R_CallMethodDef CallEntries[] = { -', -glue::glue_collapse(glue::glue( - '\t\t{{ "_{features}_available", (DL_FUNC)& _{features}_available, 0 }},', -), sep = '\n'), -glue::glue('\n +", + glue::glue_collapse(glue::glue( + '\t\t{{ "_{features}_available", (DL_FUNC)& _{features}_available, 0 }},', + ), sep = "\n"), + glue::glue("\n {cpp_functions_registration} \t\t{{NULL, NULL, 0}} }}; -\n'), -'extern "C" void R_init_arrow(DllInfo* dll){ +\n"), 
+ 'extern "C" void R_init_arrow(DllInfo* dll){ R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); @@ -188,7 +192,8 @@ glue::glue('\n #endif } -\n') +\n' +) write_if_modified(arrow_exports_cpp, "src/arrowExports.cpp") @@ -200,27 +205,27 @@ r_functions <- arrow_exports %>% } else { "" } - call <- glue::glue('.Call(`_arrow_{name}`{params})') + call <- glue::glue(".Call(`_arrow_{name}`{params})") if (return_type == "void") { - call <- glue::glue('invisible({call})') + call <- glue::glue("invisible({call})") } - glue::glue(' + glue::glue(" {name} <- function({list_params}) {{ {call} }} - ', + ", list_params = glue_collapse_data(args, "{name}"), sep = "\n", ) }) %>% glue_collapse(sep = "\n") -arrow_exports_r <- glue::glue(' +arrow_exports_r <- glue::glue(" # Generated by using data-raw/codegen.R -> do not edit by hand {r_functions} -') +") write_if_modified(arrow_exports_r, "R/arrowExports.R") diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index bcafef34e41ea..2c86b24a8c033 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -33,6 +33,12 @@ const char* r6_class_name::get( return "LocalFileSystem"; } else if (type_name == "s3") { return "S3FileSystem"; + } else if (type_name == "gcs") { + return "GcsFileSystem" + } else if (type_name == "abfs") { + return "AzureBlobFileSystem" + } else if (type_name == "hdfs") { + return "HadoopFileSystem" } else if (type_name == "subtree") { return "SubTreeFileSystem"; } else { @@ -335,3 +341,76 @@ std::string fs___S3FileSystem__region(const std::shared_ptr& f } #endif + +#if defined(ARROW_R_WITH_GCS) + +#include + +std::shared_ptr strings_to_kvm(cpp11::strings metadata); + +// [[gcs::export]] +std::shared_ptr fs___GcsFileSystem__Make(bool anonymous, + cpp11::list options) { + fs::GcsOptions gcs_opts; + + // Handle auth (anonymous, credentials, default) + // (validation/internal coherence handled in R) + if (anonymous) { + gcs_opts = fs::GcsOptions::Anonymous(); + } else if 
(!Rf_isNull(options["access_token"])) { + // Convert POSIXct timestamp ms to nanoseconds + std::chrono::nanoseconds ns_count(static_cast(options["expiration"]) * + 1000000); + auto expiration_timepoint = + TimePoint(std::chrono::duration_cast(ns_count)); + gcs_opts = fs::GcsOptions::FromAccessToken( + cpp11::as_cpp(options["access_token"]), expiration_timepoint); + // TODO: implement FromImpersonatedServiceAccount + // } else if (base_credentials != "") { + // // static GcsOptions FromImpersonatedServiceAccount( + // // const GcsCredentials& base_credentials, const std::string& + // target_service_account); + // // TODO: construct GcsCredentials + // gcs_opts = fs::GcsOptions::FromImpersonatedServiceAccount(base_credentials, + // target_service_account); + } else if (!Rf_isNull(options["json_credentials"]) { + gcs_opts = fs::GcsOptions::FromServiceAccountCredentials( + cpp11::as_cpp(options["json_credentials"])); + } else { + gcs_opts = fs::GcsOptions::Defaults(); + } + + // Handle other attributes + if (!Rf_isNull(options["endpoint_override"])) { + options.endpoint_override = cpp11::as_cpp(options["endpoint_override"]); + } + + if (!Rf_isNull(options["scheme"])) { + options.scheme = cpp11::as_cpp(options["scheme"]); + } + + // /// \brief Location to use for creating buckets. + if (!Rf_isNull(options["default_bucket_location"])) { + options.default_bucket_location = + cpp11::as_cpp(options["default_bucket_location"]); + } + // /// \brief If set used to control total time allowed for retrying underlying + // /// errors. + // /// + // /// The default policy is to retry for up to 15 minutes. + if (!Rf_isNull(options["retry_limit_seconds"])) { + options.retry_limit_seconds = cpp11::as_cpp(options["retry_limit_seconds"]); + } + + // /// \brief Default metadata for OpenOutputStream. + // /// + // /// This will be ignored if non-empty metadata is passed to OpenOutputStream. 
+ if (!Rf_isNull(options["default_metadata"])) { + options.default_metadata = strings_to_kvm(options["default_metadata"]); + } + + auto io_context = arrow::io::IOContext(gc_memory_pool()); + return ValueOrStop(fs::GcsFileSystem::Make(gcs_opts, io_context)); +} + +#endif diff --git a/r/tools/autobrew b/r/tools/autobrew index 25b6fa97d894a..8ba06a64c27d6 100644 --- a/r/tools/autobrew +++ b/r/tools/autobrew @@ -60,6 +60,7 @@ for FILE in $BREWDIR/Cellar/*/*/lib/*.a; do PKG_LIBS=`echo $PKG_LIBS | sed "s/-l$LIBNAME/-lbrew$LIBNAME/g"` done +# TODO: add -DARROW_R_WITH_GCS PKG_CFLAGS="-I$BREWDIR/opt/$PKG_BREW_NAME/include -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_JSON -DARROW_R_WITH_S3" unset HOMEBREW_NO_ANALYTICS From b203c88390bd7f17f7e17067cf9e25e146696c02 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sat, 18 Jun 2022 16:42:48 -0400 Subject: [PATCH 02/43] Compiles now but symbol not found --- r/R/arrowExports.R | 5 +++++ r/src/arrowExports.cpp | 27 +++++++++++++++++++++++++++ r/src/arrow_types.h | 8 ++++++++ r/src/filesystem.cpp | 29 +++++++++++++++-------------- 4 files changed, 55 insertions(+), 14 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 4c579840e4913..bf5a8d0682181 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1288,6 +1288,10 @@ fs___S3FileSystem__region <- function(fs) { .Call(`_arrow_fs___S3FileSystem__region`, fs) } +fs___GcsFileSystem__Make <- function(anonymous, options) { + .Call(`_arrow_fs___GcsFileSystem__Make`, anonymous, options) +} + io___Readable__Read <- function(x, nbytes) { .Call(`_arrow_io___Readable__Read`, x, nbytes) } @@ -2007,3 +2011,4 @@ SetIOThreadPoolCapacity <- function(threads) { Array__infer_type <- function(x) { .Call(`_arrow_Array__infer_type`, x) } + diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 887327d48f982..947270199ab12 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3222,6 +3222,22 @@ extern "C" SEXP 
_arrow_fs___S3FileSystem__region(SEXP fs_sexp){ } #endif +// filesystem.cpp +#if defined(ARROW_R_WITH_GCS) +std::shared_ptr fs___GcsFileSystem__Make(bool anonymous, cpp11::list options); +extern "C" SEXP _arrow_fs___GcsFileSystem__Make(SEXP anonymous_sexp, SEXP options_sexp){ +BEGIN_CPP11 + arrow::r::Input::type anonymous(anonymous_sexp); + arrow::r::Input::type options(options_sexp); + return cpp11::as_sexp(fs___GcsFileSystem__Make(anonymous, options)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_fs___GcsFileSystem__Make(SEXP anonymous_sexp, SEXP options_sexp){ + Rf_error("Cannot call fs___GcsFileSystem__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // io.cpp std::shared_ptr io___Readable__Read(const std::shared_ptr& x, int64_t nbytes); extern "C" SEXP _arrow_io___Readable__Read(SEXP x_sexp, SEXP nbytes_sexp){ @@ -5099,6 +5115,15 @@ return Rf_ScalarLogical( #endif ); } +extern "C" SEXP _gcs_available() { +return Rf_ScalarLogical( +#if defined(ARROW_R_WITH_GCS) + TRUE +#else + FALSE +#endif +); +} extern "C" SEXP _json_available() { return Rf_ScalarLogical( #if defined(ARROW_R_WITH_JSON) @@ -5113,6 +5138,7 @@ static const R_CallMethodDef CallEntries[] = { { "_substrait_available", (DL_FUNC)& _substrait_available, 0 }, { "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, { "_s3_available", (DL_FUNC)& _s3_available, 0 }, + { "_gcs_available", (DL_FUNC)& _gcs_available, 0 }, { "_json_available", (DL_FUNC)& _json_available, 0 }, { "_arrow_test_SET_STRING_ELT", (DL_FUNC) &_arrow_test_SET_STRING_ELT, 1}, { "_arrow_is_arrow_altrep", (DL_FUNC) &_arrow_is_arrow_altrep, 1}, @@ -5436,6 +5462,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 15}, { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, + { 
"_arrow_fs___GcsFileSystem__Make", (DL_FUNC) &_arrow_fs___GcsFileSystem__Make, 2}, { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index d9fee37e7f138..60de6eff8fee3 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -69,6 +69,14 @@ namespace ds = ::arrow::dataset; namespace compute = ::arrow::compute; namespace fs = ::arrow::fs; +#if defined(ARROW_R_WITH_GCS) +namespace arrow { +namespace fs { +class GcsFileSystem; +} // namespace fs +} // namespace arrow +#endif + std::shared_ptr RecordBatch__from_arrays(SEXP, SEXP); arrow::MemoryPool* gc_memory_pool(); arrow::compute::ExecContext* gc_context(); diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 2c86b24a8c033..c1ed8a435c712 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -34,11 +34,11 @@ const char* r6_class_name::get( } else if (type_name == "s3") { return "S3FileSystem"; } else if (type_name == "gcs") { - return "GcsFileSystem" + return "GcsFileSystem"; } else if (type_name == "abfs") { - return "AzureBlobFileSystem" + return "AzureBlobFileSystem"; } else if (type_name == "hdfs") { - return "HadoopFileSystem" + return "HadoopFileSystem"; } else if (type_name == "subtree") { return "SubTreeFileSystem"; } else { @@ -358,11 +358,11 @@ std::shared_ptr fs___GcsFileSystem__Make(bool anonymous, if (anonymous) { gcs_opts = fs::GcsOptions::Anonymous(); } else if (!Rf_isNull(options["access_token"])) { - // Convert POSIXct timestamp ms to nanoseconds - std::chrono::nanoseconds ns_count(static_cast(options["expiration"]) * - 1000000); + // Convert POSIXct timestamp seconds to nanoseconds + std::chrono::nanoseconds ns_count( + static_cast(cpp11::as_cpp(options["expiration"])) * 1000000000); auto expiration_timepoint = - 
TimePoint(std::chrono::duration_cast(ns_count)); + fs::TimePoint(std::chrono::duration_cast(ns_count)); gcs_opts = fs::GcsOptions::FromAccessToken( cpp11::as_cpp(options["access_token"]), expiration_timepoint); // TODO: implement FromImpersonatedServiceAccount @@ -373,7 +373,7 @@ std::shared_ptr fs___GcsFileSystem__Make(bool anonymous, // // TODO: construct GcsCredentials // gcs_opts = fs::GcsOptions::FromImpersonatedServiceAccount(base_credentials, // target_service_account); - } else if (!Rf_isNull(options["json_credentials"]) { + } else if (!Rf_isNull(options["json_credentials"])) { gcs_opts = fs::GcsOptions::FromServiceAccountCredentials( cpp11::as_cpp(options["json_credentials"])); } else { @@ -382,16 +382,16 @@ std::shared_ptr fs___GcsFileSystem__Make(bool anonymous, // Handle other attributes if (!Rf_isNull(options["endpoint_override"])) { - options.endpoint_override = cpp11::as_cpp(options["endpoint_override"]); + gcs_opts.endpoint_override = cpp11::as_cpp(options["endpoint_override"]); } if (!Rf_isNull(options["scheme"])) { - options.scheme = cpp11::as_cpp(options["scheme"]); + gcs_opts.scheme = cpp11::as_cpp(options["scheme"]); } // /// \brief Location to use for creating buckets. if (!Rf_isNull(options["default_bucket_location"])) { - options.default_bucket_location = + gcs_opts.default_bucket_location = cpp11::as_cpp(options["default_bucket_location"]); } // /// \brief If set used to control total time allowed for retrying underlying @@ -399,18 +399,19 @@ std::shared_ptr fs___GcsFileSystem__Make(bool anonymous, // /// // /// The default policy is to retry for up to 15 minutes. if (!Rf_isNull(options["retry_limit_seconds"])) { - options.retry_limit_seconds = cpp11::as_cpp(options["retry_limit_seconds"]); + gcs_opts.retry_limit_seconds = cpp11::as_cpp(options["retry_limit_seconds"]); } // /// \brief Default metadata for OpenOutputStream. // /// // /// This will be ignored if non-empty metadata is passed to OpenOutputStream. 
if (!Rf_isNull(options["default_metadata"])) { - options.default_metadata = strings_to_kvm(options["default_metadata"]); + gcs_opts.default_metadata = strings_to_kvm(options["default_metadata"]); } auto io_context = arrow::io::IOContext(gc_memory_pool()); - return ValueOrStop(fs::GcsFileSystem::Make(gcs_opts, io_context)); + // TODO: S3FileSystem::Make returns a Result and uses ValueOrStop but this doesn't? + return fs::GcsFileSystem::Make(gcs_opts, io_context); } #endif From f3c6e8648d56e3f68142a3404891741f374bf3db Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sun, 19 Jun 2022 15:50:55 -0400 Subject: [PATCH 03/43] Update absl cmake for latest version in order to fix undefined symbol --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 198 ++++++++++---------- 1 file changed, 94 insertions(+), 104 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 2abbb52b52907..34856bddb36e6 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2704,15 +2704,16 @@ macro(resolve_dependency_absl) stacktrace status statusor - strerror str_format_internal + strerror strings strings_internal symbolize synchronization throw_delegate time - time_zone) + time_zone + wyhash) # Abseil creates a number of header-only targets, which are needed to resolve dependencies. 
# The list can be refreshed using: # comm -13 <(ls -l $PREFIX/lib/libabsl_*.a | sed -e 's/.*libabsl_//' -e 's/.a$//' | sort -u) \ @@ -2769,8 +2770,8 @@ macro(resolve_dependency_absl) pretty_function random_bit_gen_ref random_internal_distribution_caller - random_internal_fastmath random_internal_fast_uniform_bits + random_internal_fastmath random_internal_generate_real random_internal_iostream_state_saver random_internal_mock_helpers @@ -2808,18 +2809,17 @@ macro(resolve_dependency_absl) endforeach() # Extracted the dependency information using the Abseil pkg-config files: - # grep Requires $PREFIX/pkgconfig/absl_*.pc | \ + # grep Requires $PREFIX/lib/pkgconfig/absl_*.pc | \ # sed -e 's;.*/absl_;set_property(TARGET absl::;' \ # -e 's/.pc:Requires:/ PROPERTY INTERFACE_LINK_LIBRARIES /' \ - # -e 's/ = 20210324,//g' \ - # -e 's/ = 20210324//g' \ + # -E -e 's/ = 20[0-9]{6},?//g' \ # -e 's/absl_/absl::/g' \ # -e 's/$/)/' | \ # grep -v 'INTERFACE_LINK_LIBRARIES[ ]*)' + set_property(TARGET absl::algorithm PROPERTY INTERFACE_LINK_LIBRARIES absl::config) set_property(TARGET absl::algorithm_container PROPERTY INTERFACE_LINK_LIBRARIES absl::algorithm absl::core_headers absl::meta) - set_property(TARGET absl::algorithm PROPERTY INTERFACE_LINK_LIBRARIES absl::config) set_property(TARGET absl::any PROPERTY INTERFACE_LINK_LIBRARIES absl::bad_any_cast @@ -2830,19 +2830,17 @@ macro(resolve_dependency_absl) absl::utility) set_property(TARGET absl::atomic_hook PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::core_headers) + set_property(TARGET absl::bad_any_cast PROPERTY INTERFACE_LINK_LIBRARIES + absl::bad_any_cast_impl absl::config) set_property(TARGET absl::bad_any_cast_impl PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::raw_logging_internal) - set_property(TARGET absl::bad_any_cast PROPERTY INTERFACE_LINK_LIBRARIES - absl::bad_any_cast_impl absl::config) set_property(TARGET absl::bad_optional_access PROPERTY INTERFACE_LINK_LIBRARIES absl::config 
absl::raw_logging_internal) set_property(TARGET absl::bad_variant_access PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::raw_logging_internal) - set_property(TARGET absl::base_internal PROPERTY INTERFACE_LINK_LIBRARIES - absl::config absl::type_traits) set_property(TARGET absl::base PROPERTY INTERFACE_LINK_LIBRARIES absl::atomic_hook @@ -2854,6 +2852,8 @@ macro(resolve_dependency_absl) absl::raw_logging_internal absl::spinlock_wait absl::type_traits) + set_property(TARGET absl::base_internal PROPERTY INTERFACE_LINK_LIBRARIES + absl::config absl::type_traits) set_property(TARGET absl::bind_front PROPERTY INTERFACE_LINK_LIBRARIES absl::base_internal absl::compressed_tuple) @@ -2874,12 +2874,12 @@ macro(resolve_dependency_absl) absl::utility) set_property(TARGET absl::city PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::core_headers absl::endian) - set_property(TARGET absl::cleanup_internal - PROPERTY INTERFACE_LINK_LIBRARIES absl::base_internal absl::core_headers - absl::utility) set_property(TARGET absl::cleanup PROPERTY INTERFACE_LINK_LIBRARIES absl::cleanup_internal absl::config absl::core_headers) + set_property(TARGET absl::cleanup_internal + PROPERTY INTERFACE_LINK_LIBRARIES absl::base_internal absl::core_headers + absl::utility) set_property(TARGET absl::compare PROPERTY INTERFACE_LINK_LIBRARIES absl::core_headers absl::type_traits) set_property(TARGET absl::compressed_tuple PROPERTY INTERFACE_LINK_LIBRARIES @@ -2892,19 +2892,6 @@ macro(resolve_dependency_absl) absl::memory absl::type_traits absl::utility) - set_property(TARGET absl::cord_internal - PROPERTY INTERFACE_LINK_LIBRARIES - absl::base_internal - absl::compressed_tuple - absl::config - absl::core_headers - absl::endian - absl::inlined_vector - absl::layout - absl::raw_logging_internal - absl::strings - absl::throw_delegate - absl::type_traits) set_property(TARGET absl::cord PROPERTY INTERFACE_LINK_LIBRARIES absl::base @@ -2923,6 +2910,19 @@ macro(resolve_dependency_absl) 
absl::raw_logging_internal absl::strings absl::type_traits) + set_property(TARGET absl::cord_internal + PROPERTY INTERFACE_LINK_LIBRARIES + absl::base_internal + absl::compressed_tuple + absl::config + absl::core_headers + absl::endian + absl::inlined_vector + absl::layout + absl::raw_logging_internal + absl::strings + absl::throw_delegate + absl::type_traits) set_property(TARGET absl::cordz_functions PROPERTY INTERFACE_LINK_LIBRARIES absl::config @@ -2971,6 +2971,8 @@ macro(resolve_dependency_absl) set_property(TARGET absl::core_headers PROPERTY INTERFACE_LINK_LIBRARIES absl::config) set_property(TARGET absl::counting_allocator PROPERTY INTERFACE_LINK_LIBRARIES absl::config) + set_property(TARGET absl::debugging PROPERTY INTERFACE_LINK_LIBRARIES + absl::stacktrace absl::leak_check) set_property(TARGET absl::debugging_internal PROPERTY INTERFACE_LINK_LIBRARIES absl::core_headers @@ -2978,8 +2980,6 @@ macro(resolve_dependency_absl) absl::dynamic_annotations absl::errno_saver absl::raw_logging_internal) - set_property(TARGET absl::debugging PROPERTY INTERFACE_LINK_LIBRARIES - absl::stacktrace absl::leak_check) set_property(TARGET absl::demangle_internal PROPERTY INTERFACE_LINK_LIBRARIES absl::base absl::core_headers) set_property(TARGET absl::dynamic_annotations PROPERTY INTERFACE_LINK_LIBRARIES @@ -3015,8 +3015,16 @@ macro(resolve_dependency_absl) absl::dynamic_annotations absl::throw_delegate absl::memory) - set_property(TARGET absl::flags_commandlineflag_internal - PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::fast_type_id) + set_property(TARGET absl::flags + PROPERTY INTERFACE_LINK_LIBRARIES + absl::config + absl::flags_commandlineflag + absl::flags_config + absl::flags_internal + absl::flags_reflection + absl::base + absl::core_headers + absl::strings) set_property(TARGET absl::flags_commandlineflag PROPERTY INTERFACE_LINK_LIBRARIES absl::config @@ -3024,6 +3032,8 @@ macro(resolve_dependency_absl) absl::flags_commandlineflag_internal absl::optional 
absl::strings) + set_property(TARGET absl::flags_commandlineflag_internal + PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::fast_type_id) set_property(TARGET absl::flags_config PROPERTY INTERFACE_LINK_LIBRARIES absl::config @@ -3067,16 +3077,6 @@ macro(resolve_dependency_absl) absl::synchronization) set_property(TARGET absl::flags_path_util PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::strings) - set_property(TARGET absl::flags - PROPERTY INTERFACE_LINK_LIBRARIES - absl::config - absl::flags_commandlineflag - absl::flags_config - absl::flags_internal - absl::flags_reflection - absl::base - absl::core_headers - absl::strings) set_property(TARGET absl::flags_private_handle_accessor PROPERTY INTERFACE_LINK_LIBRARIES absl::config @@ -3099,6 +3099,13 @@ macro(resolve_dependency_absl) absl::strings absl::synchronization absl::flat_hash_map) + set_property(TARGET absl::flags_usage + PROPERTY INTERFACE_LINK_LIBRARIES + absl::config + absl::core_headers + absl::flags_usage_internal + absl::strings + absl::synchronization) set_property(TARGET absl::flags_usage_internal PROPERTY INTERFACE_LINK_LIBRARIES absl::config @@ -3113,13 +3120,6 @@ macro(resolve_dependency_absl) absl::flat_hash_map absl::strings absl::synchronization) - set_property(TARGET absl::flags_usage - PROPERTY INTERFACE_LINK_LIBRARIES - absl::config - absl::core_headers - absl::flags_usage_internal - absl::strings - absl::synchronization) set_property(TARGET absl::flat_hash_map PROPERTY INTERFACE_LINK_LIBRARIES absl::container_memory @@ -3146,12 +3146,6 @@ macro(resolve_dependency_absl) absl::core_headers absl::malloc_internal absl::raw_logging_internal) - set_property(TARGET absl::hash_function_defaults - PROPERTY INTERFACE_LINK_LIBRARIES - absl::config - absl::cord - absl::hash - absl::strings) set_property(TARGET absl::hash PROPERTY INTERFACE_LINK_LIBRARIES absl::city @@ -3166,12 +3160,18 @@ macro(resolve_dependency_absl) absl::variant absl::utility absl::low_level_hash) + set_property(TARGET 
absl::hash_function_defaults + PROPERTY INTERFACE_LINK_LIBRARIES + absl::config + absl::cord + absl::hash + absl::strings) set_property(TARGET absl::hash_policy_traits PROPERTY INTERFACE_LINK_LIBRARIES absl::meta) - set_property(TARGET absl::hashtable_debug_hooks PROPERTY INTERFACE_LINK_LIBRARIES - absl::config) set_property(TARGET absl::hashtable_debug PROPERTY INTERFACE_LINK_LIBRARIES absl::hashtable_debug_hooks) + set_property(TARGET absl::hashtable_debug_hooks PROPERTY INTERFACE_LINK_LIBRARIES + absl::config) set_property(TARGET absl::hashtablez_sampler PROPERTY INTERFACE_LINK_LIBRARIES absl::base @@ -3179,13 +3179,6 @@ macro(resolve_dependency_absl) absl::have_sse absl::sample_recorder absl::synchronization) - set_property(TARGET absl::inlined_vector_internal - PROPERTY INTERFACE_LINK_LIBRARIES - absl::compressed_tuple - absl::core_headers - absl::memory - absl::span - absl::type_traits) set_property(TARGET absl::inlined_vector PROPERTY INTERFACE_LINK_LIBRARIES absl::algorithm @@ -3193,6 +3186,13 @@ macro(resolve_dependency_absl) absl::inlined_vector_internal absl::throw_delegate absl::memory) + set_property(TARGET absl::inlined_vector_internal + PROPERTY INTERFACE_LINK_LIBRARIES + absl::compressed_tuple + absl::core_headers + absl::memory + absl::span + absl::type_traits) set_property(TARGET absl::int128 PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::core_headers absl::bits) set_property(TARGET absl::kernel_timeout_internal @@ -3291,10 +3291,10 @@ macro(resolve_dependency_absl) absl::strings absl::str_format absl::span) - set_property(TARGET absl::random_internal_fastmath PROPERTY INTERFACE_LINK_LIBRARIES - absl::bits) set_property(TARGET absl::random_internal_fast_uniform_bits PROPERTY INTERFACE_LINK_LIBRARIES absl::config) + set_property(TARGET absl::random_internal_fastmath PROPERTY INTERFACE_LINK_LIBRARIES + absl::bits) set_property(TARGET absl::random_internal_generate_real PROPERTY INTERFACE_LINK_LIBRARIES absl::bits @@ -3335,6 +3335,10 @@ 
macro(resolve_dependency_absl) absl::random_seed_gen_exception absl::raw_logging_internal absl::span) + set_property(TARGET absl::random_internal_randen + PROPERTY INTERFACE_LINK_LIBRARIES absl::random_internal_platform + absl::random_internal_randen_hwaes + absl::random_internal_randen_slow) set_property(TARGET absl::random_internal_randen_engine PROPERTY INTERFACE_LINK_LIBRARIES absl::endian @@ -3342,16 +3346,12 @@ macro(resolve_dependency_absl) absl::random_internal_randen absl::raw_logging_internal absl::type_traits) - set_property(TARGET absl::random_internal_randen_hwaes_impl - PROPERTY INTERFACE_LINK_LIBRARIES absl::random_internal_platform - absl::config) set_property(TARGET absl::random_internal_randen_hwaes PROPERTY INTERFACE_LINK_LIBRARIES absl::random_internal_platform absl::random_internal_randen_hwaes_impl absl::config) - set_property(TARGET absl::random_internal_randen + set_property(TARGET absl::random_internal_randen_hwaes_impl PROPERTY INTERFACE_LINK_LIBRARIES absl::random_internal_platform - absl::random_internal_randen_hwaes - absl::random_internal_randen_slow) + absl::config) set_property(TARGET absl::random_internal_randen_slow PROPERTY INTERFACE_LINK_LIBRARIES absl::random_internal_platform absl::config) @@ -3439,16 +3439,6 @@ macro(resolve_dependency_absl) set_property(TARGET absl::stacktrace PROPERTY INTERFACE_LINK_LIBRARIES absl::debugging_internal absl::config absl::core_headers) - set_property(TARGET absl::statusor - PROPERTY INTERFACE_LINK_LIBRARIES - absl::base - absl::status - absl::core_headers - absl::raw_logging_internal - absl::type_traits - absl::strings - absl::utility - absl::variant) set_property(TARGET absl::status PROPERTY INTERFACE_LINK_LIBRARIES absl::atomic_hook @@ -3463,8 +3453,18 @@ macro(resolve_dependency_absl) absl::cord absl::str_format absl::optional) - set_property(TARGET absl::strerror PROPERTY INTERFACE_LINK_LIBRARIES absl::config - absl::core_headers absl::errno_saver) + set_property(TARGET absl::statusor + 
PROPERTY INTERFACE_LINK_LIBRARIES + absl::base + absl::status + absl::core_headers + absl::raw_logging_internal + absl::type_traits + absl::strings + absl::utility + absl::variant) + set_property(TARGET absl::str_format PROPERTY INTERFACE_LINK_LIBRARIES + absl::str_format_internal) set_property(TARGET absl::str_format_internal PROPERTY INTERFACE_LINK_LIBRARIES absl::bits @@ -3475,15 +3475,8 @@ macro(resolve_dependency_absl) absl::type_traits absl::int128 absl::span) - set_property(TARGET absl::str_format PROPERTY INTERFACE_LINK_LIBRARIES - absl::str_format_internal) - set_property(TARGET absl::strings_internal - PROPERTY INTERFACE_LINK_LIBRARIES - absl::config - absl::core_headers - absl::endian - absl::raw_logging_internal - absl::type_traits) + set_property(TARGET absl::strerror PROPERTY INTERFACE_LINK_LIBRARIES absl::config + absl::core_headers absl::errno_saver) set_property(TARGET absl::strings PROPERTY INTERFACE_LINK_LIBRARIES absl::strings_internal @@ -3497,6 +3490,13 @@ macro(resolve_dependency_absl) absl::raw_logging_internal absl::throw_delegate absl::type_traits) + set_property(TARGET absl::strings_internal + PROPERTY INTERFACE_LINK_LIBRARIES + absl::config + absl::core_headers + absl::endian + absl::raw_logging_internal + absl::type_traits) set_property(TARGET absl::symbolize PROPERTY INTERFACE_LINK_LIBRARIES absl::debugging_internal @@ -3547,6 +3547,8 @@ macro(resolve_dependency_absl) absl::core_headers absl::type_traits absl::utility) + set_property(TARGET absl::wyhash PROPERTY INTERFACE_LINK_LIBRARIES absl::config + absl::endian absl::int128) if(APPLE) # This is due to upstream absl::cctz issue @@ -3556,18 +3558,6 @@ macro(resolve_dependency_absl) APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${CoreFoundation}) endif() - set_property(TARGET absl::type_traits PROPERTY INTERFACE_LINK_LIBRARIES absl::config) - set_property(TARGET absl::utility - PROPERTY INTERFACE_LINK_LIBRARIES absl::base_internal absl::config - absl::type_traits) - set_property(TARGET 
absl::variant - PROPERTY INTERFACE_LINK_LIBRARIES - absl::bad_variant_access - absl::base_internal - absl::config - absl::core_headers - absl::type_traits - absl::utility) externalproject_add(absl_ep ${EP_LOG_OPTIONS} From ca19afb60df9605cfd6d17789a9c2908e57317d8 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 21 Jun 2022 09:11:40 -0400 Subject: [PATCH 04/43] GCS needs curl and openssl like S3 --- r/configure | 39 +++++++++++++++++++-------------------- r/tools/nixlibs.R | 39 ++++++++++++++++++++++++++++++--------- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/r/configure b/r/configure index 7e47f0af2f1ee..d62c58eedae1c 100755 --- a/r/configure +++ b/r/configure @@ -229,49 +229,48 @@ if [ $? -eq 0 ]; then # Check for features LIB_DIR=`echo $PKG_DIRS | sed -e 's/^-L//'` ARROW_OPTS_CMAKE="$LIB_DIR/cmake/arrow/ArrowOptions.cmake" - # Check for Parquet - grep -i 'set(ARROW_PARQUET "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 - if [ $? -eq 0 ]; then + + arrow_built_with() { + # Function to check cmake options for features + grep -i 'set('"$1"' "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 + } + + if arrow_built_with ARROW_PARQUET; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_PARQUET" PKG_LIBS="-lparquet $PKG_LIBS" # NOTE: parquet is assumed to have the same -L flag as arrow # so there is no need to add its location to PKG_DIRS fi - # Check for Arrow Dataset subcomponent - grep -i 'set(ARROW_DATASET "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 - if [ $? -eq 0 ]; then + if arrow_built_with ARROW_DATASET; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_DATASET" PKG_LIBS="-larrow_dataset $PKG_LIBS" # NOTE: arrow-dataset is assumed to have the same -L flag as arrow # so there is no need to add its location to PKG_DIRS fi - # Check for Arrow Substrait subcomponent - grep -i 'set(ARROW_SUBSTRAIT "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 - if [ $? 
-eq 0 ]; then + if arrow_built_with ARROW_SUBSTRAIT; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_SUBSTRAIT" PKG_LIBS="-larrow_substrait $PKG_LIBS" # NOTE: arrow-substrait is assumed to have the same -L flag as arrow # so there is no need to add its location to PKG_DIRS fi - # Check for S3 - grep -i 'set(ARROW_S3 "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 - if [ $? -eq 0 ]; then + if arrow_built_with ARROW_JSON; then + PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_JSON" + fi + if arrow_built_with ARROW_S3; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_S3" if [ "$BUNDLED_LIBS" != "" ]; then # We're depending on openssl/curl from the system, so they're not in the bundled deps BUNDLED_LIBS="$BUNDLED_LIBS -lssl -lcrypto -lcurl" fi fi - # Check for GCS - grep -i 'set(ARROW_GCS "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 - if [ $? -eq 0 ]; then + if arrow_built_with ARROW_GCS; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_GCS" + if [ "$BUNDLED_LIBS" != "" ]; then + # GCS also requires openssl and curl + BUNDLED_LIBS="$BUNDLED_LIBS -lssl -lcrypto -lcurl" + fi fi - # Check for JSON - grep -i 'set(ARROW_JSON "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 - if [ $? -eq 0 ]; then - PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_JSON" - fi + # prepend PKG_DIRS and append BUNDLED_LIBS to PKG_LIBS PKG_LIBS="$PKG_DIRS $PKG_LIBS $BUNDLED_LIBS" echo "PKG_CFLAGS=$PKG_CFLAGS" diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 768c6291939aa..ba2a2535f28a8 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -312,11 +312,11 @@ build_libarrow <- function(src_dir, dst_dir) { # CXXFLAGS = R_CMD_config("CXX11FLAGS"), # We don't want the same debug symbols LDFLAGS = R_CMD_config("LDFLAGS") ) - env_var_list <- with_s3_support(env_var_list) + env_var_list <- with_cloud_support(env_var_list) env_var_list <- with_mimalloc(env_var_list) # turn_off_all_optional_features() needs to happen after with_mimalloc() and - # with_s3_support(), since those might turn features ON. 
+ # with_cloud_support(), since those might turn features ON. thirdparty_deps_unavailable <- !download_ok && !dir.exists(thirdparty_dependency_dir) && !env_is("ARROW_DEPENDENCY_SOURCE", "system") @@ -538,24 +538,45 @@ with_mimalloc <- function(env_var_list) { replace(env_var_list, "ARROW_MIMALLOC", ifelse(arrow_mimalloc, "ON", "OFF")) } -with_s3_support <- function(env_var_list) { +with_cloud_support <- function(env_var_list) { arrow_s3 <- is_feature_requested("ARROW_S3") - if (arrow_s3) { - # User wants S3 support. If they're using gcc, let's make sure the version is >= 4.9 + arrow_gcs <- is_feature_requested("ARROW_GCS") + if (arrow_s3 || arrow_gcs) { + # User wants S3 or GCS support. + # If they're using gcc, let's make sure the version is >= 4.9 + # (aws-sdk-cpp requires that; google-cloud-cpp only tests with >= 6.3) # and make sure that we have curl and openssl system libs + feats <- c( + if (arrow_s3) "S3", + if (arrow_gcs) "GCS" + ) + start_msg <- paste(feats, collapse = "/") + off_flags <- paste("ARROW_", feats, "=OFF", sep = "", collapse = " and ") + print_warning <- function(msg) { + # Utility to assemble warning message in the console + cat("**** ", start_msg, " support ", msg, "; building with ", off_flags, "\n") + } + + # Check the features if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) { - cat("**** S3 support not available for gcc < 4.9; building with ARROW_S3=OFF\n") + print_warning("not available for gcc < 4.9") arrow_s3 <- FALSE + arrow_gcs <- FALSE } else if (!cmake_find_package("CURL", NULL, env_var_list)) { # curl on macos should be installed, so no need to alter this for macos - cat("**** S3 support requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb); building with ARROW_S3=OFF\n") + print_warning("requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb)") arrow_s3 <- FALSE + arrow_gcs <- FALSE } else if (!cmake_find_package("OpenSSL", "1.0.2", env_var_list)) { - cat("**** S3 support requires version >= 1.0.2 of openssl-devel
(rpm), libssl-dev (deb), or openssl (brew); building with ARROW_S3=OFF\n") + print_warning("requires version >= 1.0.2 of openssl-devel (rpm), libssl-dev (deb), or openssl (brew)") arrow_s3 <- FALSE + arrow_gcs <- FALSE } } - replace(env_var_list, "ARROW_S3", ifelse(arrow_s3, "ON", "OFF")) + + # Update the build flags + env_var_list <- replace(env_var_list, "ARROW_S3", ifelse(arrow_s3, "ON", "OFF")) + replace(env_var_list, "ARROW_GCS", ifelse(arrow_gcs, "ON", "OFF")) } cmake_gcc_version <- function(env_var_list) { From db82303e3f2cfcc23a5a42ee3bc36404bcddfe75 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 21 Jun 2022 12:43:06 -0400 Subject: [PATCH 05/43] Add some basic tests that exercise the bindings, no actual or mock GCS needed --- r/R/arrow-info.R | 11 ++++++- r/R/filesystem.R | 16 ++++------ r/tests/testthat/test-gcs.R | 60 +++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 12 deletions(-) create mode 100644 r/tests/testthat/test-gcs.R diff --git a/r/R/arrow-info.R b/r/R/arrow-info.R index 28afe75e6d6e8..55d07b77cb4a0 100644 --- a/r/R/arrow-info.R +++ b/r/R/arrow-info.R @@ -44,6 +44,7 @@ arrow_info <- function() { parquet = arrow_with_parquet(), json = arrow_with_json(), s3 = arrow_with_s3(), + gcs = arrow_with_gcs(), utf8proc = "utf8_upper" %in% compute_funcs, re2 = "replace_substring_regex" %in% compute_funcs, vapply(tolower(names(CompressionType)[-1]), codec_is_available, logical(1)) @@ -116,6 +117,14 @@ arrow_with_s3 <- function() { }) } +#' @rdname arrow_info +#' @export +arrow_with_gcs <- function() { + tryCatch(.Call(`_gcs_available`), error = function(e) { + return(FALSE) + }) +} + #' @rdname arrow_info #' @export arrow_with_json <- function() { @@ -150,7 +159,7 @@ print.arrow_info <- function(x, ...) { mimalloc = "mimalloc" %in% x$memory_pool$available_backends )) if (some_features_are_off(x$capabilities) && identical(tolower(Sys.info()[["sysname"]]), "linux")) { - # Only on linux because (e.g.) 
we disable certain features on purpose on rtools35 and solaris + # Only on linux because (e.g.) we disable certain features on purpose on rtools35 cat( "To reinstall with more optional capabilities enabled, see\n", " https://arrow.apache.org/docs/r/articles/install.html\n\n" diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 6d130da4f17f9..75997431a434f 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -473,18 +473,12 @@ GcsFileSystem$create <- function(anonymous = FALSE, ...) { call. = FALSE ) } - } else if (!is.null(options[["access_token"]])) { - # access_token string requires expiration timestamp - if (is.null(options[["expiration"]])) { - stop("access_token auth requires specifying 'expiration'", call. = FALSE) - } - # those are mutually exclusive with json_credentials - if (!is.null(options[["json_credentials"]])) { - stop("Cannot provide json_credentials with access_token", call. = FALSE) - } - } else if (!is.null(options[["json_credentials"]])) { - if (is.null(options[["access_token"]]) || is.null(options[["expiration"]])) { + } else { + token_args <- intersect(c("access_token", "expiration"), names(options)) + if (!is.null(options[["json_credentials"]]) && length(token_args) > 0) { stop("Cannot provide access_token with json_credentials", call. = FALSE) + } else if (length(token_args) == 1) { + stop("token auth requires both 'access_token' and 'expiration'", call. = FALSE) } } diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R new file mode 100644 index 0000000000000..a823442f30b57 --- /dev/null +++ b/r/tests/testthat/test-gcs.R @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("gcs") + +test_that("FileSystem$from_uri with gs://", { + fs_and_path <- FileSystem$from_uri("gs://my/test/bucket/") + expect_r6_class(fs_and_path$fs, "GcsFileSystem") + expect_identical(fs_and_path$path, "my/test/bucket") +}) + +test_that("GcsFileSystem$create() options", { + # TODO: expose options as a list so we can confirm they are set? + expect_r6_class(GcsFileSystem$create(), "GcsFileSystem") + expect_r6_class(GcsFileSystem$create(anonymous = TRUE), "GcsFileSystem") + expect_r6_class( + GcsFileSystem$create( + anonymous = TRUE, + scheme = "http", + endpoint_override = "localhost:8888", + default_bucket_location = "here", + retry_limit_seconds = 30, + default_metadata = c(a = "list", of = "stuff") + ), + "GcsFileSystem" + ) +}) + +test_that("GcsFileSystem$create() input validation", { + expect_error( + GcsFileSystem$create(anonymous = TRUE, access_token = "something"), + 'Cannot specify "access_token" when anonymous = TRUE' + ) + expect_error( + GcsFileSystem$create(expiration = Sys.time()), + "token auth requires both 'access_token' and 'expiration'" + ) + expect_error( + GcsFileSystem$create(json_credentials = "{}", expiration = Sys.time()), + "Cannot provide access_token with json_credentials" + ) + expect_error( + GcsFileSystem$create(role_arn = "something"), + 'Invalid options for GcsFileSystem: "role_arn"' + ) +}) From 14b8ecd4449c0bfde2ad4a188003397b146a2d54 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 21 Jun 2022 13:19:27 -0400 Subject: [PATCH 06/43] Move type forwarding to cpp --- 
cpp/src/arrow/filesystem/type_fwd.h | 1 + r/src/arrow_types.h | 8 -------- r/src/filesystem.cpp | 9 +++++---- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/filesystem/type_fwd.h b/cpp/src/arrow/filesystem/type_fwd.h index 112563577db08..c6427dc3c8643 100644 --- a/cpp/src/arrow/filesystem/type_fwd.h +++ b/cpp/src/arrow/filesystem/type_fwd.h @@ -44,6 +44,7 @@ class SubTreeFileSystem; class SlowFileSystem; class LocalFileSystem; class S3FileSystem; +class GcsFileSystem; } // namespace fs } // namespace arrow diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 60de6eff8fee3..d9fee37e7f138 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -69,14 +69,6 @@ namespace ds = ::arrow::dataset; namespace compute = ::arrow::compute; namespace fs = ::arrow::fs; -#if defined(ARROW_R_WITH_GCS) -namespace arrow { -namespace fs { -class GcsFileSystem; -} // namespace fs -} // namespace arrow -#endif - std::shared_ptr RecordBatch__from_arrays(SEXP, SEXP); arrow::MemoryPool* gc_memory_pool(); arrow::compute::ExecContext* gc_context(); diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index c1ed8a435c712..6f08feb28b3f5 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -35,10 +35,11 @@ const char* r6_class_name::get( return "S3FileSystem"; } else if (type_name == "gcs") { return "GcsFileSystem"; - } else if (type_name == "abfs") { - return "AzureBlobFileSystem"; - } else if (type_name == "hdfs") { - return "HadoopFileSystem"; + // Uncomment these once R6 classes for these filesystems are added + // } else if (type_name == "abfs") { + // return "AzureBlobFileSystem"; + // } else if (type_name == "hdfs") { + // return "HadoopFileSystem"; } else if (type_name == "subtree") { return "SubTreeFileSystem"; } else { From d5fa9586a4cb51b8aab09d8bb86289e6f31684fc Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 21 Jun 2022 13:25:27 -0400 Subject: [PATCH 07/43] Add ARROW_GCS wherever ARROW_S3 is mentioned in linux 
builds --- r/inst/build_arrow_static.sh | 1 + r/tools/nixlibs.R | 3 +- r/vignettes/developers/setup.Rmd | 2 + r/vignettes/install.Rmd | 101 ++++++++++++++++--------------- 4 files changed, 57 insertions(+), 50 deletions(-) diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index 27d42d4702528..3e6b0546b1c4c 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -59,6 +59,7 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ -DARROW_FILESYSTEM=ON \ + -DARROW_GCS=${ARROW_GCS:-$ARROW_DEFAULT_PARAM} \ -DARROW_JEMALLOC=${ARROW_JEMALLOC:-$ARROW_DEFAULT_PARAM} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC:-ON} \ -DARROW_JSON=${ARROW_JSON:-ON} \ diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index ba2a2535f28a8..0dadaa0ef73b2 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -69,7 +69,8 @@ download_binary <- function(os = identify_os()) { if (try_download(binary_url, libfile)) { cat(sprintf("*** Successfully retrieved C++ binaries for %s\n", os)) if (!identical(os, "centos-7")) { - # centos-7 uses gcc 4.8 so the binary doesn't have ARROW_S3=ON but the others do + # centos-7 uses gcc 4.8 so the binary doesn't have ARROW_S3=ON + # or ARROW_GCS=ON but the others do # TODO: actually check for system requirements? 
cat("**** Binary package requires libcurl and openssl\n") cat("**** If installation fails, retry after installing those system requirements\n") diff --git a/r/vignettes/developers/setup.Rmd b/r/vignettes/developers/setup.Rmd index 159a43808eca0..af312e30b89d4 100644 --- a/r/vignettes/developers/setup.Rmd +++ b/r/vignettes/developers/setup.Rmd @@ -237,6 +237,7 @@ cmake \ To enable optional features including: S3 support, an alternative memory allocator, and additional compression libraries, add some or all of these flags to your call to `cmake` (the trailing `\` makes them easier to paste into a bash shell on a new line): ```bash + -DARROW_GCS=ON \ -DARROW_MIMALLOC=ON \ -DARROW_S3=ON \ -DARROW_WITH_BROTLI=ON \ @@ -307,6 +308,7 @@ cmake \ -DARROW_DATASET=ON \ -DARROW_EXTRA_ERROR_CONTEXT=ON \ -DARROW_FILESYSTEM=ON \ + -DARROW_GCS=ON \ -DARROW_INSTALL_NAME_RPATH=OFF \ -DARROW_JEMALLOC=ON \ -DARROW_JSON=ON \ diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 2c402e162d7c3..257dd7b11d8e7 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -13,14 +13,14 @@ In most cases, `install.packages("arrow")` should just work. There are things yo ---- -The Apache Arrow project is implemented in multiple languages, and the R package depends on the Arrow C++ library (referred to from here on as libarrow). This means that when you install arrow, you need both the R and C++ versions. If you install arrow from CRAN on a machine running Windows or MacOS, when you call `install.packages("arrow")`, a precompiled binary containing both the R package and libarrow will be downloaded. However, CRAN does not host R package binaries for Linux, and so you must choose from one of the alternative approaches. +The Apache Arrow project is implemented in multiple languages, and the R package depends on the Arrow C++ library (referred to from here on as libarrow). This means that when you install arrow, you need both the R and C++ versions. 
If you install arrow from CRAN on a machine running Windows or MacOS, when you call `install.packages("arrow")`, a precompiled binary containing both the R package and libarrow will be downloaded. However, CRAN does not host R package binaries for Linux, and so you must choose from one of the alternative approaches. This vignette outlines the recommend approaches to installing arrow on Linux, starting from the simplest and least customisable to the most complex but with more flexbility to customise your installation. The intended audience for this document is arrow R package _users_ on Linux, and not Arrow _developers_. If you're contributing to the Arrow project, see `vignette("developing", package = "arrow")` for resources to help you on set up your development environment. You can also find -a more detailed discussion of the code run during the installation process in the +a more detailed discussion of the code run during the installation process in the [developers' installation docs](https://arrow.apache.org/docs/r/articles/developers/install_details.html) > Having trouble installing arrow? See the "Troubleshooting" section below. @@ -59,11 +59,11 @@ install.packages("arrow", repos = "https://packagemanager.rstudio.com/all/__linu Note that the User Agent header must be specified as in the example above. Please check [the RStudio Package Manager: Admin Guide ](https://docs.rstudio.com/rspm/admin/serving-binaries/#using-linux-binary-packages) for more details. -For other Linux distributions, to get the relevant URL, you can visit +For other Linux distributions, to get the relevant URL, you can visit [the RSPM site](https://packagemanager.rstudio.com/client/#/repos/1/overview), click on 'binary', and select your preferred distribution. 
-Similarly, if you use `conda` to manage your R environment, you can get the +Similarly, if you use `conda` to manage your R environment, you can get the latest official release of the R package including libarrow via: ```shell @@ -87,34 +87,36 @@ This installs the source version of the R package, but during the installation p # Installing libarrow dependencies -When you install libarrow, its dependencies will be automatically downloaded. +When you install libarrow, its dependencies will be automatically downloaded. The environment variable `ARROW_DEPENDENCY_SOURCE` controls whether the libarrow -installation also downloads or installs all dependencies (when set to `BUNDLED`), -uses only system-installed dependencies (when set to `SYSTEM`) or checks -system-installed dependencies first and only installs dependencies which aren't +installation also downloads or installs all dependencies (when set to `BUNDLED`), +uses only system-installed dependencies (when set to `SYSTEM`) or checks +system-installed dependencies first and only installs dependencies which aren't already present (when set to `AUTO`). -These dependencies vary by platform; however, if you wish to install these -yourself prior to libarrow installation, we recommend that you take a look at +These dependencies vary by platform; however, if you wish to install these +yourself prior to libarrow installation, we recommend that you take a look at the [docker file for whichever of our CI builds](https://github.com/apache/arrow/tree/master/ci/docker) -(the ones ending in "cpp" are for building Arrow's C++ libaries aka libarrow) -corresponds most closely to your setup. This will contain the most up-to-date +(the ones ending in "cpp" are for building Arrow's C++ libraries aka libarrow) +corresponds most closely to your setup. This will contain the most up-to-date information about dependencies and minimum versions.
-## Dependencies for S3 support +## Dependencies for S3 and GCS support The arrow package allows you to work with data in AWS S3 or in other cloud -storage system that emulate S3. However, support for working with S3 is not +storage systems that emulate S3, as well as Google Cloud Storage. +However, support for working with S3 and GCS is not enabled in the default build, and it has additional system requirements. To enable it, set the environment variable `LIBARROW_MINIMAL=false` or `NOT_CRAN=true` to choose the full-featured build, or more selectively set -`ARROW_S3=ON`. You also need the following system dependencies: +`ARROW_S3=ON` and/or `ARROW_GCS=ON`. +You also need the following system dependencies: * `gcc` >= 4.9 or `clang` >= 3.3; note that the default compiler on CentOS 7 is gcc 4.8.5, which is not sufficient * CURL: install `libcurl-devel` (rpm) or `libcurl4-openssl-dev` (deb) * OpenSSL >= 1.0.2: install `openssl-devel` (rpm) or `libssl-dev` (deb) -The prebuilt libarrow binaries come with S3 support enabled, so you will need to meet these system requirements in order to use them--the package will not install without them (and will error with a message that explains this).If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3 support in the build if the prerequisites are not met--installation will succeed but without S3 functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3 support.
+The prebuilt libarrow binaries come with S3 and GCS support enabled, so you will need to meet these system requirements in order to use them--the package will not install without them (and will error with a message that explains this). If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3 and GCS support in the build if the prerequisites are not met--installation will succeed but without S3 or GCS functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3 and GCS support. # Installing a release version (the less easy way) @@ -124,60 +126,60 @@ The prebuilt libarrow binaries come with S3 support enabled, so you will need to knitr::include_graphics("./r_source_libarrow_source.png") ``` -Generally compiling and installing R packages with C++ dependencies, requires -either installing system packages, which you may not have privileges to do, or -building the C++ dependencies separately, which introduces all sorts of +Generally compiling and installing R packages with C++ dependencies, requires +either installing system packages, which you may not have privileges to do, or +building the C++ dependencies separately, which introduces all sorts of additional ways for things to go wrong, which is why we recommend method 1 above. -However, if you wish to fine-tune or customise your Linux installation, the +However, if you wish to fine-tune or customise your Linux installation, the instructions in this section explain how to do that. ### Basic configuration for building from source with fully featured installation -If you wish to install libarrow from source instead of looking for pre-compiled +If you wish to install libarrow from source instead of looking for pre-compiled binaries, you can set the `LIBARROW_BINARY` variable.
```{r, eval = FALSE} Sys.setenv("LIBARROW_BINARY" = FALSE) ``` -By default, this is set to `TRUE`, and so libarrow will only be built from -source if this environment variable is set to `FALSE` or no compatible binary +By default, this is set to `TRUE`, and so libarrow will only be built from +source if this environment variable is set to `FALSE` or no compatible binary for your OS can be found. -When compiling libarrow from source, you have the power to really fine-tune -which features to install. You can set the environment variable -`LIBARROW_MINIMAL` to `FALSE` to enable a more full-featured build including S3 support +When compiling libarrow from source, you have the power to really fine-tune +which features to install. You can set the environment variable +`LIBARROW_MINIMAL` to `FALSE` to enable a more full-featured build including S3 support and alternative memory allocators. ```{r, eval = FALSE} Sys.setenv("LIBARROW_MINIMAL" = FALSE) ``` -By default this variable is unset; if set to `TRUE` a trimmed-down version of +By default this variable is unset; if set to `TRUE` a trimmed-down version of arrow is installed with many features disabled. -Note that in this guide, you will have seen us mention the environment variable -`NOT_CRAN` - this is a convenience variable, which when set to `TRUE`, +Note that in this guide, you will have seen us mention the environment variable +`NOT_CRAN` - this is a convenience variable, which when set to `TRUE`, automatically sets `LIBARROW_MINIMAL` to `FALSE` and `LIBARROW_BINARY` to `TRUE`. -Building libarrow from source requires more time and resources than installing -a binary. We recommend that you set the environment variable `ARROW_R_DEV` to -`TRUE` for more verbose output during the installation process if anything goes +Building libarrow from source requires more time and resources than installing +a binary. 
We recommend that you set the environment variable `ARROW_R_DEV` to +`TRUE` for more verbose output during the installation process if anything goes wrong. ```{r, eval = FALSE} Sys.setenv("ARROW_R_DEV" = TRUE) ``` -Once you have set these variables, call `install.packages()` to install arrow +Once you have set these variables, call `install.packages()` to install arrow using this configuration. ```{r, eval = FALSE} install.packages("arrow") ``` -The section below discusses environment variables you can set before calling +The section below discusses environment variables you can set before calling `install.packages("arrow")` to build from source and customise your configuration. ### Advanced configuration for building from source @@ -187,13 +189,14 @@ In this section, we describe how to fine-tune your installation at a more granul #### libarrow configuration Some features are optional when you build Arrow from source - you can configure -whether these components are built via the use of environment variables. The -names of the environment variables which control these features and their +whether these components are built via the use of environment variables. The +names of the environment variables which control these features and their default values are shown below. | Name | Description | Default Value | | ---| --- | :-: | | `ARROW_S3` | S3 support (if dependencies are met)* | `OFF` | +| `ARROW_GCS` | GCS support (if dependencies are met)* | `OFF` | | `ARROW_JEMALLOC` | The `jemalloc` memory allocator | `ON` | | `ARROW_MIMALLOC` | The `mimalloc` memory allocator | `ON` | | `ARROW_PARQUET` | | `ON` | @@ -210,7 +213,7 @@ default values are shown below. #### R package configuration -There are a number of other variables that affect the `configure` script and +There are a number of other variables that affect the `configure` script and the bundled build script. All boolean variables are case-insensitive. 
| Name | Description | Default | @@ -227,18 +230,18 @@ the bundled build script. All boolean variables are case-insensitive. See below for more in-depth explanations of these environment variables. * `LIBARROW_BINARY` : If set to `true`, the script will try to download a binary - C++ library built for your operating system. You may also set it to some other string, a related "distro-version" that has binaries built that work for your OS. See the [distro map](https://raw.githubusercontent.com/ursa-labs/arrow-r-nightly/master/linux/distro-map.csv) for compatible binaries and OSs. If no binary is found, installation will fall back to building C++ dependencies from source. + C++ library built for your operating system. You may also set it to some other string, a related "distro-version" that has binaries built that work for your OS. See the [distro map](https://raw.githubusercontent.com/ursa-labs/arrow-r-nightly/master/linux/distro-map.csv) for compatible binaries and OSs. If no binary is found, installation will fall back to building C++ dependencies from source. * `LIBARROW_BUILD` : If set to `false`, the build script will not attempt to build the C++ from source. This means you will only get a working arrow R package if a prebuilt binary is found. Use this if you want to avoid compiling the C++ library, which may be slow - and resource-intensive, and ensure that you only use a prebuilt binary. + and resource-intensive, and ensure that you only use a prebuilt binary. * `LIBARROW_MINIMAL` : If set to `false`, the build script will enable some optional features, including S3 support and additional alternative memory allocators. This will increase the - source build time but results in a more fully functional library. If set to - `true` turns off Parquet, Datasets, compression libraries, and other optional - features. This is not commonly used but may be helpful if needing to compile + source build time but results in a more fully functional library. 
If set to + `true` turns off Parquet, Datasets, compression libraries, and other optional + features. This is not commonly used but may be helpful if needing to compile on a platform that does not support these features, e.g. Solaris. * `NOT_CRAN` : If this variable is set to `true`, as the `devtools` package does, the build script will set `LIBARROW_BINARY=true` and `LIBARROW_MINIMAL=false` @@ -250,7 +253,7 @@ See below for more in-depth explanations of these environment variables. in the build script. `arrow::install_arrow(verbose = TRUE)` sets this. This variable also is needed if you're modifying C++ code in the package: see the developer guide vignette. -* `ARROW_USE_PKG_CONFIG`: If set to `false`, the configure script won't look for +* `ARROW_USE_PKG_CONFIG`: If set to `false`, the configure script won't look for Arrow libraries on your system and instead will look to download/build them. Use this if you have a version mismatch between installed system libraries and the version of the R package you're installing. @@ -266,7 +269,7 @@ Arrow libraries on your system and instead will look to download/build them. # Install the nightly build -Daily development builds, which are not official releases, can be installed +Daily development builds, which are not official releases, can be installed from the Ursa Labs repository: ```r @@ -292,12 +295,12 @@ R CMD INSTALL . If you don't already have libarrow on your system, when installing the R package from source, it will also download and build -libarrow for you. See the section above on build environment +libarrow for you. See the section above on build environment variables for options for configuring the build source and enabled features. 
# Installation using install_arrow() -The previous instructions are useful for a fresh arrow installation, but arrow +The previous instructions are useful for a fresh arrow installation, but arrow provides the function `install_arrow()`, which you can use if you: * already have arrow installed and want to upgrade to a different version @@ -307,7 +310,7 @@ provides the function `install_arrow()`, which you can use if you: `install_arrow()` provides some convenience wrappers around the various environment variables described below. -Although this function is part of the arrow package, it is also available as +Although this function is part of the arrow package, it is also available as a standalone script, so you can access it for convenience without first installing the package: ```r @@ -404,8 +407,8 @@ in the output when the package fails to install, that means that installation failed to retrieve or build the libarrow version compatible with the current version of the R package. -Please check the "Known installation issues" below to see if any apply, and if -none apply, set the environment variable `ARROW_R_DEV=TRUE` for more verbose +Please check the "Known installation issues" below to see if any apply, and if +none apply, set the environment variable `ARROW_R_DEV=TRUE` for more verbose output and try installing again. Then, please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues) and include the full installation output. 
From 71f9f6aa1cd790c92bf6ba3ebf9009d51e9babd0 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 21 Jun 2022 17:20:12 -0400 Subject: [PATCH 08/43] Turn on (bundled) ARROW_GCS in mac and win packages --- ci/scripts/PKGBUILD | 3 +++ dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb | 1 + 2 files changed, 4 insertions(+) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index b9b0194f5c8cf..e9b22682eb591 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -79,11 +79,13 @@ build() { export PATH="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin:$PATH" export CPPFLAGS="${CPPFLAGS} -I${MINGW_PREFIX}/include" export LIBS="-L${MINGW_PREFIX}/libs" + export ARROW_GCS=OFF export ARROW_S3=OFF export ARROW_WITH_RE2=OFF # Without this, some dataset functionality segfaults export CMAKE_UNITY_BUILD=ON else + export ARROW_GCS=ON export ARROW_S3=ON export ARROW_WITH_RE2=ON # Without this, some compute functionality segfaults in tests @@ -101,6 +103,7 @@ build() { -DARROW_CSV=ON \ -DARROW_DATASET=ON \ -DARROW_FILESYSTEM=ON \ + -DARROW_GCS="${ARROW_GCS}" \ -DARROW_HDFS=OFF \ -DARROW_JEMALLOC=OFF \ -DARROW_JSON=ON \ diff --git a/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb b/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb index d72f64c92e1d0..45c04463b6d0d 100644 --- a/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb +++ b/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb @@ -47,6 +47,7 @@ def install -DARROW_CSV=ON -DARROW_DATASET=ON -DARROW_FILESYSTEM=ON + -DARROW_GCS=ON -DARROW_HDFS=OFF -DARROW_JEMALLOC=ON -DARROW_JSON=ON From bb767905b4d3beb12b91e778fadd12066d5180cc Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 21 Jun 2022 17:42:58 -0400 Subject: [PATCH 09/43] Try updating abseil deps for google-cloud-cpp from upstream --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake 
index 34856bddb36e6..43e7d963c2f89 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4036,13 +4036,18 @@ macro(build_google_cloud_cpp_storage) "${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_COMMON}" INTERFACE_INCLUDE_DIRECTORIES "${GOOGLE_CLOUD_CPP_INCLUDE_DIR}") + # Refer to https://github.com/googleapis/google-cloud-cpp/blob/main/google/cloud/google_cloud_cpp_common.cmake + # (substitute `main` for the SHA of the version we use) + # Version 1.39.0 is at a different place (they refactored after): + # https://github.com/googleapis/google-cloud-cpp/blob/29e5af8ca9b26cec62106d189b50549f4dc1c598/google/cloud/CMakeLists.txt#L146-L155 set_property(TARGET google-cloud-cpp::common PROPERTY INTERFACE_LINK_LIBRARIES - absl::any - absl::flat_hash_map + absl::base absl::memory absl::optional + absl::span absl::time + absl::variant Threads::Threads OpenSSL::Crypto) @@ -4052,6 +4057,7 @@ macro(build_google_cloud_cpp_storage) "${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_STORAGE}" INTERFACE_INCLUDE_DIRECTORIES "${GOOGLE_CLOUD_CPP_INCLUDE_DIR}") + # Update this from https://github.com/googleapis/google-cloud-cpp/blob/main/google/cloud/storage/google_cloud_cpp_storage.cmake set_property(TARGET google-cloud-cpp::storage PROPERTY INTERFACE_LINK_LIBRARIES google-cloud-cpp::common From d4bcfdbad431e9adbb203b9b76aa5cfd90a9b9ff Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 21 Jun 2022 18:11:10 -0400 Subject: [PATCH 10/43] Add curl to PKGBUILD --- ci/scripts/PKGBUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index e9b22682eb591..566ec881e4040 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -25,6 +25,7 @@ arch=("any") url="https://arrow.apache.org/" license=("Apache-2.0") depends=("${MINGW_PACKAGE_PREFIX}-aws-sdk-cpp" + "${MINGW_PACKAGE_PREFIX}-curl" # for google-cloud-cpp bundled build "${MINGW_PACKAGE_PREFIX}-libutf8proc" "${MINGW_PACKAGE_PREFIX}-re2"
"${MINGW_PACKAGE_PREFIX}-thrift" From d5b2b85bae8375fb56df622b231f56dd7f6db5c1 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 21 Jun 2022 19:22:20 -0400 Subject: [PATCH 11/43] Try to define all the symbols --- ci/scripts/r_windows_build.sh | 6 +++--- cpp/cmake_modules/ThirdpartyToolchain.cmake | 13 +++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index 89d5737a09bd0..3334eab8663a8 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -87,7 +87,7 @@ if [ -d mingw64/lib/ ]; then # These may be from https://dl.bintray.com/rtools/backports/ cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/x64 # These are from https://dl.bintray.com/rtools/mingw{32,64}/ - cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,brotli*,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/x64 + cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,brotli*,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/x64 fi # Same for the 32-bit versions @@ -97,7 +97,7 @@ if [ -d mingw32/lib/ ]; then mkdir -p $DST_DIR/lib/i386 mv mingw32/lib/*.a $DST_DIR/${RWINLIB_LIB_DIR}/i386 cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/i386 - cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,brotli*,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/i386 + cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,brotli*,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/i386 fi # Do the same also for ucrt64 @@ -105,7 +105,7 @@ if [ -d ucrt64/lib/ ]; then ls $MSYS_LIB_DIR/ucrt64/lib/ mkdir -p $DST_DIR/lib/x64-ucrt mv ucrt64/lib/*.a $DST_DIR/lib/x64-ucrt - cp $MSYS_LIB_DIR/ucrt64/lib/lib{thrift,snappy,zstd,lz4,brotli*,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/x64-ucrt + cp $MSYS_LIB_DIR/ucrt64/lib/lib{thrift,snappy,zstd,lz4,brotli*,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/x64-ucrt fi # Create build artifact diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake 
b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 43e7d963c2f89..61f66ff9415cb 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4076,6 +4076,19 @@ macro(build_google_cloud_cpp_storage) list(APPEND ARROW_BUNDLED_STATIC_LIBS google-cloud-cpp::storage google-cloud-cpp::common) + if(ABSL_VENDORED) + # Copy and de-dupe these absl:: from above + list(APPEND + ARROW_BUNDLED_STATIC_LIBS + absl::base + absl::memory + absl::optional + absl::span + absl::time + absl::variant + absl::strings + absl::str_format) + endif() endmacro() if(ARROW_WITH_GOOGLE_CLOUD_CPP) From bfde6b527fd6897259a690b5aa2ea0cd576a5786 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 21 Jun 2022 20:15:30 -0400 Subject: [PATCH 12/43] absl::memory is header only --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 61f66ff9415cb..9c0d832d504f5 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4081,7 +4081,8 @@ macro(build_google_cloud_cpp_storage) list(APPEND ARROW_BUNDLED_STATIC_LIBS absl::base - absl::memory + # memory is header only + # absl::memory absl::optional absl::span absl::time From 015560b0e13124c0953220d3beff802a9bb5e0b7 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 21 Jun 2022 20:38:45 -0400 Subject: [PATCH 13/43] More header only --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 9c0d832d504f5..fefb16c83b5d9 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4077,18 +4077,17 @@ macro(build_google_cloud_cpp_storage) list(APPEND ARROW_BUNDLED_STATIC_LIBS 
google-cloud-cpp::storage google-cloud-cpp::common) if(ABSL_VENDORED) - # Copy and de-dupe these absl:: from above + # Copy and de-dupe these absl:: from above, but note: some are header-only list(APPEND ARROW_BUNDLED_STATIC_LIBS absl::base - # memory is header only # absl::memory - absl::optional - absl::span + # absl::optional + # absl::span absl::time - absl::variant - absl::strings - absl::str_format) + # absl::variant + # absl::str_format + absl::strings) endif() endmacro() From 2533eb23cbc1fb57bc1ecd8033b971db216fb83a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 21 Jun 2022 21:43:57 -0400 Subject: [PATCH 14/43] See if this gets us closer --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 3 +++ r/configure.win | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index fefb16c83b5d9..d258b82aae813 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4088,6 +4088,9 @@ macro(build_google_cloud_cpp_storage) # absl::variant # absl::str_format absl::strings) + if(NOT ARROW_USE_NATIVE_INT128) + list(APPEND ARROW_BUNDLED_STATIC_LIBS absl::int128) + endif() endif() endmacro() diff --git a/r/configure.win b/r/configure.win index 9e22136c79f40..0dd11244f6cba 100644 --- a/r/configure.win +++ b/r/configure.win @@ -31,6 +31,7 @@ AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-man -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 \ -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common \ -lUserenv -lversion -lws2_32 -lBcrypt -lWininet -lwinhttp" +GCS_LIBS="-lcurl" function configure_release() { VERSION=$(grep ^Version DESCRIPTION | sed s/Version:\ //) @@ -64,11 +65,11 @@ function configure_release() { -lutf8proc -lthrift -lsnappy -lz -lzstd -llz4 ${BROTLI_LIBS} -lole32 \ ${MIMALLOC_LIBS} ${OPENSSL_LIBS}" - # S3 and re2 support only for 
Rtools40 (i.e. R >= 4.0) + # S3, GCS, and re2 support only for Rtools40 (i.e. R >= 4.0) "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e 'R.version$major >= 4' | grep TRUE >/dev/null 2>&1 if [ $? -eq 0 ]; then - PKG_CFLAGS="${PKG_CFLAGS} -DARROW_R_WITH_S3" - PKG_LIBS="${PKG_LIBS} -lre2 ${AWS_LIBS}" + PKG_CFLAGS="${PKG_CFLAGS} -DARROW_R_WITH_S3 -DARROW_R_WITH_GCS" + PKG_LIBS="${PKG_LIBS} -lre2 ${AWS_LIBS} ${GCS_LIBS}" else # It seems that order matters PKG_LIBS="${PKG_LIBS} -lws2_32" From f75253d8b9f4c5e19106682e9bfbb823d65ea197 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 22 Jun 2022 08:57:37 -0400 Subject: [PATCH 15/43] Add more absl to bundled libs --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index d258b82aae813..1aa2463bda572 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4087,10 +4087,11 @@ macro(build_google_cloud_cpp_storage) absl::time # absl::variant # absl::str_format - absl::strings) - if(NOT ARROW_USE_NATIVE_INT128) - list(APPEND ARROW_BUNDLED_STATIC_LIBS absl::int128) - endif() + absl::strings + # Also these seem to be required, depended on by the above + absl::int128 + absl::time_zone + ) endif() endmacro() From 21e4c0bb48f457505685fdbd18a412b78a152d76 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 22 Jun 2022 11:44:45 -0400 Subject: [PATCH 16/43] Add more recursive dependencies of abseil libs --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 1aa2463bda572..f55a76e0ab7d7 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4089,9 +4089,14 @@ macro(build_google_cloud_cpp_storage) # 
absl::str_format absl::strings # Also these seem to be required, depended on by the above + absl::base_internal + absl::civil_time absl::int128 - absl::time_zone - ) + absl::log_severity + absl::raw_logging_internal + absl::spinlock_wait + absl::strings_internal + absl::time_zone) endif() endmacro() From 49b56bce04b01d144d7dce22d851dbd3c69c3d96 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 22 Jun 2022 12:01:06 -0400 Subject: [PATCH 17/43] base_internal must be header only --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index f55a76e0ab7d7..5f3fdc07f83df 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4089,7 +4089,6 @@ macro(build_google_cloud_cpp_storage) # absl::str_format absl::strings # Also these seem to be required, depended on by the above - absl::base_internal absl::civil_time absl::int128 absl::log_severity From d354704a34373b6a6d1d9dc6e8f7fcf325f63b77 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 22 Jun 2022 12:42:22 -0400 Subject: [PATCH 18/43] whackamole --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 5f3fdc07f83df..4b0471b48f3a1 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4095,6 +4095,7 @@ macro(build_google_cloud_cpp_storage) absl::raw_logging_internal absl::spinlock_wait absl::strings_internal + absl::str_format_internal absl::time_zone) endif() endmacro() From 43a92019b07fa1037ab9f496c4acd1c8fcc88a2f Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 22 Jun 2022 13:09:55 -0400 Subject: [PATCH 19/43] Use pkg-config to determine dependencies --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 19 ++++++++++--------- 1 
file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 4b0471b48f3a1..2bc9fcfc4029d 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4077,25 +4077,26 @@ macro(build_google_cloud_cpp_storage) list(APPEND ARROW_BUNDLED_STATIC_LIBS google-cloud-cpp::storage google-cloud-cpp::common) if(ABSL_VENDORED) - # Copy and de-dupe these absl:: from above, but note: some are header-only + # Figure out what absl libraries (not header-only) are required by the + # google-cloud-cpp libraries above and add them to the bundled_dependencies + # + # pkg-config --libs absl_memory absl_strings absl_str_format absl_time absl_variant absl_base absl_memory absl_optional absl_span absl_time absl_variant + # (and then some regexing) list(APPEND ARROW_BUNDLED_STATIC_LIBS + absl::bad_optional_access + absl::bad_variant_access absl::base - # absl::memory - # absl::optional - # absl::span - absl::time - # absl::variant - # absl::str_format - absl::strings - # Also these seem to be required, depended on by the above absl::civil_time absl::int128 absl::log_severity absl::raw_logging_internal absl::spinlock_wait + absl::strings absl::strings_internal absl::str_format_internal + absl::throw_delegate + absl::time absl::time_zone) endif() endmacro() From b51fe747b8281d9e527252546f5b9290a113121a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 22 Jun 2022 13:42:45 -0400 Subject: [PATCH 20/43] Add non-abseil libs --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 2bc9fcfc4029d..9a5a63c37df7e 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4097,7 +4097,9 @@ macro(build_google_cloud_cpp_storage) 
absl::str_format_internal absl::throw_delegate absl::time - absl::time_zone) + absl::time_zone + nlohmann_json::nlohmann_json + Crc32c::crc32c) endif() endmacro() From 58d0b57069a1f0a6876f8d883d058d9ce275b6bc Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 22 Jun 2022 14:06:57 -0400 Subject: [PATCH 21/43] sigh --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 9a5a63c37df7e..709731afd262f 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4098,7 +4098,6 @@ macro(build_google_cloud_cpp_storage) absl::throw_delegate absl::time absl::time_zone - nlohmann_json::nlohmann_json Crc32c::crc32c) endif() endmacro() From fbdbec0317565bd3ac154489a0c4df6501ec66f9 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 22 Jun 2022 16:21:49 -0400 Subject: [PATCH 22/43] Add jira issues to TODOs --- cpp/src/arrow/filesystem/gcsfs.h | 1 + r/src/filesystem.cpp | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index 8458c7f2108bd..77b8a0b201a8c 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -218,6 +218,7 @@ class ARROW_EXPORT GcsFileSystem : public FileSystem { const std::shared_ptr& metadata) override; /// Create a GcsFileSystem instance from the given options. 
+ // TODO(ARROW-16884): make this return Result for consistency static std::shared_ptr Make( const GcsOptions& options, const io::IOContext& = io::default_io_context()); diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 6f08feb28b3f5..cdf536b3bc85b 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -366,7 +366,7 @@ std::shared_ptr fs___GcsFileSystem__Make(bool anonymous, fs::TimePoint(std::chrono::duration_cast(ns_count)); gcs_opts = fs::GcsOptions::FromAccessToken( cpp11::as_cpp(options["access_token"]), expiration_timepoint); - // TODO: implement FromImpersonatedServiceAccount + // TODO(ARROW-16885): implement FromImpersonatedServiceAccount // } else if (base_credentials != "") { // // static GcsOptions FromImpersonatedServiceAccount( // // const GcsCredentials& base_credentials, const std::string& @@ -411,7 +411,7 @@ std::shared_ptr fs___GcsFileSystem__Make(bool anonymous, } auto io_context = arrow::io::IOContext(gc_memory_pool()); - // TODO: S3FileSystem::Make returns a Result and uses ValueOrStop but this doesn't? + // TODO(ARROW-16884): update when this returns Result return fs::GcsFileSystem::Make(gcs_opts, io_context); } From be7acfd4c57977ed6861275acba06e278ae3e58f Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 22 Jun 2022 17:28:20 -0400 Subject: [PATCH 23/43] Try -DCURL_STATICLIB for windows packages --- r/configure.win | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/r/configure.win b/r/configure.win index 0dd11244f6cba..6271f6b46be8b 100644 --- a/r/configure.win +++ b/r/configure.win @@ -68,7 +68,7 @@ function configure_release() { # S3, GCS, and re2 support only for Rtools40 (i.e. R >= 4.0) "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e 'R.version$major >= 4' | grep TRUE >/dev/null 2>&1 if [ $? 
-eq 0 ]; then - PKG_CFLAGS="${PKG_CFLAGS} -DARROW_R_WITH_S3 -DARROW_R_WITH_GCS" + PKG_CFLAGS="${PKG_CFLAGS} -DARROW_R_WITH_S3 -DARROW_R_WITH_GCS -DCURL_STATICLIB" PKG_LIBS="${PKG_LIBS} -lre2 ${AWS_LIBS} ${GCS_LIBS}" else # It seems that order matters @@ -105,6 +105,10 @@ function configure_dev() { PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_S3" fi + if [ $(cmake_option ARROW_GCS) -eq 1 ]; then + PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_GCS -DCURL_STATICLIB" + fi + if [ $(cmake_option ARROW_JSON) -eq 1 ]; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_JSON" fi From 3f9793dc2826fda1bb183bb6c80db173b14d5ae0 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 23 Jun 2022 08:16:08 -0400 Subject: [PATCH 24/43] Add Kou's patch --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 709731afd262f..f7fb772170788 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1075,6 +1075,15 @@ macro(find_curl) "${CURL_LIBRARIES}") endif() endif() + if(WIN32 AND NOT CURL_STATIC_CHECKED) + get_target_property(CURL_LIBRARY CURL::libcurl IMPORTED_LOCATION) + get_filename_component(CURL_LIBRARY_EXT "${CURL_LIBRARY}" LAST_EXT) + if(CURL_LIBRARY_EXT STREQUAL "${CMAKE_STATIC_LIBRARY_SUFFIX}") + set_target_properties(CURL::libcurl + PROPERTIES INTERFACE_COMPILE_DEFINITIONS "CURL_STATICLIB") + endif() + set(CURL_STATIC_CHECKED TRUE) + endif() endmacro() # ---------------------------------------------------------------------- From 54e7d4e4460d5bf7debbeec6ee9edef59aac7150 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 23 Jun 2022 08:17:42 -0400 Subject: [PATCH 25/43] Back out -DCURL_STATICLIB from configure.win --- r/configure.win | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/configure.win b/r/configure.win index 6271f6b46be8b..36bae5cd4c53e 100644 --- 
a/r/configure.win +++ b/r/configure.win @@ -68,7 +68,7 @@ function configure_release() { # S3, GCS, and re2 support only for Rtools40 (i.e. R >= 4.0) "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e 'R.version$major >= 4' | grep TRUE >/dev/null 2>&1 if [ $? -eq 0 ]; then - PKG_CFLAGS="${PKG_CFLAGS} -DARROW_R_WITH_S3 -DARROW_R_WITH_GCS -DCURL_STATICLIB" + PKG_CFLAGS="${PKG_CFLAGS} -DARROW_R_WITH_S3 -DARROW_R_WITH_GCS" PKG_LIBS="${PKG_LIBS} -lre2 ${AWS_LIBS} ${GCS_LIBS}" else # It seems that order matters @@ -106,7 +106,7 @@ function configure_dev() { fi if [ $(cmake_option ARROW_GCS) -eq 1 ]; then - PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_GCS -DCURL_STATICLIB" + PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_GCS" fi if [ $(cmake_option ARROW_JSON) -eq 1 ]; then From 789988eaee4da4c2fd83d46cb14a73b4a69fc6cf Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 23 Jun 2022 09:12:12 -0400 Subject: [PATCH 26/43] Turn on ARROW_VERBOSE_THIRDPARTY_BUILD to see if CURL_STATICLIB is being set where it should be --- ci/scripts/PKGBUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 566ec881e4040..3158950edef7a 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -116,6 +116,7 @@ build() { -DARROW_SNAPPY_USE_SHARED=OFF \ -DARROW_USE_GLOG=OFF \ -DARROW_UTF8PROC_USE_SHARED=OFF \ + -ARROW_VERBOSE_THIRDPARTY_BUILD=ON \ -DARROW_WITH_LZ4=ON \ -DARROW_WITH_RE2="${ARROW_WITH_RE2}" \ -DARROW_WITH_SNAPPY=ON \ From 86539ec68ae1a20ccfa9a856714561a00cdfeaef Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 23 Jun 2022 10:09:05 -0400 Subject: [PATCH 27/43] :facepalm: --- ci/scripts/PKGBUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 3158950edef7a..ea17fba17edd0 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -116,7 +116,7 @@ build() { -DARROW_SNAPPY_USE_SHARED=OFF \ -DARROW_USE_GLOG=OFF \ -DARROW_UTF8PROC_USE_SHARED=OFF \ - 
-ARROW_VERBOSE_THIRDPARTY_BUILD=ON \ + -DARROW_VERBOSE_THIRDPARTY_BUILD=ON \ -DARROW_WITH_LZ4=ON \ -DARROW_WITH_RE2="${ARROW_WITH_RE2}" \ -DARROW_WITH_SNAPPY=ON \ From 44f8419567478feb0f8f7b8bdfa1901e4596a576 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 24 Jun 2022 10:16:34 +0900 Subject: [PATCH 28/43] Patch to google-cloud-cpp for -DCURL_STATICLIB on Windows --- .../google-cloud-cpp-curl-static-windows.patch | 14 ++++++++++++++ cpp/cmake_modules/ThirdpartyToolchain.cmake | 10 ++++++++++ 2 files changed, 24 insertions(+) create mode 100644 cpp/build-support/google-cloud-cpp-curl-static-windows.patch diff --git a/cpp/build-support/google-cloud-cpp-curl-static-windows.patch b/cpp/build-support/google-cloud-cpp-curl-static-windows.patch new file mode 100644 index 0000000000000..a6adad5f0a372 --- /dev/null +++ b/cpp/build-support/google-cloud-cpp-curl-static-windows.patch @@ -0,0 +1,14 @@ +diff -ru google_cloud_cpp_ep.orig/cmake/FindCurlWithTargets.cmake google_cloud_cpp_ep/cmake/FindCurlWithTargets.cmake +--- google_cloud_cpp_ep.orig/cmake/FindCurlWithTargets.cmake 2022-04-05 06:00:53.000000000 +0900 ++++ google_cloud_cpp_ep/cmake/FindCurlWithTargets.cmake 2022-06-24 10:06:00.177969962 +0900 +@@ -68,6 +68,10 @@ + TARGET CURL::libcurl + APPEND + PROPERTY INTERFACE_LINK_LIBRARIES crypt32 wsock32 ws2_32) ++ set_property( ++ TARGET CURL::libcurl ++ APPEND ++ PROPERTY INTERFACE_COMPILE_DEFINITIONS "CURL_STATICLIB") + endif () + if (APPLE) + set_property( diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index f7fb772170788..5f7368c22bbf5 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4023,6 +4023,15 @@ macro(build_google_cloud_cpp_storage) "${GOOGLE_CLOUD_CPP_INSTALL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}google_cloud_cpp_common${CMAKE_STATIC_LIBRARY_SUFFIX}" ) + set(GOOGLE_CLOUD_CPP_PATCH_COMMAND) + if(CMAKE_VERSION VERSION_GREATER 3.9) + 
find_package(Patch) + if(Patch_FOUND) + set(GOOGLE_CLOUD_CPP_PATCH_COMMAND ${Patch_EXECUTABLE} + "/cmake/FindCurlWithTargets.cmake" + "${CMAKE_SOURCE_DIR}/build-support/google-cloud-cpp-curl-static-windows.patch") + endif() + endif() externalproject_add(google_cloud_cpp_ep ${EP_LOG_OPTIONS} LIST_SEPARATOR ${GOOGLE_CLOUD_CPP_PREFIX_PATH_LIST_SEP_CHAR} @@ -4030,6 +4039,7 @@ macro(build_google_cloud_cpp_storage) URL ${google_cloud_cpp_storage_SOURCE_URL} URL_HASH "SHA256=${ARROW_GOOGLE_CLOUD_CPP_BUILD_SHA256_CHECKSUM}" CMAKE_ARGS ${GOOGLE_CLOUD_CPP_CMAKE_ARGS} + PATCH_COMMAND ${GOOGLE_CLOUD_CPP_PATCH_COMMAND} BUILD_BYPRODUCTS ${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_STORAGE} ${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_COMMON} DEPENDS google_cloud_cpp_dependencies) From 8627bb0cc18273a2db9dbe1eacdfcb6700050fcb Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 24 Jun 2022 10:50:47 +0900 Subject: [PATCH 29/43] Add more GCS_LIBS --- r/configure.win | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) mode change 100644 => 100755 r/configure.win diff --git a/r/configure.win b/r/configure.win old mode 100644 new mode 100755 index 36bae5cd4c53e..1234f765f2cab --- a/r/configure.win +++ b/r/configure.win @@ -31,7 +31,9 @@ AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-man -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 \ -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common \ -lUserenv -lversion -lws2_32 -lBcrypt -lWininet -lwinhttp" -GCS_LIBS="-lcurl" +# pkg-config --libs libcurl +GCS_LIBS="-lcurl -lnormaliz -lssh2 -lcrypt32 -lgdi32 -lssl -lwldap32 \ + -lz -lws2_32" function configure_release() { VERSION=$(grep ^Version DESCRIPTION | sed s/Version:\ //) From 92d0918a2e5b3b4a5767fae2fe19c6a68d5c6681 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 24 Jun 2022 11:22:03 +0900 Subject: [PATCH 30/43] Fix lint --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) 
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 5f7368c22bbf5..53fd56d8bcffd 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1079,8 +1079,8 @@ macro(find_curl) get_target_property(CURL_LIBRARY CURL::libcurl IMPORTED_LOCATION) get_filename_component(CURL_LIBRARY_EXT "${CURL_LIBRARY}" LAST_EXT) if(CURL_LIBRARY_EXT STREQUAL "${CMAKE_STATIC_LIBRARY_SUFFIX}") - set_target_properties(CURL::libcurl - PROPERTIES INTERFACE_COMPILE_DEFINITIONS "CURL_STATICLIB") + set_target_properties(CURL::libcurl PROPERTIES INTERFACE_COMPILE_DEFINITIONS + "CURL_STATICLIB") endif() set(CURL_STATIC_CHECKED TRUE) endif() @@ -4027,9 +4027,9 @@ macro(build_google_cloud_cpp_storage) if(CMAKE_VERSION VERSION_GREATER 3.9) find_package(Patch) if(Patch_FOUND) - set(GOOGLE_CLOUD_CPP_PATCH_COMMAND ${Patch_EXECUTABLE} - "/cmake/FindCurlWithTargets.cmake" - "${CMAKE_SOURCE_DIR}/build-support/google-cloud-cpp-curl-static-windows.patch") + set(GOOGLE_CLOUD_CPP_PATCH_COMMAND + ${Patch_EXECUTABLE} "/cmake/FindCurlWithTargets.cmake" + "${CMAKE_SOURCE_DIR}/build-support/google-cloud-cpp-curl-static-windows.patch") endif() endif() externalproject_add(google_cloud_cpp_ep From 52f6b4550629fd24f9553d51940a792aed1bae85 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 24 Jun 2022 11:22:07 +0900 Subject: [PATCH 31/43] Fix order --- r/configure.win | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/configure.win b/r/configure.win index 1234f765f2cab..8e1911bb6fd66 100755 --- a/r/configure.win +++ b/r/configure.win @@ -32,7 +32,7 @@ AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-man -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common \ -lUserenv -lversion -lws2_32 -lBcrypt -lWininet -lwinhttp" # pkg-config --libs libcurl -GCS_LIBS="-lcurl -lnormaliz -lssh2 -lcrypt32 -lgdi32 -lssl -lwldap32 \ +GCS_LIBS="-lcurl -lnormaliz 
-lssh2 -lgdi32 -lssl -lcrypt32 -lwldap32 \ -lz -lws2_32" function configure_release() { From 598c66599840a842ef7717c6e49b3a1a6339808d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 24 Jun 2022 12:13:49 +0900 Subject: [PATCH 32/43] Add missing library --- r/configure.win | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/configure.win b/r/configure.win index 8e1911bb6fd66..dfd2c87ab4f15 100755 --- a/r/configure.win +++ b/r/configure.win @@ -32,7 +32,7 @@ AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-man -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common \ -lUserenv -lversion -lws2_32 -lBcrypt -lWininet -lwinhttp" # pkg-config --libs libcurl -GCS_LIBS="-lcurl -lnormaliz -lssh2 -lgdi32 -lssl -lcrypt32 -lwldap32 \ +GCS_LIBS="-lcurl -lnormaliz -lssh2 -lgdi32 -lssl -lcrypto -lcrypt32 -lwldap32 \ -lz -lws2_32" function configure_release() { From aa059b390f872274f17a9c51b6493973fefd20ec Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 24 Jun 2022 12:14:40 +0900 Subject: [PATCH 33/43] Add license header --- .../google-cloud-cpp-curl-static-windows.patch | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cpp/build-support/google-cloud-cpp-curl-static-windows.patch b/cpp/build-support/google-cloud-cpp-curl-static-windows.patch index a6adad5f0a372..e3f849ceda1c2 100644 --- a/cpp/build-support/google-cloud-cpp-curl-static-windows.patch +++ b/cpp/build-support/google-cloud-cpp-curl-static-windows.patch @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + diff -ru google_cloud_cpp_ep.orig/cmake/FindCurlWithTargets.cmake google_cloud_cpp_ep/cmake/FindCurlWithTargets.cmake --- google_cloud_cpp_ep.orig/cmake/FindCurlWithTargets.cmake 2022-04-05 06:00:53.000000000 +0900 +++ google_cloud_cpp_ep/cmake/FindCurlWithTargets.cmake 2022-06-24 10:06:00.177969962 +0900 From d8431224e5d0ffdb658cf047f7a2dda09f98f3cf Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 24 Jun 2022 13:28:50 +0900 Subject: [PATCH 34/43] Increase timeout --- .github/workflows/r.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 48d9672c74bfc..86e006d538552 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -165,7 +165,7 @@ jobs: name: AMD64 Windows C++ RTools ${{ matrix.config.rtools }} ${{ matrix.config.arch }} runs-on: windows-2019 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 60 + timeout-minutes: 90 strategy: fail-fast: false matrix: From 3192b0ac6ae481f9a684e1f4a31033f25aac748f Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 24 Jun 2022 14:15:10 +0900 Subject: [PATCH 35/43] Remove needless CURL_STATICLIB check --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 53fd56d8bcffd..47300d4f66c05 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1075,15 +1075,6 @@ macro(find_curl) "${CURL_LIBRARIES}") 
endif() endif() - if(WIN32 AND NOT CURL_STATIC_CHECKED) - get_target_property(CURL_LIBRARY CURL::libcurl IMPORTED_LOCATION) - get_filename_component(CURL_LIBRARY_EXT "${CURL_LIBRARY}" LAST_EXT) - if(CURL_LIBRARY_EXT STREQUAL "${CMAKE_STATIC_LIBRARY_SUFFIX}") - set_target_properties(CURL::libcurl PROPERTIES INTERFACE_COMPILE_DEFINITIONS - "CURL_STATICLIB") - endif() - set(CURL_STATIC_CHECKED TRUE) - endif() endmacro() # ---------------------------------------------------------------------- From 1f9f9de44972226d6c833648ad9d1a7776e86d8b Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sat, 25 Jun 2022 10:56:14 -0400 Subject: [PATCH 36/43] Upgrade google-cloud-cpp to 1.42.0 to resolve mingw issues --- .github/workflows/cpp.yml | 4 +-- cpp/cmake_modules/ThirdpartyToolchain.cmake | 30 +++++++++++++++++++-- cpp/thirdparty/versions.txt | 4 +-- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index b914b7df52f6b..bf54e26ee5d0e 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -276,9 +276,7 @@ jobs: ARROW_DATASET: ON ARROW_FLIGHT: ON ARROW_GANDIVA: ON - # google-could-cpp uses _dupenv_s() but it can't be used with msvcrt. - # We need to use ucrt to use _dupenv_s(). 
- # ARROW_GCS: ON + ARROW_GCS: ON ARROW_HDFS: OFF ARROW_HOME: /mingw${{ matrix.mingw-n-bits }} ARROW_JEMALLOC: OFF diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 47300d4f66c05..bef87dd080250 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -324,6 +324,7 @@ endif() if(ARROW_GCS) set(ARROW_WITH_GOOGLE_CLOUD_CPP ON) set(ARROW_WITH_NLOHMANN_JSON ON) + set(ARROW_WITH_ZLIB ON) endif() if(ARROW_JSON) @@ -4010,6 +4011,10 @@ macro(build_google_cloud_cpp_storage) "${GOOGLE_CLOUD_CPP_INSTALL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}google_cloud_cpp_storage${CMAKE_STATIC_LIBRARY_SUFFIX}" ) + set(GOOGLE_CLOUD_CPP_STATIC_LIBRARY_REST_INTERNAL + "${GOOGLE_CLOUD_CPP_INSTALL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}google_cloud_cpp_rest_internal${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(GOOGLE_CLOUD_CPP_STATIC_LIBRARY_COMMON "${GOOGLE_CLOUD_CPP_INSTALL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}google_cloud_cpp_common${CMAKE_STATIC_LIBRARY_SUFFIX}" ) @@ -4032,6 +4037,7 @@ macro(build_google_cloud_cpp_storage) CMAKE_ARGS ${GOOGLE_CLOUD_CPP_CMAKE_ARGS} PATCH_COMMAND ${GOOGLE_CLOUD_CPP_PATCH_COMMAND} BUILD_BYPRODUCTS ${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_STORAGE} + ${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_REST_INTERNAL} ${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_COMMON} DEPENDS google_cloud_cpp_dependencies) @@ -4061,6 +4067,21 @@ macro(build_google_cloud_cpp_storage) Threads::Threads OpenSSL::Crypto) + add_library(google-cloud-cpp::rest-internal STATIC IMPORTED) + set_target_properties(google-cloud-cpp::rest-internal + PROPERTIES IMPORTED_LOCATION + "${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_REST_INTERNAL}" + INTERFACE_INCLUDE_DIRECTORIES + "${GOOGLE_CLOUD_CPP_INCLUDE_DIR}") + set_property(TARGET google-cloud-cpp::rest-internal + PROPERTY INTERFACE_LINK_LIBRARIES + absl::span + google-cloud-cpp::common + CURL::libcurl + nlohmann_json::nlohmann_json + OpenSSL::SSL + OpenSSL::Crypto) + 
add_library(google-cloud-cpp::storage STATIC IMPORTED) set_target_properties(google-cloud-cpp::storage PROPERTIES IMPORTED_LOCATION @@ -4071,6 +4092,7 @@ macro(build_google_cloud_cpp_storage) set_property(TARGET google-cloud-cpp::storage PROPERTY INTERFACE_LINK_LIBRARIES google-cloud-cpp::common + google-cloud-cpp::rest-internal absl::memory absl::strings absl::str_format @@ -4081,10 +4103,14 @@ macro(build_google_cloud_cpp_storage) CURL::libcurl Threads::Threads OpenSSL::SSL - OpenSSL::Crypto) + OpenSSL::Crypto + ZLIB::ZLIB) add_dependencies(google-cloud-cpp::storage google_cloud_cpp_ep) - list(APPEND ARROW_BUNDLED_STATIC_LIBS google-cloud-cpp::storage + list(APPEND + ARROW_BUNDLED_STATIC_LIBS + google-cloud-cpp::rest-internal + google-cloud-cpp::storage google-cloud-cpp::common) if(ABSL_VENDORED) # Figure out what absl libraries (not header-only) are required by the diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 3b4b4749add16..7dc95cd7e0968 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -49,8 +49,8 @@ ARROW_GFLAGS_BUILD_VERSION=v2.2.2 ARROW_GFLAGS_BUILD_SHA256_CHECKSUM=34af2f15cf7367513b352bdcd2493ab14ce43692d2dcd9dfc499492966c64dcf ARROW_GLOG_BUILD_VERSION=v0.5.0 ARROW_GLOG_BUILD_SHA256_CHECKSUM=eede71f28371bf39aa69b45de23b329d37214016e2055269b3b5e7cfd40b59f5 -ARROW_GOOGLE_CLOUD_CPP_BUILD_VERSION=v1.39.0 -ARROW_GOOGLE_CLOUD_CPP_BUILD_SHA256_CHECKSUM=73e4e840018b24bec2beb49e036a3c2d8c471d4dc4a18b9026ccc4d8ab8e78cc +ARROW_GOOGLE_CLOUD_CPP_BUILD_VERSION=v1.42.0 +ARROW_GOOGLE_CLOUD_CPP_BUILD_SHA256_CHECKSUM=c06ae9aededbb8aa217a6d2453754daa40b815f9a4004bc4f2d2d215c79828aa ARROW_GRPC_BUILD_VERSION=v1.46.3 ARROW_GRPC_BUILD_SHA256_CHECKSUM=d6cbf22cb5007af71b61c6be316a79397469c58c82a942552a62e708bce60964 ARROW_GTEST_BUILD_VERSION=1.11.0 From 2dbed38a0d2580d76364b09a98e1a77807b8ba3e Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sat, 25 Jun 2022 10:57:45 -0400 Subject: [PATCH 37/43] Add comment for 
google-cloud-cpp patch --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index bef87dd080250..510d6f41d0f4e 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4023,6 +4023,8 @@ macro(build_google_cloud_cpp_storage) if(CMAKE_VERSION VERSION_GREATER 3.9) find_package(Patch) if(Patch_FOUND) + # This patch is for google-cloud-cpp <= 1.42.0 + # Upstreamed: https://github.com/googleapis/google-cloud-cpp/pull/9345 set(GOOGLE_CLOUD_CPP_PATCH_COMMAND ${Patch_EXECUTABLE} "<SOURCE_DIR>/cmake/FindCurlWithTargets.cmake" "${CMAKE_SOURCE_DIR}/build-support/google-cloud-cpp-curl-static-windows.patch") From ee4e8e66b5f972bc40cde548b9eab7e491eac9dc Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sat, 25 Jun 2022 12:06:34 -0400 Subject: [PATCH 38/43] Turn ARROW_GCS back off in mingw C++ workflow --- .github/workflows/cpp.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index bf54e26ee5d0e..ce6efa4d0c547 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -276,7 +276,12 @@ jobs: ARROW_DATASET: ON ARROW_FLIGHT: ON ARROW_GANDIVA: ON - ARROW_GCS: ON + # With GCS on, + # * MinGW 32 build OOMs (maybe turn off unity build?) + # * MinGW 64 fails to compile the GCS filesystem tests, some conflict + # with boost. 
First error says: + # D:/a/_temp/msys64/mingw64/include/boost/asio/detail/socket_types.hpp:24:4: error: #error WinSock.h has already been included + # ARROW_GCS: ON ARROW_HDFS: OFF ARROW_HOME: /mingw${{ matrix.mingw-n-bits }} ARROW_JEMALLOC: OFF From 226c31b3ec0fe94a55e10604bb4621b2fde3bbee Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sat, 25 Jun 2022 17:00:02 -0400 Subject: [PATCH 39/43] Fixes for failing nightly builds --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 2 +- dev/tasks/r/github.macos.brew.yml | 3 ++- dev/tasks/tasks.yml | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 510d6f41d0f4e..7cc10c22e7cfb 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -476,7 +476,7 @@ else() endif() if(DEFINED ENV{ARROW_CRC32C_URL}) - set(CRC32C_URL "$ENV{ARROW_CRC32C_URL}") + set(CRC32C_SOURCE_URL "$ENV{ARROW_CRC32C_URL}") else() set_urls(CRC32C_SOURCE_URL "https://github.com/google/crc32c/archive/${ARROW_CRC32C_BUILD_VERSION}.tar.gz" diff --git a/dev/tasks/r/github.macos.brew.yml b/dev/tasks/r/github.macos.brew.yml index 064ab550d4128..339dbeacad9c8 100644 --- a/dev/tasks/r/github.macos.brew.yml +++ b/dev/tasks/r/github.macos.brew.yml @@ -30,7 +30,8 @@ jobs: - name: Install apache-arrow run: | - brew install -v --HEAD apache-arrow + # reinstall not install: somehow it may already be installed? 
+ brew reinstall -v --HEAD apache-arrow # for testing brew install minio - uses: r-lib/actions/setup-r@v1 diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 73e2257f1988e..43dccba66ebff 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1305,7 +1305,7 @@ tasks: ci: github template: docker-tests/github.linux.yml params: - flags: '-e ARROW_DEPENDENCY_SOURCE=SYSTEM -e xsimd_SOURCE=BUNDLED' + flags: '-e ARROW_DEPENDENCY_SOURCE=SYSTEM -e ARROW_GCS=OFF -e xsimd_SOURCE=BUNDLED' image: ubuntu-r-only-r test-r-offline-minimal: From c7a5f12f509f1d35ac3db5b863588a02b2e580f0 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sat, 25 Jun 2022 17:17:35 -0400 Subject: [PATCH 40/43] Try this for brew --- dev/tasks/r/github.macos.brew.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dev/tasks/r/github.macos.brew.yml b/dev/tasks/r/github.macos.brew.yml index 339dbeacad9c8..9e49150258b70 100644 --- a/dev/tasks/r/github.macos.brew.yml +++ b/dev/tasks/r/github.macos.brew.yml @@ -30,8 +30,11 @@ jobs: - name: Install apache-arrow run: | - # reinstall not install: somehow it may already be installed? - brew reinstall -v --HEAD apache-arrow + # In case it is somehow already installed, remove it. + # Otherwise, brew install will say "it's already installed" + # and won't do anything + brew uninstall apache-arrow || true + brew install -v --HEAD apache-arrow # for testing brew install minio - uses: r-lib/actions/setup-r@v1 From a2a5e87630000bc0577f651e4087f248eca8d332 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sat, 25 Jun 2022 17:19:49 -0400 Subject: [PATCH 41/43] Add TODO --- .github/workflows/cpp.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index ce6efa4d0c547..acb3270a5d5bf 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -281,6 +281,7 @@ jobs: # * MinGW 64 fails to compile the GCS filesystem tests, some conflict # with boost. 
First error says: # D:/a/_temp/msys64/mingw64/include/boost/asio/detail/socket_types.hpp:24:4: error: #error WinSock.h has already been included + # TODO(ARROW-16906) # ARROW_GCS: ON ARROW_HDFS: OFF ARROW_HOME: /mingw${{ matrix.mingw-n-bits }} From 5dad768e4b646c1dad52f5ee1e1101572084e41c Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sat, 25 Jun 2022 17:39:33 -0400 Subject: [PATCH 42/43] Back out brew job change and note TODO. This will pass once it is merged to master --- dev/tasks/r/github.macos.brew.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dev/tasks/r/github.macos.brew.yml b/dev/tasks/r/github.macos.brew.yml index 9e49150258b70..a403a65595450 100644 --- a/dev/tasks/r/github.macos.brew.yml +++ b/dev/tasks/r/github.macos.brew.yml @@ -30,10 +30,8 @@ jobs: - name: Install apache-arrow run: | - # In case it is somehow already installed, remove it. - # Otherwise, brew install will say "it's already installed" - # and won't do anything - brew uninstall apache-arrow || true + # TODO(ARROW-16907): apache/arrow@master seems to be installed already + # so this does nothing on a branch/PR brew install -v --HEAD apache-arrow # for testing brew install minio From ea76c70021c7734833f33d6409f76bbeb2c3fedd Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sun, 26 Jun 2022 07:58:07 -0400 Subject: [PATCH 43/43] Swap order in bundled static libs Co-authored-by: Sutou Kouhei --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 7cc10c22e7cfb..b50b6f7983604 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4111,8 +4111,8 @@ macro(build_google_cloud_cpp_storage) list(APPEND ARROW_BUNDLED_STATIC_LIBS - google-cloud-cpp::rest-internal google-cloud-cpp::storage + google-cloud-cpp::rest-internal google-cloud-cpp::common) 
if(ABSL_VENDORED) # Figure out what absl libraries (not header-only) are required by the