diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc
index 9e41e966eb3a9..d39f6722ce5ea 100644
--- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc
@@ -1690,7 +1690,7 @@ class TestTableSortIndicesRandom : public testing::TestWithParam {
     for (const auto& pair : sort_columns_) {
       ColumnComparator comparator(pair.second, options_.null_placement);
       const auto& chunked_array = *pair.first;
-      int64_t lhs_index, rhs_index;
+      int64_t lhs_index = 0, rhs_index = 0;
       const Array* lhs_array = FindTargetArray(chunked_array, lhs, &lhs_index);
       const Array* rhs_array = FindTargetArray(chunked_array, rhs, &rhs_index);
       int compared = comparator(*lhs_array, *rhs_array, lhs_index, rhs_index);
diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc
index ff911d02ab388..898e54cf593bd 100644
--- a/cpp/src/arrow/filesystem/gcsfs.cc
+++ b/cpp/src/arrow/filesystem/gcsfs.cc
@@ -19,6 +19,7 @@
 
 #include <google/cloud/storage/client.h>
 
+#include "arrow/buffer.h"
 #include "arrow/filesystem/gcsfs_internal.h"
 #include "arrow/filesystem/path_util.h"
 #include "arrow/result.h"
@@ -28,6 +29,8 @@ namespace arrow {
 namespace fs {
 namespace {
 
+namespace gcs = google::cloud::storage;
+
 auto constexpr kSep = '/';
 
 struct GcsPath {
@@ -58,9 +61,48 @@ struct GcsPath {
   }
 };
 
-}  // namespace
+class GcsInputStream : public arrow::io::InputStream {
+ public:
+  explicit GcsInputStream(gcs::ObjectReadStream stream) : stream_(std::move(stream)) {}
-namespace gcs = google::cloud::storage;
+
+  ~GcsInputStream() override = default;
+
+  Status Close() override {
+    stream_.Close();
+    return Status::OK();
+  }
+
+  Result<int64_t> Tell() const override {
+    if (!stream_) {
+      return Status::IOError("invalid stream");
+    }
+    return stream_.tellg();
+  }
+
+  bool closed() const override { return !stream_.IsOpen(); }
+
+  Result<int64_t> Read(int64_t nbytes, void* out) override {
+    stream_.read(static_cast<char*>(out), nbytes);
+    if (!stream_.status().ok()) {
+      return internal::ToArrowStatus(stream_.status());
+    }
+    return stream_.gcount();
+  }
+
+  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override {
+    ARROW_ASSIGN_OR_RAISE(auto buffer, arrow::AllocateResizableBuffer(nbytes));
+    stream_.read(reinterpret_cast<char*>(buffer->mutable_data()), nbytes);
+    if (!stream_.status().ok()) {
+      return internal::ToArrowStatus(stream_.status());
+    }
+    return arrow::SliceMutableBufferSafe(std::move(buffer), 0, stream_.gcount());
+  }
+
+ private:
+  mutable gcs::ObjectReadStream stream_;
+};
+
+}  // namespace
 
 google::cloud::Options AsGoogleCloudOptions(const GcsOptions& o) {
   auto options = google::cloud::Options{};
@@ -95,6 +137,14 @@ class GcsFileSystem::Impl {
     return GetFileInfoImpl(path, std::move(meta).status(), FileType::Directory);
   }
 
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(const GcsPath& path) {
+    auto stream = client_.ReadObject(path.bucket, path.object);
+    if (!stream.status().ok()) {
+      return internal::ToArrowStatus(stream.status());
+    }
+    return std::make_shared<GcsInputStream>(std::move(stream));
+  }
+
  private:
   static Result<FileInfo> GetFileInfoImpl(const GcsPath& path,
                                           const google::cloud::Status& status,
@@ -169,12 +219,17 @@ Status GcsFileSystem::CopyFile(const std::string& src, const std::string& dest)
 
 Result<std::shared_ptr<io::InputStream>> GcsFileSystem::OpenInputStream(
     const std::string& path) {
-  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+  ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path));
+  return impl_->OpenInputStream(p);
 }
 
 Result<std::shared_ptr<io::InputStream>> GcsFileSystem::OpenInputStream(
    const FileInfo& info) {
-  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+  if (!info.IsFile()) {
+    return Status::IOError("Only files can be opened as input streams");
+  }
+  ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(info.path()));
+  return impl_->OpenInputStream(p);
 }
 
 Result<std::shared_ptr<io::RandomAccessFile>> GcsFileSystem::OpenInputFile(
diff --git a/cpp/src/arrow/filesystem/gcsfs_internal.cc b/cpp/src/arrow/filesystem/gcsfs_internal.cc
index 22df5cebf676c..898015859c297 100644
--- a/cpp/src/arrow/filesystem/gcsfs_internal.cc
+++ b/cpp/src/arrow/filesystem/gcsfs_internal.cc
@@ -38,10 +38,8 @@ Status ToArrowStatus(const google::cloud::Status& s) {
     case google::cloud::StatusCode::kInvalidArgument:
       return Status::Invalid(os.str());
     case google::cloud::StatusCode::kDeadlineExceeded:
-      return Status::IOError(os.str());
     case google::cloud::StatusCode::kNotFound:
-      // TODO: it is unclear if a better mapping would be possible.
-      return Status::UnknownError(os.str());
+      return Status::IOError(os.str());
     case google::cloud::StatusCode::kAlreadyExists:
       return Status::AlreadyExists(os.str());
     case google::cloud::StatusCode::kPermissionDenied:
diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc
index 0776872e3ace2..369317fbb349b 100644
--- a/cpp/src/arrow/filesystem/gcsfs_test.cc
+++ b/cpp/src/arrow/filesystem/gcsfs_test.cc
@@ -24,11 +24,13 @@
 #include
 #include
 
+#include <array>
 #include
 #include
 
 #include "arrow/filesystem/gcsfs_internal.h"
 #include "arrow/filesystem/test_util.h"
+#include "arrow/testing/gtest_util.h"
 #include "arrow/testing/util.h"
 
 namespace arrow {
@@ -45,6 +47,15 @@ using ::testing::Not;
 using ::testing::NotNull;
 
 auto const* kPreexistingBucket = "test-bucket-name";
+auto const* kPreexistingObject = "test-object-name";
+auto const* kLoremIpsum = R"""(
+Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
+incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
+nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
+fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
+culpa qui officia deserunt mollit anim id est laborum.
+)""";
 
 class GcsIntegrationTest : public ::testing::Test {
  public:
@@ -65,16 +76,29 @@ class GcsIntegrationTest : public ::testing::Test {
     server_process_ = bp::child(boost::this_process::environment(), exe_path, "-m",
                                 "testbench", "--port", port_);
 
-    // Create a bucket in the testbench. This makes it easier to bootstrap GcsFileSystem
-    // and its tests.
+    // Create a bucket and a small file in the testbench. This makes it easier to
+    // bootstrap GcsFileSystem and its tests.
     auto client = gcs::Client(
         google::cloud::Options{}
             .set<gcs::RestEndpointOption>("http://127.0.0.1:" + port_)
             .set<gc::UnifiedCredentialsOption>(gc::MakeInsecureCredentials()));
-    google::cloud::StatusOr<gcs::BucketMetadata> metadata = client.CreateBucketForProject(
+    google::cloud::StatusOr<gcs::BucketMetadata> bucket = client.CreateBucketForProject(
         kPreexistingBucket, "ignored-by-testbench", gcs::BucketMetadata{});
-    ASSERT_TRUE(metadata.ok()) << "Failed to create bucket <" << kPreexistingBucket
-                               << ">, status=" << metadata.status();
+    ASSERT_TRUE(bucket.ok()) << "Failed to create bucket <" << kPreexistingBucket
+                             << ">, status=" << bucket.status();
+
+    google::cloud::StatusOr<gcs::ObjectMetadata> object =
+        client.InsertObject(kPreexistingBucket, kPreexistingObject, kLoremIpsum);
+    ASSERT_TRUE(object.ok()) << "Failed to create object <" << kPreexistingObject
+                             << ">, status=" << object.status();
+  }
+
+  static std::string PreexistingObjectPath() {
+    return std::string(kPreexistingBucket) + "/" + kPreexistingObject;
+  }
+
+  static std::string NotFoundObjectPath() {
+    return std::string(kPreexistingBucket) + "/not-found";
   }
 
   GcsOptions TestGcsOptions() {
@@ -114,7 +138,7 @@ TEST(GcsFileSystem, ToArrowStatus) {
       {google::cloud::StatusCode::kUnknown, StatusCode::UnknownError},
       {google::cloud::StatusCode::kInvalidArgument, StatusCode::Invalid},
       {google::cloud::StatusCode::kDeadlineExceeded, StatusCode::IOError},
-      {google::cloud::StatusCode::kNotFound, StatusCode::UnknownError},
+      {google::cloud::StatusCode::kNotFound, StatusCode::IOError},
      {google::cloud::StatusCode::kAlreadyExists, StatusCode::AlreadyExists},
      {google::cloud::StatusCode::kPermissionDenied, StatusCode::IOError},
      {google::cloud::StatusCode::kUnauthenticated, StatusCode::IOError},
@@ -159,11 +183,82 @@ TEST(GcsFileSystem, FileSystemCompare) {
   EXPECT_FALSE(a->Equals(*b));
 }
 
-TEST_F(GcsIntegrationTest, MakeBucket) {
+TEST_F(GcsIntegrationTest, GetFileInfoBucket) {
   auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());
   arrow::fs::AssertFileInfo(fs.get(), kPreexistingBucket, FileType::Directory);
 }
 
+TEST_F(GcsIntegrationTest, GetFileInfoObject) {
+  auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());
+  arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File);
+}
+
+TEST_F(GcsIntegrationTest, ReadObjectString) {
+  auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());
+
+  std::shared_ptr<io::InputStream> stream;
+  ASSERT_OK_AND_ASSIGN(stream, fs->OpenInputStream(PreexistingObjectPath()));
+
+  std::array<char, 1024> buffer{};
+  std::int64_t size;
+  ASSERT_OK_AND_ASSIGN(size, stream->Read(buffer.size(), buffer.data()));
+
+  EXPECT_EQ(std::string(buffer.data(), size), kLoremIpsum);
+}
+
+TEST_F(GcsIntegrationTest, ReadObjectStringBuffers) {
+  auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());
+
+  std::shared_ptr<io::InputStream> stream;
+  ASSERT_OK_AND_ASSIGN(stream, fs->OpenInputStream(PreexistingObjectPath()));
+
+  std::string contents;
+  std::shared_ptr<Buffer> buffer;
+  do {
+    ASSERT_OK_AND_ASSIGN(buffer, stream->Read(16));
+    contents.append(buffer->ToString());
+  } while (buffer && buffer->size() != 0);
+
+  EXPECT_EQ(contents, kLoremIpsum);
+}
+
+TEST_F(GcsIntegrationTest, ReadObjectInfo) {
+  auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());
+
+  arrow::fs::FileInfo info;
+  ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingObjectPath()));
+
+  std::shared_ptr<io::InputStream> stream;
+  ASSERT_OK_AND_ASSIGN(stream, fs->OpenInputStream(info));
+
+  std::array<char, 1024> buffer{};
+  std::int64_t size;
+  ASSERT_OK_AND_ASSIGN(size, stream->Read(buffer.size(), buffer.data()));
+
+  EXPECT_EQ(std::string(buffer.data(), size), kLoremIpsum);
+}
+
+TEST_F(GcsIntegrationTest, ReadObjectNotFound) {
+  auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());
+
+  auto result = fs->OpenInputStream(NotFoundObjectPath());
+  EXPECT_EQ(result.status().code(), StatusCode::IOError);
+}
+
+TEST_F(GcsIntegrationTest, ReadObjectInfoInvalid) {
+  auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());
+
+  arrow::fs::FileInfo info;
+  ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(kPreexistingBucket));
+
+  auto result = fs->OpenInputStream(info);
+  EXPECT_EQ(result.status().code(), StatusCode::IOError);
+
+  ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(NotFoundObjectPath()));
+  result = fs->OpenInputStream(info);
+  EXPECT_EQ(result.status().code(), StatusCode::IOError);
+}
+
 }  // namespace
 }  // namespace fs
 }  // namespace arrow
diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt
index db652691618e5..18b1beb4e6b29 100644
--- a/cpp/thirdparty/versions.txt
+++ b/cpp/thirdparty/versions.txt
@@ -53,8 +53,8 @@ ARROW_GOOGLE_CLOUD_CPP_BUILD_VERSION=v1.32.0
 ARROW_GOOGLE_CLOUD_CPP_BUILD_SHA256_CHECKSUM=c62338389f76915179fe61d8c0f5fefa06131b4e0d7312707af5309b1394e099
 ARROW_GRPC_BUILD_VERSION=v1.35.0
 ARROW_GRPC_BUILD_SHA256_CHECKSUM=27dd2fc5c9809ddcde8eb6fa1fa278a3486566dfc28335fca13eb8df8bd3b958
-ARROW_GTEST_BUILD_VERSION=1.11.0
-ARROW_GTEST_BUILD_SHA256_CHECKSUM=b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab299a031c1034eddc93d5
+ARROW_GTEST_BUILD_VERSION=1.10.0
+ARROW_GTEST_BUILD_SHA256_CHECKSUM=9dc9157a9a1551ec7a7e43daea9a694a0bb5fb8bec81235d8a1e6ef64c716dcb
 ARROW_JEMALLOC_BUILD_VERSION=5.2.1
 ARROW_JEMALLOC_BUILD_SHA256_CHECKSUM=34330e5ce276099e2e8950d9335db5a875689a4c6a56751ef3b1d8c537f887f6
 ARROW_LZ4_BUILD_VERSION=v1.9.3
diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst
index 000bd3d0abf11..5ca6e7a754e9c 100644
--- a/docs/source/python/dataset.rst
+++ b/docs/source/python/dataset.rst
@@ -583,28 +583,28 @@ which columns are used to partition the dataset. This is useful when you expect
 query your data in specific ways and you can utilize partitioning to reduce the
 amount of data you need to read.
 
-.. To add when ARROW-12364 is merged
-   Customizing & inspecting written files
-   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-   By default the dataset API will create files named "part-i.format" where "i" is a integer
-   generated during the write and "format" is the file format specified in the write_dataset
-   call. For simple datasets it may be possible to know which files will be created but for
-   larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used
-   to supply a visitor that will be called as each file is created:
-
-   .. ipython:: python
-
-       def file_visitor(written_file):
-           print(f"path={written_file.path}")
-           print(f"metadata={written_file.metadata}")
-       ds.write_dataset(table, dataset_root, format="parquet", partitioning=part,
-                        file_visitor=file_visitor)
-
-   This will allow you to collect the filenames that belong to the dataset and store them elsewhere
-   which can be useful when you want to avoid scanning directories the next time you need to read
-   the data. It can also be used to generate the _metadata index file used by other tools such as
-   dask or spark to create an index of the dataset.
+Customizing & inspecting written files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default the dataset API will create files named "part-i.format" where "i" is an integer
+generated during the write and "format" is the file format specified in the write_dataset
+call. For simple datasets it may be possible to know which files will be created but for
+larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used
+to supply a visitor that will be called as each file is created:
+
+.. ipython:: python
+
+    def file_visitor(written_file):
+        print(f"path={written_file.path}")
+        print(f"metadata={written_file.metadata}")
+
+    ds.write_dataset(table, base / "dataset_visited", format="parquet", partitioning=part,
+                     file_visitor=file_visitor)
+
+This will allow you to collect the filenames that belong to the dataset and store them elsewhere
+which can be useful when you want to avoid scanning directories the next time you need to read
+the data. It can also be used to generate the _metadata index file used by other tools such as
+dask or spark to create an index of the dataset.
 
 Configuring format-specific parameters during a write
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/go/arrow/array/compare.go b/go/arrow/array/compare.go
index c4ee0467067b1..89c81ef10eb3f 100644
--- a/go/arrow/array/compare.go
+++ b/go/arrow/array/compare.go
@@ -65,6 +65,136 @@ func RecordApproxEqual(left, right Record, opts ...EqualOption) bool {
 	return true
 }
 
+// chunkedBinaryApply is a helper to evaluate a function on two chunked arrays that may have
+// different chunk layouts. The function is called for each corresponding pair of slices of
+// the two chunked arrays; if it returns false, the loop ends early.
+func chunkedBinaryApply(left, right *Chunked, fn func(left Interface, lbeg, lend int64, right Interface, rbeg, rend int64) bool) {
+	var (
+		pos               int64
+		length            int64 = int64(left.length)
+		leftIdx, rightIdx int
+		leftPos, rightPos int64
+	)
+
+	for pos < length {
+		var cleft, cright Interface
+		for {
+			cleft, cright = left.Chunk(leftIdx), right.Chunk(rightIdx)
+			if leftPos == int64(cleft.Len()) {
+				leftPos = 0
+				leftIdx++
+				continue
+			}
+			if rightPos == int64(cright.Len()) {
+				rightPos = 0
+				rightIdx++
+				continue
+			}
+			break
+		}
+
+		sz := int64(min(cleft.Len()-int(leftPos), cright.Len()-int(rightPos)))
+		pos += sz
+		if !fn(cleft, leftPos, leftPos+sz, cright, rightPos, rightPos+sz) {
+			return
+		}
+
+		leftPos += sz
+		rightPos += sz
+	}
+}
+
+// ChunkedEqual reports whether two chunked arrays are equal regardless of their chunkings
+func ChunkedEqual(left, right *Chunked) bool {
+	switch {
+	case left == right:
+		return true
+	case left.length != right.length:
+		return false
+	case left.nulls != right.nulls:
+		return false
+	case !arrow.TypeEqual(left.dtype, right.dtype):
+		return false
+	}
+
+	var isequal bool
+	chunkedBinaryApply(left, right, func(left Interface, lbeg, lend int64, right Interface, rbeg, rend int64) bool {
+		isequal = ArraySliceEqual(left, lbeg, lend, right, rbeg, rend)
+		return isequal
+	})
+
+	return isequal
+}
+
+// ChunkedApproxEqual reports whether two chunked arrays are approximately equal regardless of their chunkings.
+// For non-floating point arrays, this is equivalent to ChunkedEqual.
+func ChunkedApproxEqual(left, right *Chunked, opts ...EqualOption) bool {
+	switch {
+	case left == right:
+		return true
+	case left.length != right.length:
+		return false
+	case left.nulls != right.nulls:
+		return false
+	case !arrow.TypeEqual(left.dtype, right.dtype):
+		return false
+	}
+
+	var isequal bool
+	chunkedBinaryApply(left, right, func(left Interface, lbeg, lend int64, right Interface, rbeg, rend int64) bool {
+		isequal = ArraySliceApproxEqual(left, lbeg, lend, right, rbeg, rend, opts...)
+		return isequal
+	})
+
+	return isequal
+}
+
+// TableEqual reports whether the two tables have the same data in the same schema.
+func TableEqual(left, right Table) bool {
+	switch {
+	case left.NumCols() != right.NumCols():
+		return false
+	case left.NumRows() != right.NumRows():
+		return false
+	}
+
+	for i := 0; int64(i) < left.NumCols(); i++ {
+		lc := left.Column(i)
+		rc := right.Column(i)
+		if !lc.field.Equal(rc.field) {
+			return false
+		}
+
+		if !ChunkedEqual(lc.data, rc.data) {
+			return false
+		}
+	}
+	return true
+}
+
+// TableApproxEqual reports whether the two tables have approximately equal data in the same schema.
+func TableApproxEqual(left, right Table, opts ...EqualOption) bool {
+	switch {
+	case left.NumCols() != right.NumCols():
+		return false
+	case left.NumRows() != right.NumRows():
+		return false
+	}
+
+	for i := 0; int64(i) < left.NumCols(); i++ {
+		lc := left.Column(i)
+		rc := right.Column(i)
+		if !lc.field.Equal(rc.field) {
+			return false
+		}
+
+		if !ChunkedApproxEqual(lc.data, rc.data, opts...) {
+			return false
+		}
+	}
+	return true
+}
+
 // ArrayEqual reports whether the two provided arrays are equal.
 func ArrayEqual(left, right Interface) bool {
 	switch {
@@ -188,6 +318,16 @@ func ArraySliceEqual(left Interface, lbeg, lend int64, right Interface, rbeg, re
 	return ArrayEqual(l, r)
 }
 
+// ArraySliceApproxEqual reports whether slices left[lbeg:lend] and right[rbeg:rend] are approximately equal.
+func ArraySliceApproxEqual(left Interface, lbeg, lend int64, right Interface, rbeg, rend int64, opts ...EqualOption) bool {
+	l := NewSlice(left, lbeg, lend)
+	defer l.Release()
+	r := NewSlice(right, rbeg, rend)
+	defer r.Release()
+
+	return ArrayApproxEqual(l, r, opts...)
+}
+
 const defaultAbsoluteTolerance = 1e-5
 
 type equalOption struct {
diff --git a/go/arrow/array/compare_test.go b/go/arrow/array/compare_test.go
index 3ed326be1758c..4006087943962 100644
--- a/go/arrow/array/compare_test.go
+++ b/go/arrow/array/compare_test.go
@@ -21,10 +21,12 @@ import (
 	"math"
 	"testing"
 
+	"github.com/apache/arrow/go/arrow"
 	"github.com/apache/arrow/go/arrow/array"
 	"github.com/apache/arrow/go/arrow/float16"
 	"github.com/apache/arrow/go/arrow/internal/arrdata"
 	"github.com/apache/arrow/go/arrow/memory"
+	"github.com/stretchr/testify/assert"
 )
 
 func TestArrayEqual(t *testing.T) {
@@ -529,3 +531,82 @@ func TestRecordApproxEqual(t *testing.T) {
 		})
 	}
 }
+
+func TestChunkedEqual(t *testing.T) {
+	for name, recs := range arrdata.Records {
+		t.Run(name, func(t *testing.T) {
+			tbl := array.NewTableFromRecords(recs[0].Schema(), recs)
+			defer tbl.Release()
+
+			for i := 0; i < int(tbl.NumCols()); i++ {
+				if !array.ChunkedEqual(tbl.Column(i).Data(), tbl.Column(i).Data()) && name != "nulls" {
+					t.Fatalf("identical chunked arrays should compare as equal:\narr:%v\n", tbl.Column(i).Data())
+				}
+			}
+		})
+	}
+}
+
+func TestChunkedApproxEqual(t *testing.T) {
+	fb := array.NewFloat64Builder(memory.DefaultAllocator)
+	defer fb.Release()
+
+	fb.AppendValues([]float64{1, 2, 3, 4, 5}, nil)
+	f1 := fb.NewFloat64Array()
+	defer f1.Release()
+
+	fb.AppendValues([]float64{6, 7}, nil)
+	f2 := fb.NewFloat64Array()
+	defer f2.Release()
+
+	fb.AppendValues([]float64{8, 9, 10}, nil)
+	f3 := fb.NewFloat64Array()
+	defer f3.Release()
+
+	c1 := array.NewChunked(
+		arrow.PrimitiveTypes.Float64,
+		[]array.Interface{f1, f2, f3},
+	)
+	defer c1.Release()
+
+	fb.AppendValues([]float64{1, 2, 3}, nil)
+	f4 := fb.NewFloat64Array()
+	defer f4.Release()
+
+	fb.AppendValues([]float64{4, 5}, nil)
+	f5 := fb.NewFloat64Array()
+	defer f5.Release()
+
+	fb.AppendValues([]float64{6, 7, 8, 9}, nil)
+	f6 := fb.NewFloat64Array()
+	defer f6.Release()
+
+	fb.AppendValues([]float64{10}, nil)
+	f7 := fb.NewFloat64Array()
+	defer f7.Release()
+
+	c2 := array.NewChunked(
+		arrow.PrimitiveTypes.Float64,
+		[]array.Interface{f4, f5, f6, f7},
+	)
+	defer c2.Release()
+
+	assert.True(t, array.ChunkedEqual(c1, c2))
+	assert.True(t, array.ChunkedApproxEqual(c1, c2))
+}
+
+func TestTableEqual(t *testing.T) {
+	for name, recs := range arrdata.Records {
+		t.Run(name, func(t *testing.T) {
+			tbl := array.NewTableFromRecords(recs[0].Schema(), recs)
+			defer tbl.Release()
+
+			if !array.TableEqual(tbl, tbl) {
+				t.Fatalf("identical tables should compare as equal:\ntbl:%v\n", tbl)
+			}
+			if !array.TableApproxEqual(tbl, tbl) {
+				t.Fatalf("identical tables should compare as approx equal:\ntbl:%v\n", tbl)
+			}
+		})
+	}
+}
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 267ecbf0dbf72..8041b4e3cc744 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -1835,7 +1835,9 @@ def partitioning(self):
 columns : list
     If not None, only these columns will be read from the file. A column
     name may be a prefix of a nested field, e.g. 'a' will select 'a.b',
-    'a.c', and 'a.d.e'.
+    'a.c', and 'a.d.e'. If empty, no columns will be read. Note
+    that the table will still have the correct num_rows set despite having
+    no columns.
 use_threads : bool, default True
     Perform multi-threaded column reads.
 metadata : FileMetaData
diff --git a/r/Makefile b/r/Makefile
index 62e6130816d97..05cca5f1199cd 100644
--- a/r/Makefile
+++ b/r/Makefile
@@ -45,7 +45,7 @@ sync-cpp:
 	cp -p ../.env tools/dotenv
 	cp -p ../NOTICE.txt tools/
 	cp -p ../LICENSE.txt tools/
-	sed -i .bak "s/\.env/dotenv/g" tools/cpp/CMakeLists.txt
+	sed -i"" -e "s/\.env/dotenv/g" tools/cpp/CMakeLists.txt
 
 build: doc sync-cpp
 	R CMD build ${args} .
diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R
index 5933fd99ab5cc..95c7f7bd33db9 100644
--- a/r/R/dataset-write.R
+++ b/r/R/dataset-write.R
@@ -22,11 +22,9 @@
 #' make it much faster to read and query.
 #'
 #' @param dataset [Dataset], [RecordBatch], [Table], `arrow_dplyr_query`, or
-#' `data.frame`. If an `arrow_dplyr_query` or `grouped_df`,
-#' `schema` and `partitioning` will be taken from the result of any `select()`
-#' and `group_by()` operations done on the dataset. `filter()` queries will be
-#' applied to restrict written rows.
-#' Note that `select()`-ed columns may not be renamed.
+#' `data.frame`. If an `arrow_dplyr_query`, the query will be evaluated and
+#' the result will be written. This means that you can use `select()`, `filter()`,
+#' `mutate()`, etc. to transform the data before it is written if you need to.
 #' @param path string path, URI, or `SubTreeFileSystem` referencing a directory
 #' to write to (directory will be created if it does not exist)
 #' @param format a string identifier of the file format. Default is to use
diff --git a/r/man/arrow-package.Rd b/r/man/arrow-package.Rd
index 021762162b7cb..122f7682e1720 100644
--- a/r/man/arrow-package.Rd
+++ b/r/man/arrow-package.Rd
@@ -6,11 +6,7 @@
 \alias{arrow-package}
 \title{arrow: Integration to 'Apache' 'Arrow'}
 \description{
-'Apache' 'Arrow' is a cross-language
-    development platform for in-memory data. It specifies a standardized
-    language-independent columnar memory format for flat and hierarchical data,
-    organized for efficient analytic operations on modern hardware. This
-    package provides an interface to the 'Arrow C++' library.
+'Apache' 'Arrow' is a cross-language development platform for in-memory data. It specifies a standardized language-independent columnar memory format for flat and hierarchical data, organized for efficient analytic operations on modern hardware. This package provides an interface to the 'Arrow C++' library.
 }
 \seealso{
 Useful links:
diff --git a/r/man/write_dataset.Rd b/r/man/write_dataset.Rd
index 6f36f8e72e8c6..219cc834510ea 100644
--- a/r/man/write_dataset.Rd
+++ b/r/man/write_dataset.Rd
@@ -16,11 +16,9 @@ write_dataset(
 }
 \arguments{
 \item{dataset}{\link{Dataset}, \link{RecordBatch}, \link{Table}, \code{arrow_dplyr_query}, or
-\code{data.frame}. If an \code{arrow_dplyr_query} or \code{grouped_df},
-\code{schema} and \code{partitioning} will be taken from the result of any \code{select()}
-and \code{group_by()} operations done on the dataset. \code{filter()} queries will be
-applied to restrict written rows.
-Note that \code{select()}-ed columns may not be renamed.}
+\code{data.frame}. If an \code{arrow_dplyr_query}, the query will be evaluated and
+the result will be written. This means that you can use \code{select()}, \code{filter()},
+\code{mutate()}, etc. to transform the data before it is written if you need to.}
 
 \item{path}{string path, URI, or \code{SubTreeFileSystem} referencing a directory
 to write to (directory will be created if it does not exist)}
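
Reviewer note, not part of the patch: the sketch below shows one way the OpenInputStream entry points added in gcsfs.cc might be consumed, mirroring the read loop exercised by the ReadObjectStringBuffers test. It assumes a GcsFileSystem (or any other arrow::fs::FileSystem) has already been constructed elsewhere and that `path` names an existing object; the helper name ReadWholeObject and the 16-byte chunk size are illustrative choices, not part of the Arrow API or this PR.

// Illustrative sketch only -- not part of the patch.
#include <memory>
#include <string>

#include "arrow/buffer.h"
#include "arrow/filesystem/filesystem.h"
#include "arrow/io/interfaces.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Result<std::string> ReadWholeObject(arrow::fs::FileSystem* fs,
                                           const std::string& path) {
  // OpenInputStream maps GCS errors through ToArrowStatus, so with this patch a
  // missing object surfaces here as an IOError rather than UnknownError.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::io::InputStream> stream,
                        fs->OpenInputStream(path));
  std::string contents;
  while (true) {
    // Read(nbytes) may return fewer bytes than requested; an empty buffer
    // signals that the stream is exhausted.
    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Buffer> buffer, stream->Read(16));
    if (buffer->size() == 0) break;
    contents.append(buffer->ToString());
  }
  RETURN_NOT_OK(stream->Close());
  return contents;
}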