diff --git a/Cargo.lock b/Cargo.lock index 9e5b10935..07b441d11 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -205,9 +205,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91839b07e474b3995035fd8ac33ee54f9c9ccbbb1ea33d9909c71bffdf1259d" +checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" dependencies = [ "arrow-arith", "arrow-array", @@ -226,31 +226,28 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "855c57c4efd26722b044dcd3e348252560e3e0333087fb9f6479dc0bf744054f" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "half", "num", ] [[package]] name = "arrow-array" -version = "53.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d45fe6d3faed0435b7313e59a02583b14c6c6339fa7729e94c32a20af319a79" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ "ahash 0.8.11", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "chrono-tz 0.10.0", + "chrono-tz 0.10.1", "half", "hashbrown 0.15.2", "num", @@ -258,9 +255,8 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "53.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b02656a35cc103f28084bc80a0159668e0a680d919cef127bd7e0aaccb06ec1" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ "bytes", "half", @@ -269,9 +265,8 @@ dependencies = 
[ [[package]] name = "arrow-cast" -version = "53.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c73c6233c5b5d635a56f6010e6eb1ab9e30e94707db21cea03da317f67d84cf3" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ "arrow-array", "arrow-buffer", @@ -290,28 +285,23 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789b2af43c1049b03a8d088ff6b2257cdcea1756cd76b174b1f2600356771b97" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ "arrow-array", - "arrow-buffer", "arrow-cast", - "arrow-data", "arrow-schema", "chrono", "csv", "csv-core", "lazy_static", - "lexical-core", "regex", ] [[package]] name = "arrow-data" -version = "53.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f2861ffa86f107b8ab577d86cff7c7a490243eabe961ba1e1af4f27542bb79" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ "arrow-buffer", "arrow-schema", @@ -321,13 +311,11 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0270dc511f11bb5fa98a25020ad51a99ca5b08d8a8dfbd17503bb9dba0388f0b" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-schema", "flatbuffers", @@ -336,9 +324,8 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b66ff2fedc1222942d0bd2fd391cb14a85baa3857be95c9373179bd616753b85" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ "arrow-array", "arrow-buffer", @@ -347,7 +334,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.7.0", + "indexmap 2.7.1", "lexical-core", "num", "serde", @@ -356,26 +343,21 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "53.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6f202a879d287099139ff0d121e7f55ae5e0efe634b8cf2106ebc27a8715dee" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "half", - "num", ] [[package]] name = "arrow-row" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745c114c8f0e8ce211c83389270de6fbe96a9088a7b32c2a041258a443fe83ff" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ - "ahash 0.8.11", "arrow-array", "arrow-buffer", "arrow-data", @@ -385,15 +367,13 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9579b9d8bce47aa41389fe344f2c6758279983b7c0ebb4013e283e3e91bb450e" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" [[package]] name = "arrow-select" -version = "53.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7471ba126d0b0aaa24b50a36bc6c25e4e74869a1fd1a5553357027a0b1c8d1f1" +version = "54.1.0" +source = 
"git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -405,9 +385,8 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72993b01cb62507b06f1fb49648d7286c8989ecfabdb7b77a750fcb54410731b" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ "arrow-array", "arrow-buffer", @@ -558,7 +537,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -588,6 +567,28 @@ dependencies = [ "wasm-bindgen-futures", ] +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] + [[package]] name = "async-task" version = "4.7.1" @@ -602,7 +603,7 @@ checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -628,9 +629,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "aws-config" -version = "1.5.15" +version = "1.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90" +checksum = 
"50236e4d60fe8458de90a71c0922c761e41755adf091b1b03de1cef537179915" dependencies = [ "aws-credential-types", "aws-runtime", @@ -670,9 +671,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac" +checksum = "76dd04d39cc12844c0994f2c9c5a6f5184c22e9188ec1ff723de41910a21dcad" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -695,9 +696,9 @@ dependencies = [ [[package]] name = "aws-sdk-glue" -version = "1.76.0" +version = "1.81.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c25c89d6efe63a398cb727b79c285e06184c432985a0d221df0f23d7d10f1f9" +checksum = "181b6fd3c6ef2300129a2e8b7c5d0c5da4156d059f025dd083aef3214f644b80" dependencies = [ "aws-credential-types", "aws-runtime", @@ -717,9 +718,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3tables" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "858cb35e9b97355d1013526de4ca5e09ab8ad9f71f1d4edf65425205a898220c" +checksum = "82e0218f51b519086a7cc8a8fc86af78f4085e1934eb1007549c7296c9f204a5" dependencies = [ "aws-credential-types", "aws-runtime", @@ -739,9 +740,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.56.0" +version = "1.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12e057fdcb8842de9b83592a70f5b4da0ee10bc0ad278247da1425a742a444d7" +checksum = "16ff718c9ee45cc1ebd4774a0e086bb80a6ab752b4902edf1c9f56b86ee1f770" dependencies = [ "aws-credential-types", "aws-runtime", @@ -761,9 +762,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.57.1" +version = "1.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a120ade4a44691b3c5c2ff2fa61b14ed331fdc218397f61ab48d66593012ae2a" +checksum = 
"5183e088715cc135d8d396fdd3bc02f018f0da4c511f53cb8d795b6a31c55809" dependencies = [ "aws-credential-types", "aws-runtime", @@ -783,9 +784,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.57.0" +version = "1.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "115fd4fb663817ed595a5ee4f1649d7aacd861d47462323cb37576ce89271b93" +checksum = "c9f944ef032717596639cea4a2118a3a457268ef51bbb5fde9637e54c465da00" dependencies = [ "aws-credential-types", "aws-runtime", @@ -806,9 +807,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.7" +version = "1.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05" +checksum = "0bc5bbd1e4a2648fd8c5982af03935972c24a2f9846b396de661d351ee3ce837" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -879,9 +880,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.7" +version = "1.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e" +checksum = "d526a12d9ed61fadefda24abe2e682892ba288c2018bcb38b1b4c111d13f6d92" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -923,9 +924,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.12" +version = "1.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97" +checksum = "c7b8a53819e42f10d0821f56da995e1470b199686a1809168db6ca485665f042" dependencies = [ "base64-simd", "bytes", @@ -958,9 +959,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.4" +version = "1.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2" +checksum = 
"dfbd0a668309ec1f66c0f6bda4840dd6d4796ae26d699ebc266d7cc95c6d040f" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -1052,9 +1053,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.7.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1be3f42a67d6d345ecd59f675f3f012d6974981560836e938c22b424b85ce1be" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" dependencies = [ "serde", ] @@ -1126,9 +1127,9 @@ dependencies = [ [[package]] name = "borsh" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb65153674e51d3a42c8f27b05b9508cea85edfaade8aa46bc8fc18cecdfef3" +checksum = "5430e3be710b68d984d1391c854eb431a9d548640711faa54eecb1df93db91cc" dependencies = [ "borsh-derive", "cfg_aliases", @@ -1136,15 +1137,15 @@ dependencies = [ [[package]] name = "borsh-derive" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a396e17ad94059c650db3d253bb6e25927f1eb462eede7e7a153bb6e75dce0a7" +checksum = "f8b668d39970baad5356d7c83a86fee3a539e6f93bf6764c97368243e17a0487" dependencies = [ "once_cell", "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1160,9 +1161,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "4.0.1" +version = "4.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362" +checksum = "74fa05ad7d803d413eb8380983b092cbbaf9a85f151b871360e7b00cd7060b37" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -1180,9 +1181,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.16.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "bytecheck" @@ -1214,9 +1215,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" +checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" dependencies = [ "serde", ] @@ -1273,9 +1274,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.9" +version = "1.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b" +checksum = "c7777341816418c02e033934a09f20dc0ccaf65a5201ef8a450ae0105a573fda" dependencies = [ "jobserver", "libc", @@ -1322,9 +1323,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" +checksum = "9c6ac4f2c0bf0f44e9161aec9675e1050aa4a530663c4a9e37e108fa948bca9f" dependencies = [ "chrono", "chrono-tz-build 0.4.0", @@ -1380,12 +1381,11 @@ dependencies = [ [[package]] name = "comfy-table" -version = "7.1.3" +version = "7.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9" +checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" dependencies = [ - "strum", - "strum_macros", + "unicode-segmentation", "unicode-width", ] @@ -1457,9 +1457,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ "libc", ] @@ -1542,9 +1542,9 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" [[package]] name = "crypto-common" @@ -1584,7 +1584,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a2785755761f3ddc1492979ce1e48d2c00d09311c39e4466429188f3dd6501" dependencies = [ "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1608,7 +1608,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1619,7 +1619,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1644,9 +1644,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "014fc8c384ecacedaabb3bc8359c2a6c6e9d8f7bea65be3434eccacfc37f52d9" +checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a" dependencies = [ "arrow", "arrow-array", @@ -1657,7 +1657,6 @@ dependencies = [ "bytes", "bzip2 0.5.0", "chrono", - "dashmap", "datafusion-catalog", "datafusion-common", "datafusion-common-runtime", @@ -1677,7 +1676,7 @@ dependencies = [ "flate2", "futures", "glob", - "itertools", + "itertools 0.14.0", "log", "object_store", "parking_lot", @@ -1696,33 +1695,41 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "44.0.0" +version = "45.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee60d33e210ef96070377ae667ece7caa0e959c8387496773d4a1a72f1a5012e" +checksum = "6f27987bc22b810939e8dfecc55571e9d50355d6ea8ec1c47af8383a76a6d0e1" dependencies = [ - "arrow-schema", + "arrow", "async-trait", + "dashmap", "datafusion-common", "datafusion-execution", "datafusion-expr", "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", "parking_lot", + "sqlparser", ] [[package]] name = "datafusion-common" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b42b7d720fe21ed9cca2ebb635f3f13a12cfab786b41e0fba184fb2e620525b" +checksum = "e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602" dependencies = [ "ahash 0.8.11", "arrow", "arrow-array", "arrow-buffer", + "arrow-ipc", "arrow-schema", + "base64 0.22.1", "half", "hashbrown 0.14.5", - "indexmap 2.7.0", + "indexmap 2.7.1", "libc", "log", "object_store", @@ -1736,9 +1743,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72fbf14d4079f7ce5306393084fe5057dddfdc2113577e0049310afa12e94281" +checksum = "0d4603c8e8a4baf77660ab7074cc66fc15cc8a18f2ce9dfadb755fc6ee294e48" dependencies = [ "log", "tokio", @@ -1746,15 +1753,15 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c278dbd64860ed0bb5240fc1f4cb6aeea437153910aea69bcf7d5a8d6d0454f3" +checksum = "e5bf4bc68623a5cf231eed601ed6eb41f46a37c4d15d11a0bff24cbc8396cd66" [[package]] name = "datafusion-execution" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e22cb02af47e756468b3cbfee7a83e3d4f2278d452deb4b033ba933c75169486" +checksum = 
"88b491c012cdf8e051053426013429a76f74ee3c2db68496c79c323ca1084d27" dependencies = [ "arrow", "dashmap", @@ -1771,9 +1778,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62298eadb1d15b525df1315e61a71519ffc563d41d5c3b2a30fda2d70f77b93c" +checksum = "e5a181408d4fc5dc22f9252781a8f39f2d0e5d1b33ec9bde242844980a2689c1" dependencies = [ "arrow", "chrono", @@ -1783,7 +1790,7 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", - "indexmap 2.7.0", + "indexmap 2.7.1", "paste", "recursive", "serde_json", @@ -1792,20 +1799,21 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dda7f73c5fc349251cd3dcb05773c5bf55d2505a698ef9d38dfc712161ea2f55" +checksum = "d1129b48e8534d8c03c6543bcdccef0b55c8ac0c1272a15a56c67068b6eb1885" dependencies = [ "arrow", "datafusion-common", - "itertools", + "itertools 0.14.0", + "paste", ] [[package]] name = "datafusion-functions" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd197f3b2975424d3a4898ea46651be855a46721a56727515dbd5c9e2fb597da" +checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b" dependencies = [ "arrow", "arrow-buffer", @@ -1821,7 +1829,7 @@ dependencies = [ "datafusion-macros", "hashbrown 0.14.5", "hex", - "itertools", + "itertools 0.14.0", "log", "md-5", "rand", @@ -1833,12 +1841,13 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aabbe48fba18f9981b134124381bee9e46f93518b8ad2f9721ee296cef5affb9" +checksum = 
"f3add7b1d3888e05e7c95f2b281af900ca69ebdcb21069ba679b33bde8b3b9d6" dependencies = [ "ahash 0.8.11", "arrow", + "arrow-buffer", "arrow-schema", "datafusion-common", "datafusion-doc", @@ -1855,9 +1864,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7a3fefed9c8c11268d446d924baca8cabf52fe32f73fdaa20854bac6473590c" +checksum = "6e18baa4cfc3d2f144f74148ed68a1f92337f5072b6dde204a0dbbdf3324989c" dependencies = [ "ahash 0.8.11", "arrow", @@ -1868,9 +1877,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6360f27464fab857bec698af39b2ae331dc07c8bf008fb4de387a19cdc6815a5" +checksum = "3ec5ee8cecb0dc370291279673097ddabec03a011f73f30d7f1096457127e03e" dependencies = [ "arrow", "arrow-array", @@ -1878,21 +1887,23 @@ dependencies = [ "arrow-ord", "arrow-schema", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", "datafusion-functions", "datafusion-functions-aggregate", + "datafusion-macros", "datafusion-physical-expr-common", - "itertools", + "itertools 0.14.0", "log", "paste", ] [[package]] name = "datafusion-functions-table" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c35c070eb705c12795dab399c3809f4dfbc290678c624d3989490ca9b8449c1" +checksum = "2c403ddd473bbb0952ba880008428b3c7febf0ed3ce1eec35a205db20efb2a36" dependencies = [ "arrow", "async-trait", @@ -1906,9 +1917,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52229bca26b590b140900752226c829f15fc1a99840e1ca3ce1a9534690b82a8" +checksum = 
"1ab18c2fb835614d06a75f24a9e09136d3a8c12a92d97c95a6af316a1787a9c5" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1923,9 +1934,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "367befc303b64a668a10ae6988a064a9289e1999e71a7f8e526b6e14d6bdd9d6" +checksum = "a77b73bc15e7d1967121fdc7a55d819bfb9d6c03766a6c322247dce9094a53a4" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1933,27 +1944,28 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5de3c8f386ea991696553afe241a326ecbc3c98a12c562867e4be754d3a060c" +checksum = "09369b8d962291e808977cf94d495fd8b5b38647232d7ef562c27ac0f495b0af" dependencies = [ + "datafusion-expr", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] name = "datafusion-optimizer" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53b520413906f755910422b016fb73884ae6e9e1b376de4f9584b6c0e031da75" +checksum = "2403a7e4a84637f3de7d8d4d7a9ccc0cc4be92d89b0161ba3ee5be82f0531c54" dependencies = [ "arrow", "chrono", "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "indexmap 2.7.0", - "itertools", + "indexmap 2.7.1", + "itertools 0.14.0", "log", "recursive", "regex", @@ -1962,9 +1974,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acd6ddc378f6ad19af95ccd6790dec8f8e1264bc4c70e99ddc1830c1a1c78ccd" +checksum = "86ff72ac702b62dbf2650c4e1d715ebd3e4aab14e3885e72e8549e250307347c" dependencies = [ "ahash 0.8.11", "arrow", @@ -1978,8 +1990,8 @@ dependencies = [ "datafusion-physical-expr-common", "half", "hashbrown 0.14.5", - "indexmap 
2.7.0", - "itertools", + "indexmap 2.7.1", + "itertools 0.14.0", "log", "paste", "petgraph", @@ -1987,40 +1999,46 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06e6c05458eccd74b4c77ed6a1fe63d52434240711de7f6960034794dad1caf5" +checksum = "60982b7d684e25579ee29754b4333057ed62e2cc925383c5f0bd8cab7962f435" dependencies = [ "ahash 0.8.11", "arrow", + "arrow-buffer", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", - "itertools", + "itertools 0.14.0", ] [[package]] name = "datafusion-physical-optimizer" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dc3a82190f49c37d377f31317e07ab5d7588b837adadba8ac367baad5dc2351" +checksum = "ac5e85c189d5238a5cf181a624e450c4cd4c66ac77ca551d6f3ff9080bac90bb" dependencies = [ "arrow", + "arrow-schema", "datafusion-common", "datafusion-execution", + "datafusion-expr", "datafusion-expr-common", "datafusion-physical-expr", + "datafusion-physical-expr-common", "datafusion-physical-plan", - "itertools", + "futures", + "itertools 0.14.0", "log", "recursive", + "url", ] [[package]] name = "datafusion-physical-plan" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6608bc9844b4ddb5ed4e687d173e6c88700b1d0482f43894617d18a1fe75da" +checksum = "c36bf163956d7e2542657c78b3383fdc78f791317ef358a359feffcdb968106f" dependencies = [ "ahash 0.8.11", "arrow", @@ -2040,8 +2058,8 @@ dependencies = [ "futures", "half", "hashbrown 0.14.5", - "indexmap 2.7.0", - "itertools", + "indexmap 2.7.1", + "itertools 0.14.0", "log", "parking_lot", "pin-project-lite", @@ -2050,9 +2068,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6a884061c79b33d0c8e84a6f4f4be8bdc12c0f53f5af28ddf5d6d95ac0b15fdc" +checksum = "e13caa4daede211ecec53c78b13c503b592794d125f9a3cc3afe992edf9e7f43" dependencies = [ "arrow", "arrow-array", @@ -2060,7 +2078,7 @@ dependencies = [ "bigdecimal", "datafusion-common", "datafusion-expr", - "indexmap 2.7.0", + "indexmap 2.7.1", "log", "recursive", "regex", @@ -2117,7 +2135,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2127,7 +2145,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2162,7 +2180,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2290,21 +2308,21 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "faststr" -version = "0.2.27" +version = "0.2.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9154486833a83cb5d99de8c4d831314b8ae810dd4ef18d89ceb7a9c7c728dd74" +checksum = "16c6338e632ed4711dd1327f6dc607e72e3f02a591ddd46f2bbee878f2d93c65" dependencies = [ "bytes", - "rkyv 0.8.9", + "rkyv 0.8.10", "serde", "simdutf8", ] [[package]] name = "fixedbitset" -version = "0.4.2" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" @@ -2444,7 +2462,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2556,7 +2574,7 @@ version = "0.9.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" dependencies = [ - "bitflags 2.7.0", + "bitflags 2.8.0", "ignore", "walkdir", ] @@ -2585,7 +2603,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap 2.7.0", + "indexmap 2.7.1", "slab", "tokio", "tokio-util", @@ -2604,7 +2622,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.2.0", - "indexmap 2.7.0", + "indexmap 2.7.1", "slab", "tokio", "tokio-util", @@ -2776,9 +2794,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.9.5" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" +checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" [[package]] name = "httpdate" @@ -2827,9 +2845,9 @@ dependencies = [ [[package]] name = "hyper" -version = "1.5.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", @@ -2870,9 +2888,9 @@ checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" dependencies = [ "futures-util", "http 1.2.0", - "hyper 1.5.2", + "hyper 1.6.0", "hyper-util", - "rustls 0.23.21", + "rustls 0.23.22", "rustls-pki-types", "tokio", "tokio-rustls 0.26.1", @@ -2891,7 +2909,7 @@ dependencies = [ "futures-util", "http 1.2.0", "http-body 1.0.1", - "hyper 1.5.2", + "hyper 1.6.0", "pin-project-lite", "socket2", "tokio", @@ -2937,6 +2955,7 @@ dependencies = [ "arrow-select", "arrow-string", "async-std", + "async-stream", "async-trait", "bimap", "bitvec", @@ -2949,7 +2968,7 @@ dependencies = [ "futures", "iceberg-catalog-memory", "iceberg_test_utils", - "itertools", + "itertools 0.13.0", "moka", "murmur3", 
"num-bigint", @@ -3024,7 +3043,7 @@ dependencies = [ "async-trait", "futures", "iceberg", - "itertools", + "itertools 0.13.0", "regex", "serde_json", "tempfile", @@ -3042,7 +3061,7 @@ dependencies = [ "http 1.2.0", "iceberg", "iceberg_test_utils", - "itertools", + "itertools 0.13.0", "log", "mockito", "port_scanner", @@ -3065,9 +3084,10 @@ dependencies = [ "aws-sdk-s3tables", "iceberg", "iceberg_test_utils", - "itertools", + "itertools 0.13.0", "serde_json", "tokio", + "typed-builder 0.20.0", "uuid", ] @@ -3078,7 +3098,7 @@ dependencies = [ "async-trait", "iceberg", "iceberg_test_utils", - "itertools", + "itertools 0.13.0", "regex", "serde_json", "sqlx", @@ -3252,7 +3272,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3311,9 +3331,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", "hashbrown 0.15.2", @@ -3348,9 +3368,9 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.10.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "is_terminal_polyfill" @@ -3367,6 +3387,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.14" @@ -3394,11 +3423,11 @@ dependencies = [ [[package]] name = 
"jsonwebtoken" -version = "9.3.0" +version = "9.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9ae10193d25051e74945f1ea2d0b42e03cc3b890f7e4cc5faa44997d808193f" +checksum = "5a87cc7a48537badeae96744432de36f4be2b4a34a05a5ef32e9dd8a1c169dde" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "js-sys", "pem", "ring", @@ -3703,7 +3732,7 @@ dependencies = [ "http 1.2.0", "http-body 1.0.1", "http-body-util", - "hyper 1.5.2", + "hyper 1.6.0", "hyper-util", "log", "rand", @@ -3776,7 +3805,7 @@ checksum = "1bb5c1d8184f13f7d0ccbeeca0def2f9a181bce2624302793005f5ca8aa62e5e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3797,7 +3826,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" dependencies = [ - "bitflags 2.7.0", + "bitflags 2.8.0", "cfg-if", "cfg_aliases", "libc", @@ -3930,7 +3959,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3953,7 +3982,7 @@ dependencies = [ "chrono", "futures", "humantime", - "itertools", + "itertools 0.13.0", "parking_lot", "percent-encoding", "snafu", @@ -3965,15 +3994,15 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.20.2" +version = "1.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" [[package]] name = "opendal" -version = "0.51.1" +version = "0.51.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9dcfa7a3615e3c60eb662ed6b46b6f244cf2658098f593c0c0915430b3a268" +checksum = "5b1063ea459fa9e94584115743b06330f437902dd1d9f692b863ef1875a20548" dependencies = [ "anyhow", "async-trait", @@ -4000,9 +4029,9 @@ dependencies = [ [[package]] name = "openssl-probe" 
-version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "ordered-float" @@ -4036,9 +4065,9 @@ dependencies = [ [[package]] name = "outref" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" [[package]] name = "overload" @@ -4077,9 +4106,8 @@ dependencies = [ [[package]] name = "parquet" -version = "53.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8957c0c95a6a1804f3e51a18f69df29be53856a8c5768cc9b6d00fcafcd2917c" +version = "54.1.0" +source = "git+https://github.com/rshkv/arrow-rs?branch=wr/map-builder-with-key-field#0cd50a26529c880d54aa2dd363fe126439a885b9" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -4103,6 +4131,7 @@ dependencies = [ "object_store", "paste", "seq-macro", + "simdutf8", "snap", "thrift", "tokio", @@ -4192,7 +4221,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4208,12 +4237,12 @@ dependencies = [ [[package]] name = "petgraph" -version = "0.6.5" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ "fixedbitset", - "indexmap 2.7.0", + "indexmap 2.7.1", ] [[package]] @@ -4279,22 +4308,22 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e2ec53ad785f4d35dac0adea7f7dc6f1bb277ad84a680c7afefeae05d1f5916" 
+checksum = "dfe2e71e1471fe07709406bf725f710b02927c9c54b2b5b2ec0e8087d97c327d" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d56a66c0c55993aa927429d0f8a0abfd74f084e4d9c192cffed01e418d83eefb" +checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4480,7 +4509,7 @@ checksum = "ca414edb151b4c8d125c12566ab0d74dc9cdba36fb80eb7b848c15f495fd32d1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4520,7 +4549,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.21", + "rustls 0.23.22", "socket2", "thiserror 2.0.11", "tokio", @@ -4538,7 +4567,7 @@ dependencies = [ "rand", "ring", "rustc-hash", - "rustls 0.23.21", + "rustls 0.23.22", "rustls-pki-types", "slab", "thiserror 2.0.11", @@ -4634,7 +4663,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4643,7 +4672,7 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ - "bitflags 2.7.0", + "bitflags 2.8.0", ] [[package]] @@ -4663,7 +4692,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4774,7 +4803,7 @@ dependencies = [ "http 1.2.0", "http-body 1.0.1", "http-body-util", - "hyper 1.5.2", + "hyper 1.6.0", "hyper-rustls 0.27.5", "hyper-util", "ipnet", @@ -4785,7 +4814,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.21", + "rustls 0.23.22", 
"rustls-pemfile 2.2.0", "rustls-pki-types", "serde", @@ -4841,18 +4870,18 @@ dependencies = [ [[package]] name = "rkyv" -version = "0.8.9" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b11a153aec4a6ab60795f8ebe2923c597b16b05bb1504377451e705ef1a45323" +checksum = "1e147371c75553e1e2fcdb483944a8540b8438c31426279553b9a8182a9b7b65" dependencies = [ "bytes", "hashbrown 0.15.2", - "indexmap 2.7.0", + "indexmap 2.7.1", "munge", "ptr_meta 0.3.0", "rancor", "rend 0.5.2", - "rkyv_derive 0.8.9", + "rkyv_derive 0.8.10", "tinyvec", "uuid", ] @@ -4870,13 +4899,13 @@ dependencies = [ [[package]] name = "rkyv_derive" -version = "0.8.9" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "beb382a4d9f53bd5c0be86b10d8179c3f8a14c30bf774ff77096ed6581e35981" +checksum = "246b40ac189af6c675d124b802e8ef6d5246c53e17367ce9501f8f66a81abb7a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4941,9 +4970,9 @@ checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" dependencies = [ "rand", ] @@ -4959,11 +4988,11 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.43" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.7.0", + "bitflags 2.8.0", "errno", "libc", "linux-raw-sys", @@ -4984,9 +5013,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.21" +version = "0.23.22" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8" +checksum = "9fb9263ab4eb695e42321db096e3b8fbd715a59b154d5c88d82db2175b681ba7" dependencies = [ "once_cell", "ring", @@ -5028,9 +5057,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" dependencies = [ "web-time", ] @@ -5064,9 +5093,9 @@ checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" [[package]] name = "salsa20" @@ -5140,7 +5169,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags 2.7.0", + "bitflags 2.8.0", "core-foundation", "core-foundation-sys", "libc", @@ -5159,9 +5188,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba" +checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" [[package]] name = "seq-macro" @@ -5195,7 +5224,7 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5218,7 +5247,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5243,7 +5272,7 @@ dependencies = [ "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.7.0", + "indexmap 2.7.1", "serde", "serde_derive", "serde_json", @@ -5260,7 +5289,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5327,9 +5356,9 @@ checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" [[package]] name = "similar" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1de1d4f81173b03af4c0cbed3c898f6bff5b870e4a7f5d6f4057d62a7a4b686e" +checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" [[package]] name = "simple_asn1" @@ -5395,7 +5424,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5496,7 +5525,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5529,12 +5558,12 @@ dependencies = [ "futures-util", "hashbrown 0.15.2", "hashlink", - "indexmap 2.7.0", + "indexmap 2.7.1", "log", "memchr", "once_cell", "percent-encoding", - "rustls 0.23.21", + "rustls 0.23.22", "rustls-pemfile 2.2.0", "serde", "serde_json", @@ -5558,7 +5587,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5579,7 +5608,7 @@ dependencies = [ "sha2", "sqlx-core", "sqlx-sqlite", - "syn 2.0.96", + "syn 2.0.98", "tempfile", "tokio", "url", @@ -5593,7 +5622,7 @@ checksum = "4560278f0e00ce64938540546f59f590d60beee33fffbd3b9cd47851e5fff233" dependencies = [ "atoi", "base64 0.22.1", - "bitflags 2.7.0", + "bitflags 2.8.0", "byteorder", "bytes", "crc", @@ -5634,7 +5663,7 @@ checksum = "c5b98a57f363ed6764d5b3a12bfedf62f07aa16e1856a7ddc2a0bb190a959613" dependencies = [ "atoi", "base64 0.22.1", - "bitflags 2.7.0", + "bitflags 2.8.0", 
"byteorder", "crc", "dotenvy", @@ -5744,7 +5773,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5766,9 +5795,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.96" +version = "2.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" +checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" dependencies = [ "proc-macro2", "quote", @@ -5792,7 +5821,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5869,7 +5898,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5880,7 +5909,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5995,7 +6024,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -6014,7 +6043,7 @@ version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ - "rustls 0.23.21", + "rustls 0.23.22", "tokio", ] @@ -6050,11 +6079,11 @@ checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" [[package]] name = "toml_edit" -version = "0.22.22" +version = "0.22.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +checksum = "02a8b472d1a3d7c18e2d61a489aee3453fd9031c33e4f55bd533f4a7adca1bee" dependencies = [ - "indexmap 2.7.0", + "indexmap 2.7.1", "toml_datetime", 
"winnow", ] @@ -6106,7 +6135,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -6196,7 +6225,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -6207,7 +6236,7 @@ checksum = "560b82d656506509d43abe30e0ba64c56b1953ab3d4fe7ba5902747a7a3cedd5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -6280,9 +6309,9 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" [[package]] name = "unicode-normalization" @@ -6354,19 +6383,19 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.12.1" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b" +checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0" dependencies = [ - "getrandom 0.2.15", + "getrandom 0.3.1", "serde", ] [[package]] name = "valuable" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" [[package]] name = "value-bag" @@ -6388,9 +6417,9 @@ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "volo" -version = "0.10.3" +version = "0.10.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6fcd0ebc301fa50b3f7d5fd2fb0e87f6bf01196f6f4ddefa3649275c18f35f4" +checksum = "9fdd22501d86e9fffb3f83e3dc5d948cd3b33f3cde8e5eccc92d2f1c36f8035c" dependencies = [ "async-broadcast", "dashmap", @@ -6405,7 +6434,7 @@ dependencies = [ "pin-project", "rand", "socket2", - "thiserror 1.0.69", + "thiserror 2.0.11", "tokio", "tokio-stream", "tower", @@ -6510,7 +6539,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "wasm-bindgen-shared", ] @@ -6545,7 +6574,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6594,9 +6623,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "0.26.7" +version = "0.26.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d642ff16b7e79272ae451b7322067cdc17cadf68c23264be9d94a32319efe7e" +checksum = "2210b291f7ea53617fbafcc4939f10914214ec15aace5ba62293a668f322c5c9" dependencies = [ "rustls-pki-types", ] @@ -6682,7 +6711,7 @@ checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -6693,7 +6722,7 @@ checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -6876,9 +6905,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.24" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a" +checksum = "86e376c75f4f43f44db463cf729e0d3acbf954d13e22c51e26e4c264b4ab545f" dependencies = [ "memchr", ] @@ -6889,7 +6918,7 @@ version = "0.33.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" dependencies = [ - "bitflags 2.7.0", + "bitflags 2.8.0", ] [[package]] @@ -6954,7 +6983,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "synstructure", ] @@ -6976,7 +7005,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -6996,7 +7025,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "synstructure", ] @@ -7025,7 +7054,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 36e88b485..6027d8c27 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,13 +41,13 @@ rust-version = "1.77.1" anyhow = "1.0.72" apache-avro = "0.17" array-init = "2" -arrow-arith = { version = "53.3.0" } -arrow-array = { version = "53.4.0" } -arrow-cast = { version = "53.4.0" } -arrow-ord = { version = "53.4.0" } -arrow-schema = { version = "53.4.0" } -arrow-select = { version = "53.4.0" } -arrow-string = { version = "53.4.0" } +arrow-arith = { version = "54.1.0" } +arrow-array = { version = "54.1.0" } +arrow-cast = { version = "54.1.0" } +arrow-ord = { version = "54.1.0" } +arrow-schema = { version = "54.1.0" } +arrow-select = { version = "54.1.0" } +arrow-string = { version = "54.1.0" } async-stream = "0.3.5" async-trait = "0.1.86" async-std = "1.12" @@ -58,7 +58,7 @@ bitvec = "1.0.1" bytes = "1.6" chrono = "0.4.38" ctor = "0.2.8" -datafusion = "44" +datafusion = "45.0.0" derive_builder = "0.20" either = "1" env_logger = "0.11.0" @@ -77,7 +77,7 @@ num-bigint = "0.4.6" once_cell = 
"1.19" opendal = "0.51.1" ordered-float = "4" -parquet = "53.4.0" +parquet = "54.1.0" paste = "1.0.15" pilota = "0.11.2" pretty_assertions = "1.4" @@ -103,3 +103,22 @@ hive_metastore = "0.1" tera = "1" zstd = "0.13.2" expect-test = "1" + +# Surely there's a better way to do this? +# https://github.com/apache/arrow-rs/pull/7101 +# TODO(rshkv): Remove obviously once that PR merges +[patch.crates-io] +arrow-arith = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +arrow-array = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +arrow-buffer = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +arrow-cast = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +arrow-csv = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +arrow-data = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +arrow-ipc = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +arrow-json = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +arrow-ord = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +arrow-row = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +arrow-schema = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +arrow-select = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +arrow-string = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } +parquet = { git = "https://github.com/rshkv/arrow-rs", branch = "wr/map-builder-with-key-field" } diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index 08d5efabe..df3677624 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ 
-52,6 +52,7 @@ arrow-schema = { workspace = true } arrow-select = { workspace = true } arrow-string = { workspace = true } async-std = { workspace = true, optional = true, features = ["attributes"] } +async-stream = { workspace = true } async-trait = { workspace = true } bimap = { workspace = true } bitvec = { workspace = true } @@ -85,6 +86,7 @@ uuid = { workspace = true } zstd = { workspace = true } [dev-dependencies] +arrow-cast = { workspace = true, features = ["prettyprint"] } ctor = { workspace = true } expect-test = { workspace = true } iceberg-catalog-memory = { workspace = true } diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index 41afd8ea4..1e95db825 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -827,6 +827,193 @@ get_parquet_stat_as_datum!(min); get_parquet_stat_as_datum!(max); +/// Utilities to deal with [arrow_array::builder] types in the Iceberg context. +pub(crate) mod builder { + use arrow_array::builder::*; + use arrow_array::cast::AsArray; + use arrow_array::types::*; + use arrow_array::{ArrayRef, Datum as ArrowDatum}; + use arrow_schema::{DataType, TimeUnit}; + use ordered_float::OrderedFloat; + + use crate::spec::{Literal, PrimitiveLiteral}; + use crate::{Error, ErrorKind}; + + /// A helper wrapping [ArrayBuilder] for building arrays without declaring the inner type at + /// compile-time when types are determined dynamically (e.g. based on some column type). + /// A [DataType] is given at construction time which is used to later downcast the inner array + /// and provided values. + pub(crate) struct AnyPrimitiveArrayBuilder { + data_type: DataType, + inner: Box, + } + + impl AnyPrimitiveArrayBuilder { + pub(crate) fn new(data_type: &DataType) -> Self { + Self { + data_type: data_type.clone(), + inner: make_builder(data_type, 0), + } + } + + pub(crate) fn finish(&mut self) -> ArrayRef { + self.inner.finish() + } + + /// Append an [[arrow_array::Datum]] value. 
+ pub(crate) fn append_datum(&mut self, value: &dyn ArrowDatum) -> crate::Result<()> { + let (array, is_scalar) = value.get(); + assert!(is_scalar, "Can only append scalar datum"); + + match array.data_type() { + DataType::Boolean => self + .builder::()? + .append_value(array.as_boolean().value(0)), + DataType::Int32 => self + .builder::()? + .append_value(array.as_primitive::().value(0)), + DataType::Int64 => self + .builder::()? + .append_value(array.as_primitive::().value(0)), + DataType::Float32 => self + .builder::()? + .append_value(array.as_primitive::().value(0)), + DataType::Float64 => self + .builder::()? + .append_value(array.as_primitive::().value(0)), + DataType::Decimal128(_, _) => self + .builder::()? + .append_value(array.as_primitive::().value(0)), + DataType::Date32 => self + .builder::()? + .append_value(array.as_primitive::().value(0)), + DataType::Time64(TimeUnit::Microsecond) => self + .builder::()? + .append_value(array.as_primitive::().value(0)), + DataType::Timestamp(TimeUnit::Microsecond, _) => self + .builder::()? + .append_value(array.as_primitive::().value(0)), + DataType::Timestamp(TimeUnit::Nanosecond, _) => self + .builder::()? + .append_value(array.as_primitive::().value(0)), + DataType::Utf8 => self + .builder::()? + .append_value(array.as_string::().value(0)), + DataType::FixedSizeBinary(_) => self + .builder::()? + .append_value(array.as_fixed_size_binary().value(0)), + DataType::LargeBinary => self + .builder::()? + .append_value(array.as_binary::().value(0)), + _ => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!("Cannot append data type: {:?}", array.data_type(),), + )); + } + } + Ok(()) + } + + /// Append a literal with the provided [DataType]. We're not solely relying on the literal to + /// infer the type because [Literal] values do not specify the expected type of builder. E.g., + /// a [PrimitiveLiteral::Long] may go into an array builder for longs but also for timestamps. 
+ pub(crate) fn append_literal(&mut self, value: &Literal) -> crate::Result<()> { + let Some(primitive) = value.as_primitive_literal() else { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + "Expected primitive type", + )); + }; + + match (&self.data_type, primitive.clone()) { + (DataType::Boolean, PrimitiveLiteral::Boolean(value)) => { + self.builder::()?.append_value(value) + } + (DataType::Int32, PrimitiveLiteral::Int(value)) => { + self.builder::()?.append_value(value) + } + (DataType::Int64, PrimitiveLiteral::Long(value)) => { + self.builder::()?.append_value(value) + } + (DataType::Float32, PrimitiveLiteral::Float(OrderedFloat(value))) => { + self.builder::()?.append_value(value) + } + (DataType::Float64, PrimitiveLiteral::Double(OrderedFloat(value))) => { + self.builder::()?.append_value(value) + } + (DataType::Utf8, PrimitiveLiteral::String(value)) => { + self.builder::()?.append_value(value) + } + (DataType::FixedSizeBinary(_), PrimitiveLiteral::Binary(value)) => self + .builder::()? + .append_value(value)?, + (DataType::LargeBinary, PrimitiveLiteral::Binary(value)) => { + self.builder::()?.append_value(value) + } + (_, _) => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!( + "Builder of type {:?} does not accept literal {:?}", + self.data_type, primitive + ), + )); + } + } + + Ok(()) + } + + /// Append a null value for the provided [DataType]. 
+ pub(crate) fn append_null(&mut self) -> crate::Result<()> { + match self.data_type { + DataType::Boolean => self.builder::()?.append_null(), + DataType::Int32 => self.builder::()?.append_null(), + DataType::Int64 => self.builder::()?.append_null(), + DataType::Float32 => self.builder::()?.append_null(), + DataType::Float64 => self.builder::()?.append_null(), + DataType::Decimal128(_, _) => self.builder::()?.append_null(), + DataType::Date32 => self.builder::()?.append_null(), + DataType::Time64(TimeUnit::Microsecond) => { + self.builder::()?.append_null() + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + self.builder::()?.append_null() + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + self.builder::()?.append_null() + } + DataType::Utf8 => self.builder::()?.append_null(), + DataType::FixedSizeBinary(_) => { + self.builder::()?.append_null() + } + DataType::LargeBinary => self.builder::()?.append_null(), + _ => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!( + "Cannot append null values for data type: {:?}", + self.data_type + ), + )) + } + } + Ok(()) + } + + /// Cast the `inner` builder to a specific type or return [Error]. + fn builder(&mut self) -> crate::Result<&mut T> { + self.inner.as_any_mut().downcast_mut::().ok_or_else(|| { + Error::new( + ErrorKind::Unexpected, + "Failed to cast builder to expected type", + ) + }) + } + } +} + impl TryFrom<&ArrowSchema> for crate::spec::Schema { type Error = Error; diff --git a/crates/iceberg/src/inspect/entries.rs b/crates/iceberg/src/inspect/entries.rs new file mode 100644 index 000000000..d515a18c3 --- /dev/null +++ b/crates/iceberg/src/inspect/entries.rs @@ -0,0 +1,910 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::{HashMap, HashSet}; +use std::string::ToString; +use std::sync::Arc; + +use arrow_array::builder::{ + Int32Builder, Int64Builder, LargeBinaryBuilder, ListBuilder, MapBuilder, MapFieldNames, + StringBuilder, +}; +use arrow_array::{ArrayRef, RecordBatch, StructArray}; +use arrow_schema::{DataType, Field, FieldRef, Fields}; +use async_stream::try_stream; +use futures::StreamExt; +use itertools::Itertools; +use parquet::arrow::PARQUET_FIELD_ID_META_KEY; + +use crate::arrow::builder::AnyPrimitiveArrayBuilder; +use crate::arrow::{ + get_arrow_datum, schema_to_arrow_schema, type_to_arrow_type, DEFAULT_MAP_FIELD_NAME, +}; +use crate::scan::ArrowRecordBatchStream; +use crate::spec::{ + join_schemas, DataFile, ManifestFile, NestedField, NestedFieldRef, PartitionField, + PartitionSpec, PartitionSpecRef, PrimitiveType, Struct, TableMetadata, Transform, Type, + MAP_KEY_FIELD_NAME, MAP_VALUE_FIELD_NAME, +}; +use crate::table::Table; +use crate::{Error, ErrorKind, Result}; + +/// Entries table containing the entries of the current snapshot's manifest files. +/// +/// The table has one row for each manifest file entry in the current snapshot's manifest list file. +/// For reference, see the Java implementation of [`ManifestEntry`][1]. 
+/// +/// [1]: https://github.com/apache/iceberg/blob/apache-iceberg-1.7.1/core/src/main/java/org/apache/iceberg/ManifestEntry.java +pub struct EntriesTable<'a> { + table: &'a Table, +} + +impl<'a> EntriesTable<'a> { + /// Create a new Entries table instance. + pub fn new(table: &'a Table) -> Self { + Self { table } + } + + /// Get the schema for the manifest entries table. + pub fn schema(&self) -> crate::spec::Schema { + let schema = self.manifest_entry_schema(); + let readable_metric_schema = ReadableMetricsStructBuilder::readable_metrics_schema( + self.table.metadata().current_schema(), + &schema, + ); + join_schemas(&schema, &readable_metric_schema).unwrap() + } + + fn manifest_entry_schema(&self) -> crate::spec::Schema { + let fields = vec![ + NestedField::required(0, "status", Type::Primitive(PrimitiveType::Int)), + NestedField::optional(1, "snapshot_id", Type::Primitive(PrimitiveType::Long)), + NestedField::optional(3, "sequence_number", Type::Primitive(PrimitiveType::Long)), + NestedField::optional( + 4, + "file_sequence_number", + Type::Primitive(PrimitiveType::Long), + ), + NestedField::required( + 2, + "data_file", + Type::Struct( + DataFileStructBuilder::schema(self.table.metadata()) + .as_struct() + .clone(), + ), + ), + ]; + crate::spec::Schema::builder() + .with_fields(fields.into_iter().map(Arc::new).collect_vec()) + .build() + .unwrap() + } + + /// Scan the manifest entries table. 
+ pub async fn scan(&self) -> Result { + let current_snapshot = self.table.metadata().current_snapshot().ok_or_else(|| { + Error::new( + ErrorKind::Unexpected, + "Cannot scan entries for table without current snapshot", + ) + })?; + + let manifest_list = current_snapshot + .load_manifest_list(self.table.file_io(), self.table.metadata()) + .await?; + + // Copy to ensure that the stream can take ownership of these dependencies + let schema = self.schema(); + let arrow_schema = Arc::new(schema_to_arrow_schema(&schema)?); + let table_metadata = self.table.metadata_ref(); + let file_io = Arc::new(self.table.file_io().clone()); + let readable_metrics_schema = schema + .field_by_name("readable_metrics") + .and_then(|field| field.field_type.clone().to_struct_type()) + .unwrap(); + + Ok(try_stream! { + for manifest_file in manifest_list.entries() { + let mut status = Int32Builder::new(); + let mut snapshot_id = Int64Builder::new(); + let mut sequence_number = Int64Builder::new(); + let mut file_sequence_number = Int64Builder::new(); + let mut data_file = DataFileStructBuilder::new(&table_metadata); + let mut readable_metrics = + ReadableMetricsStructBuilder::new( + table_metadata.current_schema(), &readable_metrics_schema); + + for manifest_entry in manifest_file.load_manifest(&file_io).await?.entries() { + status.append_value(manifest_entry.status() as i32); + snapshot_id.append_option(manifest_entry.snapshot_id()); + sequence_number.append_option(manifest_entry.sequence_number()); + file_sequence_number.append_option(manifest_entry.file_sequence_number()); + data_file.append(manifest_file, manifest_entry.data_file())?; + readable_metrics.append(manifest_entry.data_file())?; + } + + let batch = RecordBatch::try_new(arrow_schema.clone(), vec![ + Arc::new(status.finish()), + Arc::new(snapshot_id.finish()), + Arc::new(sequence_number.finish()), + Arc::new(file_sequence_number.finish()), + Arc::new(data_file.finish()), + Arc::new(readable_metrics.finish()), + ])?; + + yield 
batch; + } + } + .boxed()) + } +} + +/// Builds the struct describing data files listed in a table manifest. +/// +/// For reference, see the Java implementation of [`DataFile`][1]. +/// +/// [1]: https://github.com/apache/iceberg/blob/apache-iceberg-1.7.1/api/src/main/java/org/apache/iceberg/DataFile.java +struct DataFileStructBuilder<'a> { + // Reference to table metadata to retrieve partition specs based on partition spec ids + table_metadata: &'a TableMetadata, + // Below are the field builders of the "data_file" struct + content: Int32Builder, + file_path: StringBuilder, + file_format: StringBuilder, + partition: PartitionValuesStructBuilder, + record_count: Int64Builder, + file_size_in_bytes: Int64Builder, + column_sizes: MapBuilder, + value_counts: MapBuilder, + null_value_counts: MapBuilder, + nan_value_counts: MapBuilder, + lower_bounds: MapBuilder, + upper_bounds: MapBuilder, + key_metadata: LargeBinaryBuilder, + split_offsets: ListBuilder, + equality_ids: ListBuilder, + sort_order_ids: Int32Builder, +} + +impl<'a> DataFileStructBuilder<'a> { + fn new(table_metadata: &'a TableMetadata) -> Self { + let map_field_names = Some(MapFieldNames { + entry: DEFAULT_MAP_FIELD_NAME.to_string(), + key: MAP_KEY_FIELD_NAME.to_string(), + value: MAP_VALUE_FIELD_NAME.to_string(), + }); + + Self { + table_metadata, + content: Int32Builder::new(), + file_path: StringBuilder::new(), + file_format: StringBuilder::new(), + partition: PartitionValuesStructBuilder::new(table_metadata), + record_count: Int64Builder::new(), + file_size_in_bytes: Int64Builder::new(), + column_sizes: MapBuilder::new( + map_field_names.clone(), + Int32Builder::new(), + Int64Builder::new(), + ) + .with_keys_field(key_field(117, DataType::Int32)) + .with_values_field(value_field(118, DataType::Int64)), + value_counts: MapBuilder::new( + map_field_names.clone(), + Int32Builder::new(), + Int64Builder::new(), + ) + .with_keys_field(key_field(119, DataType::Int32)) + .with_values_field(value_field(120, 
DataType::Int64)), + null_value_counts: MapBuilder::new( + map_field_names.clone(), + Int32Builder::new(), + Int64Builder::new(), + ) + .with_keys_field(key_field(121, DataType::Int32)) + .with_values_field(value_field(122, DataType::Int64)), + nan_value_counts: MapBuilder::new( + map_field_names.clone(), + Int32Builder::new(), + Int64Builder::new(), + ) + .with_keys_field(key_field(138, DataType::Int32)) + .with_values_field(value_field(139, DataType::Int64)), + lower_bounds: MapBuilder::new( + map_field_names.clone(), + Int32Builder::new(), + LargeBinaryBuilder::new(), + ) + .with_keys_field(key_field(126, DataType::Int32)) + .with_values_field(value_field(127, DataType::LargeBinary)), + upper_bounds: MapBuilder::new( + map_field_names.clone(), + Int32Builder::new(), + LargeBinaryBuilder::new(), + ) + .with_keys_field(key_field(129, DataType::Int32)) + .with_values_field(value_field(130, DataType::LargeBinary)), + key_metadata: LargeBinaryBuilder::new(), + split_offsets: ListBuilder::new(Int64Builder::new()) + .with_field(list_field(133, DataType::Int64)), + equality_ids: ListBuilder::new(Int32Builder::new()) + .with_field(list_field(136, DataType::Int32)), + sort_order_ids: Int32Builder::new(), + } + } + + fn schema(table_metadata: &TableMetadata) -> crate::spec::Schema { + let partition_type = PartitionValuesStructBuilder::partition_type(table_metadata); + + let fields = vec![ + NestedField::required(134, "content", Type::Primitive(PrimitiveType::Int)), + NestedField::required(100, "file_path", Type::Primitive(PrimitiveType::String)), + NestedField::required(101, "file_format", Type::Primitive(PrimitiveType::String)), + NestedField::required(102, "partition", Type::Struct(partition_type)), + NestedField::required(103, "record_count", Type::Primitive(PrimitiveType::Long)), + NestedField::required( + 104, + "file_size_in_bytes", + Type::Primitive(PrimitiveType::Long), + ), + NestedField::required_map(108, "column_sizes") + .key(117, 
Type::Primitive(PrimitiveType::Int)) + .value(118, Type::Primitive(PrimitiveType::Long), true) + .build(), + NestedField::required_map(109, "value_counts") + .key(119, Type::Primitive(PrimitiveType::Int)) + .value(120, Type::Primitive(PrimitiveType::Long), true) + .build(), + NestedField::required_map(110, "null_value_counts") + .key(121, Type::Primitive(PrimitiveType::Int)) + .value(122, Type::Primitive(PrimitiveType::Long), true) + .build(), + NestedField::required_map(137, "nan_value_counts") + .key(138, Type::Primitive(PrimitiveType::Int)) + .value(139, Type::Primitive(PrimitiveType::Long), true) + .build(), + NestedField::required_map(125, "lower_bounds") + .key(126, Type::Primitive(PrimitiveType::Int)) + .value(127, Type::Primitive(PrimitiveType::Binary), true) + .build(), + NestedField::required_map(128, "upper_bounds") + .key(129, Type::Primitive(PrimitiveType::Int)) + .value(130, Type::Primitive(PrimitiveType::Binary), true) + .build(), + NestedField::optional(131, "key_metadata", Type::Primitive(PrimitiveType::Binary)), + NestedField::required_list(132, "split_offsets") + .element_field(133, Type::Primitive(PrimitiveType::Long), true) + .build(), + NestedField::required_list(135, "equality_ids") + .element_field(136, Type::Primitive(PrimitiveType::Int), true) + .build(), + NestedField::optional(140, "sort_order_id", Type::Primitive(PrimitiveType::Int)), + ]; + + crate::spec::Schema::builder() + .with_fields(fields.into_iter().map(Arc::new).collect_vec()) + .build() + .unwrap() + } + + fn append(&mut self, manifest_file: &ManifestFile, data_file: &DataFile) -> Result<()> { + self.content.append_value(data_file.content as i32); + self.file_path.append_value(data_file.file_path()); + self.file_format + .append_value(data_file.file_format().to_string().to_uppercase()); + self.partition.append( + self.partition_spec(manifest_file)?.clone().fields(), + data_file.partition(), + )?; + self.record_count + .append_value(data_file.record_count() as i64); + 
self.file_size_in_bytes + .append_value(data_file.file_size_in_bytes() as i64); + + // Sort keys to get matching order between rows + for (k, v) in data_file.column_sizes.iter().sorted_by_key(|(k, _)| *k) { + self.column_sizes.keys().append_value(*k); + self.column_sizes.values().append_value(*v as i64); + } + self.column_sizes.append(true)?; + + for (k, v) in data_file.value_counts.iter().sorted_by_key(|(k, _)| *k) { + self.value_counts.keys().append_value(*k); + self.value_counts.values().append_value(*v as i64); + } + self.value_counts.append(true)?; + + for (k, v) in data_file + .null_value_counts + .iter() + .sorted_by_key(|(k, _)| *k) + { + self.null_value_counts.keys().append_value(*k); + self.null_value_counts.values().append_value(*v as i64); + } + self.null_value_counts.append(true)?; + + for (k, v) in data_file.nan_value_counts.iter().sorted_by_key(|(k, _)| *k) { + self.nan_value_counts.keys().append_value(*k); + self.nan_value_counts.values().append_value(*v as i64); + } + self.nan_value_counts.append(true)?; + + for (k, v) in data_file.lower_bounds.iter().sorted_by_key(|(k, _)| *k) { + self.lower_bounds.keys().append_value(*k); + self.lower_bounds.values().append_value(v.to_bytes()?); + } + self.lower_bounds.append(true)?; + + for (k, v) in data_file.upper_bounds.iter().sorted_by_key(|(k, _)| *k) { + self.upper_bounds.keys().append_value(*k); + self.upper_bounds.values().append_value(v.to_bytes()?); + } + self.upper_bounds.append(true)?; + + self.key_metadata.append_option(data_file.key_metadata()); + + self.split_offsets + .values() + .append_slice(data_file.split_offsets()); + self.split_offsets.append(true); + + self.equality_ids + .values() + .append_slice(data_file.equality_ids()); + self.equality_ids.append(true); + + self.sort_order_ids.append_option(data_file.sort_order_id()); + Ok(()) + } + + fn partition_spec(&self, manifest_file: &ManifestFile) -> Result<&PartitionSpec> { + self.table_metadata + 
.partition_spec_by_id(manifest_file.partition_spec_id) + .ok_or_else(|| { + Error::new( + ErrorKind::Unexpected, + "Partition spec not found for manifest file", + ) + }) + .map(|spec| spec.as_ref()) + } + + fn finish(&mut self) -> StructArray { + let schema = schema_to_arrow_schema(&Self::schema(self.table_metadata)).unwrap(); + + let inner_arrays: Vec = vec![ + Arc::new(self.content.finish()), + Arc::new(self.file_path.finish()), + Arc::new(self.file_format.finish()), + Arc::new(self.partition.finish()), + Arc::new(self.record_count.finish()), + Arc::new(self.file_size_in_bytes.finish()), + Arc::new(self.column_sizes.finish()), + Arc::new(self.value_counts.finish()), + Arc::new(self.null_value_counts.finish()), + Arc::new(self.nan_value_counts.finish()), + Arc::new(self.lower_bounds.finish()), + Arc::new(self.upper_bounds.finish()), + Arc::new(self.key_metadata.finish()), + Arc::new(self.split_offsets.finish()), + Arc::new(self.equality_ids.finish()), + Arc::new(self.sort_order_ids.finish()), + ]; + + StructArray::from( + schema + .fields() + .iter() + .cloned() + .zip_eq(inner_arrays) + .collect_vec(), + ) + } +} + +/// Builds a readable metrics struct for a single column. +/// +/// For reference, see [Java][1] and [Python][2] implementations. 
+/// +/// [1]: https://github.com/apache/iceberg/blob/4a432839233f2343a9eae8255532f911f06358ef/core/src/main/java/org/apache/iceberg/MetricsUtil.java#L337 +/// [2]: https://github.com/apache/iceberg-python/blob/a051584a3684392d2db6556449eb299145d47d15/pyiceberg/table/inspect.py#L101-L110 +struct PerColumnReadableMetricsBuilder { + data_table_field_id: i32, + metadata_fields: Fields, + column_size: Int64Builder, + value_count: Int64Builder, + null_value_count: Int64Builder, + nan_value_count: Int64Builder, + lower_bound: AnyPrimitiveArrayBuilder, + upper_bound: AnyPrimitiveArrayBuilder, +} + +impl PerColumnReadableMetricsBuilder { + fn struct_type( + field_ids: &mut IncrementingFieldId, + data_type: &Type, + ) -> crate::spec::StructType { + let fields = vec![ + NestedField::optional( + field_ids.next_id(), + "column_size", + Type::Primitive(PrimitiveType::Long), + ), + NestedField::optional( + field_ids.next_id(), + "value_count", + Type::Primitive(PrimitiveType::Long), + ), + NestedField::optional( + field_ids.next_id(), + "null_value_count", + Type::Primitive(PrimitiveType::Long), + ), + NestedField::optional( + field_ids.next_id(), + "nan_value_count", + Type::Primitive(PrimitiveType::Long), + ), + NestedField::optional(field_ids.next_id(), "lower_bound", data_type.clone()), + NestedField::optional(field_ids.next_id(), "upper_bound", data_type.clone()), + ] + .into_iter() + .map(Arc::new) + .collect_vec(); + crate::spec::StructType::new(fields) + } + + fn new_for_field( + data_table_field_id: i32, + data_type: &DataType, + metadata_fields: Fields, + ) -> Self { + Self { + data_table_field_id, + metadata_fields, + column_size: Int64Builder::new(), + value_count: Int64Builder::new(), + null_value_count: Int64Builder::new(), + nan_value_count: Int64Builder::new(), + lower_bound: AnyPrimitiveArrayBuilder::new(data_type), + upper_bound: AnyPrimitiveArrayBuilder::new(data_type), + } + } + + fn append(&mut self, data_file: &DataFile) -> Result<()> { + 
self.column_size.append_option( + data_file + .column_sizes() + .get(&self.data_table_field_id) + .map(|&v| v as i64), + ); + self.value_count.append_option( + data_file + .value_counts() + .get(&self.data_table_field_id) + .map(|&v| v as i64), + ); + self.null_value_count.append_option( + data_file + .null_value_counts() + .get(&self.data_table_field_id) + .map(|&v| v as i64), + ); + self.nan_value_count.append_option( + data_file + .nan_value_counts() + .get(&self.data_table_field_id) + .map(|&v| v as i64), + ); + match data_file.lower_bounds().get(&self.data_table_field_id) { + Some(datum) => self + .lower_bound + .append_datum(get_arrow_datum(datum)?.as_ref())?, + None => self.lower_bound.append_null()?, + } + match data_file.upper_bounds().get(&self.data_table_field_id) { + Some(datum) => self + .upper_bound + .append_datum(get_arrow_datum(datum)?.as_ref())?, + None => self.upper_bound.append_null()?, + } + Ok(()) + } + + fn finish(&mut self) -> StructArray { + let inner_arrays: Vec = vec![ + Arc::new(self.column_size.finish()), + Arc::new(self.value_count.finish()), + Arc::new(self.null_value_count.finish()), + Arc::new(self.nan_value_count.finish()), + Arc::new(self.lower_bound.finish()), + Arc::new(self.upper_bound.finish()), + ]; + + StructArray::from( + self.metadata_fields + .into_iter() + .cloned() + .zip_eq(inner_arrays) + .collect::>(), + ) + } +} + +/// Build a [StructArray] with partition columns as fields and partition values as rows. +struct PartitionValuesStructBuilder { + builders: Vec, + partition_fields: Fields, +} + +impl PartitionValuesStructBuilder { + /// Construct a new builder from the combined partition columns of the table metadata. 
+ fn new(table_metadata: &TableMetadata) -> Self { + let combined_struct_type = Self::partition_type(table_metadata); + let DataType::Struct(partition_fields) = + type_to_arrow_type(&Type::Struct(combined_struct_type)).unwrap() + else { + panic!("Converted Arrow type was not struct") + }; + Self { + builders: partition_fields + .iter() + .map(|field| AnyPrimitiveArrayBuilder::new(field.data_type())) + .collect(), + partition_fields, + } + } + + /// Builds a unified partition type considering all specs in the table. + /// + /// Based on Iceberg Java's [`Partitioning#partitionType`][1]. + /// + /// [1]: https://github.com/apache/iceberg/blob/7e0cd3fa1e51d3c80f6c8cff23a03dca86f942fa/core/src/main/java/org/apache/iceberg/Partitioning.java#L240 + fn partition_type(table_metadata: &TableMetadata) -> crate::spec::StructType { + Self::build_partition_projection_type( + table_metadata.current_schema(), + table_metadata.partition_specs_iter(), + Self::all_fields_ids(table_metadata.partition_specs_iter()), + ) + } + + /// Based on Iceberg Java's [`Partitioning#buildPartitionProjectionType`][1] with the difference + /// that we pass along the [Schema] to map [PartitionField] to the current type. 
+ // + /// [1]: https://github.com/apache/iceberg/blob/7e0cd3fa1e51d3c80f6c8cff23a03dca86f942fa/core/src/main/java/org/apache/iceberg/Partitioning.java#L255 + fn build_partition_projection_type<'a>( + schema: &crate::spec::Schema, + specs: impl Iterator, + projected_field_ids: HashSet, + ) -> crate::spec::StructType { + let mut field_map: HashMap = HashMap::new(); + let mut type_map: HashMap = HashMap::new(); + let mut name_map: HashMap = HashMap::new(); + + // Sort specs by ID in descending order to get latest field names + let sorted_specs = specs + .sorted_by_key(|spec| spec.spec_id()) + .rev() + .collect_vec(); + + for spec in sorted_specs { + for field in spec.fields() { + let field_id = field.field_id; + + if !projected_field_ids.contains(&field_id) { + continue; + } + + let partition_type = spec.partition_type(schema).unwrap(); + let struct_field = partition_type.field_by_id(field_id).unwrap(); + let existing_field = field_map.get(&field_id); + + match existing_field { + None => { + field_map.insert(field_id, field.clone()); + type_map.insert(field_id, struct_field.field_type.as_ref().clone()); + name_map.insert(field_id, struct_field.name.clone()); + } + Some(existing_field) => { + // verify the fields are compatible as they may conflict in v1 tables + if !Self::equivalent_ignoring_name(existing_field, field) { + panic!( + "Conflicting partition fields: ['{existing_field:?}', '{field:?}']", + ); + } + + // use the correct type for dropped partitions in v1 tables + if Self::is_void_transform(existing_field) + && !Self::is_void_transform(field) + { + field_map.insert(field_id, field.clone()); + type_map.insert(field_id, struct_field.field_type.as_ref().clone()); + } + } + } + } + } + + let sorted_struct_fields = field_map + .into_keys() + .sorted() + .map(|field_id| { + NestedField::optional(field_id, &name_map[&field_id], type_map[&field_id].clone()) + }) + .map(Arc::new) + .collect_vec(); + + crate::spec::StructType::new(sorted_struct_fields) + } + + fn 
is_void_transform(field: &PartitionField) -> bool { + field.transform == Transform::Void + } + + fn equivalent_ignoring_name(field: &PartitionField, another_field: &PartitionField) -> bool { + field.field_id == another_field.field_id + && field.source_id == another_field.source_id + && Self::compatible_transforms(field.transform, another_field.transform) + } + + fn compatible_transforms(t1: Transform, t2: Transform) -> bool { + t1 == t2 || t1 == Transform::Void || t2 == Transform::Void + } + + // collects IDs of all partition field used across specs + fn all_fields_ids<'a>(specs: impl Iterator) -> HashSet { + specs + .flat_map(|spec| spec.fields()) + .map(|partition| partition.field_id) + .collect() + } + + fn append( + &mut self, + partition_fields: &[PartitionField], + partition_values: &Struct, + ) -> Result<()> { + for (field, value) in partition_fields.iter().zip_eq(partition_values.iter()) { + let index = self.find_field(&field.name)?; + + match value { + Some(literal) => self.builders[index].append_literal(literal)?, + None => self.builders[index].append_null()?, + } + } + Ok(()) + } + + fn finish(&mut self) -> StructArray { + let arrays: Vec = self + .builders + .iter_mut() + .map::(|builder| Arc::new(builder.finish())) + .collect(); + StructArray::from( + self.partition_fields + .iter() + .cloned() + .zip_eq(arrays) + .collect::>(), + ) + } + + fn find_field(&self, name: &str) -> Result { + match self.partition_fields.find(name) { + Some((index, _)) => Ok(index), + None => Err(Error::new( + ErrorKind::Unexpected, + format!("Field not found: {}", name), + )), + } + } +} + +struct ReadableMetricsStructBuilder { + column_builders: Vec, + column_fields: Fields, +} + +impl ReadableMetricsStructBuilder { + /// Calculates a dynamic schema for `readable_metrics` to add to metadata tables. The type + /// will be a struct containing all primitive columns in the data table. + /// + /// We take the table's schema to get the set of fields in the table. 
We also take the manifest + /// entry schema to get the highest field ID in the entries metadata table to know which field + /// ID to begin with. + fn readable_metrics_schema( + data_table_schema: &crate::spec::Schema, + manifest_entry_schema: &crate::spec::Schema, + ) -> crate::spec::Schema { + let mut field_ids = IncrementingFieldId(manifest_entry_schema.highest_field_id() + 1); + let mut per_column_readable_metrics_fields: Vec = Vec::new(); + + for data_table_field in Self::sorted_primitive_fields(data_table_schema) { + per_column_readable_metrics_fields.push(Arc::new(NestedField::required( + field_ids.next_id(), + &data_table_field.name, + Type::Struct(PerColumnReadableMetricsBuilder::struct_type( + &mut field_ids, + &data_table_field.field_type, + )), + ))); + } + + crate::spec::Schema::builder() + .with_fields(vec![Arc::new(NestedField::optional( + field_ids.next_id(), + "readable_metrics", + Type::Struct(crate::spec::StructType::new( + per_column_readable_metrics_fields, + )), + ))]) + .build() + .unwrap() + } + + fn sorted_primitive_fields(data_table_schema: &crate::spec::Schema) -> Vec { + let mut fields = data_table_schema + .as_struct() + .fields() + .iter() + .filter(|field| field.field_type.is_primitive()) + .cloned() + .collect_vec(); + fields.sort_by_key(|field| field.name.clone()); + fields + } + + fn new( + data_table_schema: &crate::spec::Schema, + readable_metrics_schema: &crate::spec::StructType, + ) -> ReadableMetricsStructBuilder { + let DataType::Struct(column_fields) = + type_to_arrow_type(&Type::Struct(readable_metrics_schema.clone())).unwrap() + else { + panic!("Converted Arrow type was not struct") + }; + let column_builders = readable_metrics_schema + .fields() + .iter() + .zip_eq(Self::sorted_primitive_fields(data_table_schema)) + .map(|(readable_metrics_field, data_field)| { + let DataType::Struct(fields) = + type_to_arrow_type(&readable_metrics_field.field_type).unwrap() + else { + panic!("Readable metrics field was not a struct") + 
}; + let arrow_type = type_to_arrow_type(&data_field.field_type).unwrap(); + PerColumnReadableMetricsBuilder::new_for_field(data_field.id, &arrow_type, fields) + }) + .collect_vec(); + + Self { + column_fields, + column_builders, + } + } + + fn append(&mut self, data_file: &DataFile) -> Result<()> { + for column_builder in &mut self.column_builders { + column_builder.append(data_file)?; + } + Ok(()) + } + + fn finish(&mut self) -> StructArray { + let arrays: Vec = self + .column_builders + .iter_mut() + .map::(|builder| Arc::new(builder.finish())) + .collect(); + + let inner_arrays: Vec<(FieldRef, ArrayRef)> = self + .column_fields + .into_iter() + .cloned() + .zip_eq(arrays) + .collect_vec(); + + StructArray::from(inner_arrays) + } +} + +/// Helper to serve increment field ids. +struct IncrementingFieldId(i32); + +impl IncrementingFieldId { + fn next_id(&mut self) -> i32 { + let current = self.0; + self.0 += 1; + current + } +} + +fn key_field(field_id: i32, data_type: DataType) -> FieldRef { + Arc::new( + Field::new("key", data_type, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + field_id.to_string(), + )])), + ) +} + +// All value fields are required. +fn value_field(field_id: i32, data_type: DataType) -> FieldRef { + Arc::new( + Field::new("value", data_type, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + field_id.to_string(), + )])), + ) +} + +// All element fields are required. 
+fn list_field(field_id: i32, data_type: DataType) -> FieldRef { + Arc::new( + Field::new("element", data_type, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + field_id.to_string(), + )])), + ) +} + + +#[cfg(test)] +mod tests { + use expect_test::expect; + + use crate::inspect::metadata_table::tests::check_record_batches; + use crate::scan::tests::TableTestFixture; + + #[tokio::test] + async fn test_entries_table() { + let mut fixture = TableTestFixture::new(); + fixture.setup_manifest_files().await; + let table = fixture.table; + let inspect = table.inspect(); + let entries_table = inspect.entries(); + + let batch_stream = entries_table.scan().await.unwrap(); + + check_record_batches( + batch_stream, + expect![[r#" + Field { name: "status", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "0"} }, + Field { name: "snapshot_id", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1"} }, + Field { name: "sequence_number", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "3"} }, + Field { name: "file_sequence_number", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "4"} }, + Field { name: "data_file", data_type: Struct([Field { name: "content", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "134"} }, Field { name: "file_path", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "100"} }, Field { name: "file_format", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "101"} }, Field { name: "partition", data_type: Struct([Field { name: "x", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1000"} }]), nullable: false, dict_id: 0, 
dict_is_ordered: false, metadata: {"PARQUET:field_id": "102"} }, Field { name: "record_count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "103"} }, Field { name: "file_size_in_bytes", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "104"} }, Field { name: "column_sizes", data_type: Map(Field { name: "key_value", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "117"} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "118"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "108"} }, Field { name: "value_counts", data_type: Map(Field { name: "key_value", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "119"} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "120"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "109"} }, Field { name: "null_value_counts", data_type: Map(Field { name: "key_value", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "121"} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "122"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "110"} }, Field { name: "nan_value_counts", 
data_type: Map(Field { name: "key_value", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "138"} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "139"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "137"} }, Field { name: "lower_bounds", data_type: Map(Field { name: "key_value", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "126"} }, Field { name: "value", data_type: LargeBinary, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "127"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "125"} }, Field { name: "upper_bounds", data_type: Map(Field { name: "key_value", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "129"} }, Field { name: "value", data_type: LargeBinary, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "130"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "128"} }, Field { name: "key_metadata", data_type: LargeBinary, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "131"} }, Field { name: "split_offsets", data_type: List(Field { name: "element", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "133"} }), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": 
"132"} }, Field { name: "equality_ids", data_type: List(Field { name: "element", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "136"} }), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "135"} }, Field { name: "sort_order_id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "140"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "2"} }, + Field { name: "readable_metrics", data_type: Struct([Field { name: "a", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1002"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1003"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1004"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1005"} }, Field { name: "lower_bound", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1006"} }, Field { name: "upper_bound", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1007"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1001"} }, Field { name: "binary", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1009"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1010"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, 
metadata: {"PARQUET:field_id": "1011"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1012"} }, Field { name: "lower_bound", data_type: LargeBinary, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1013"} }, Field { name: "upper_bound", data_type: LargeBinary, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1014"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1008"} }, Field { name: "bool", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1016"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1017"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1018"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1019"} }, Field { name: "lower_bound", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1020"} }, Field { name: "upper_bound", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1021"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1015"} }, Field { name: "date", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1023"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1024"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: {"PARQUET:field_id": "1025"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1026"} }, Field { name: "lower_bound", data_type: Date32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1027"} }, Field { name: "upper_bound", data_type: Date32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1028"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1022"} }, Field { name: "dbl", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1030"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1031"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1032"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1033"} }, Field { name: "lower_bound", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1034"} }, Field { name: "upper_bound", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1035"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1029"} }, Field { name: "decimal", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1037"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1038"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: {"PARQUET:field_id": "1039"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1040"} }, Field { name: "lower_bound", data_type: Decimal128(3, 2), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1041"} }, Field { name: "upper_bound", data_type: Decimal128(3, 2), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1042"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1036"} }, Field { name: "float", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1044"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1045"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1046"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1047"} }, Field { name: "lower_bound", data_type: Float32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1048"} }, Field { name: "upper_bound", data_type: Float32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1049"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1043"} }, Field { name: "i32", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1051"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1052"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: {"PARQUET:field_id": "1053"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1054"} }, Field { name: "lower_bound", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1055"} }, Field { name: "upper_bound", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1056"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1050"} }, Field { name: "i64", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1058"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1059"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1060"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1061"} }, Field { name: "lower_bound", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1062"} }, Field { name: "upper_bound", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1063"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1057"} }, Field { name: "timestamp", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1065"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1066"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: {"PARQUET:field_id": "1067"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1068"} }, Field { name: "lower_bound", data_type: Timestamp(Microsecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1069"} }, Field { name: "upper_bound", data_type: Timestamp(Microsecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1070"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1064"} }, Field { name: "timestampns", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1072"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1073"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1074"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1075"} }, Field { name: "lower_bound", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1076"} }, Field { name: "upper_bound", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1077"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1071"} }, Field { name: "timestamptz", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1079"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": 
"1080"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1081"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1082"} }, Field { name: "lower_bound", data_type: Timestamp(Microsecond, Some("+00:00")), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1083"} }, Field { name: "upper_bound", data_type: Timestamp(Microsecond, Some("+00:00")), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1084"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1078"} }, Field { name: "timestamptzns", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1086"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1087"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1088"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1089"} }, Field { name: "lower_bound", data_type: Timestamp(Nanosecond, Some("+00:00")), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1090"} }, Field { name: "upper_bound", data_type: Timestamp(Nanosecond, Some("+00:00")), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1091"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1085"} }, Field { name: "x", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1093"} }, Field { 
name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1094"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1095"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1096"} }, Field { name: "lower_bound", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1097"} }, Field { name: "upper_bound", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1098"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1092"} }, Field { name: "y", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1100"} }, Field { name: "value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1101"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1102"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1103"} }, Field { name: "lower_bound", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1104"} }, Field { name: "upper_bound", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1105"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1099"} }, Field { name: "z", data_type: Struct([Field { name: "column_size", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1107"} }, Field { name: 
"value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1108"} }, Field { name: "null_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1109"} }, Field { name: "nan_value_count", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1110"} }, Field { name: "lower_bound", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1111"} }, Field { name: "upper_bound", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1112"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1106"} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1113"} }"#]], + expect![[r| status | snapshot_id | sequence_number | file_sequence_number | data_file | readable_metrics || 1 | 3055729675574597004 | 1 | 1 | {content: 0, file_format: PARQUET, partition: {x: 100}, record_count: 1, file_size_in_bytes: 100, column_sizes: {1: 1, 2: 1}, value_counts: {1: 2, 2: 2}, null_value_counts: {1: 3, 2: 3}, nan_value_counts: {1: 4, 2: 4}, lower_bounds: {1: 0100000000000000, 2: 0200000000000000, 3: 0300000000000000, 4: 417061636865, 5: 0000000000005940, 6: 64000000, 7: 6400000000000000, 8: 00, 9: 0000c842, 11: 00000000, 12: 0000000000000000, 13: 0000000000000000}, upper_bounds: {1: 0100000000000000, 2: 0500000000000000, 3: 0400000000000000, 4: 49636562657267, 5: 0000000000006940, 6: c8000000, 7: c800000000000000, 8: 01, 9: 00004843, 11: 00000000, 12: 0000000000000000, 13: 0000000000000000}, key_metadata: , split_offsets: [], equality_ids: [], sort_order_id: } | {a: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: Apache, upper_bound: Iceberg}, binary: {column_size: , value_count: , null_value_count: , nan_value_count: , 
lower_bound: , upper_bound: }, bool: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: false, upper_bound: true}, date: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: 1970-01-01, upper_bound: 1970-01-01}, dbl: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: 100.0, upper_bound: 200.0}, decimal: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, float: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: 100.0, upper_bound: 200.0}, i32: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: 100, upper_bound: 200}, i64: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: 100, upper_bound: 200}, timestamp: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: 1970-01-01T00:00:00, upper_bound: 1970-01-01T00:00:00}, timestampns: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, timestamptz: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: 1970-01-01T00:00:00Z, upper_bound: 1970-01-01T00:00:00Z}, timestamptzns: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, x: {column_size: 1, value_count: 2, null_value_count: 3, nan_value_count: 4, lower_bound: 1, upper_bound: 1}, y: {column_size: 1, value_count: 2, null_value_count: 3, nan_value_count: 4, lower_bound: 2, upper_bound: 5}, z: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: 3, upper_bound: 4}} | + | 2 | 3055729675574597004 | 0 | 0 | {content: 0, file_format: PARQUET, partition: {x: 200}, record_count: 1, file_size_in_bytes: 100, column_sizes: {}, value_counts: {}, null_value_counts: {}, nan_value_counts: {}, lower_bounds: {}, upper_bounds: {}, key_metadata: , split_offsets: [], 
equality_ids: [], sort_order_id: } | {a: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, binary: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, bool: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, date: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, dbl: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, decimal: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, float: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, i32: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, i64: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, timestamp: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, timestampns: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, timestamptz: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, timestamptzns: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, x: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, y: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, z: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }} | + | 0 | 3051729675574597004 | 0 | 0 | {content: 0, file_format: PARQUET, partition: {x: 300}, record_count: 1, file_size_in_bytes: 100, column_sizes: {}, value_counts: {}, null_value_counts: {}, nan_value_counts: {}, 
lower_bounds: {}, upper_bounds: {}, key_metadata: , split_offsets: [], equality_ids: [], sort_order_id: } | {a: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, binary: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, bool: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, date: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, dbl: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, decimal: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, float: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, i32: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, i64: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, timestamp: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, timestampns: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, timestamptz: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, timestamptzns: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, x: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, y: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }, z: {column_size: , value_count: , null_value_count: , nan_value_count: , lower_bound: , upper_bound: }} |file_path"], + None, + ) + .await; + } +} diff --git a/crates/iceberg/src/inspect/manifests.rs b/crates/iceberg/src/inspect/manifests.rs index 
e94e48a45..8f5da7f8f 100644 --- a/crates/iceberg/src/inspect/manifests.rs +++ b/crates/iceberg/src/inspect/manifests.rs @@ -285,75 +285,13 @@ mod tests { Field { name: "deleted_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "17"} }, Field { name: "partition_summaries", data_type: List(Field { name: "item", data_type: Struct([Field { name: "contains_null", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "10"} }, Field { name: "contains_nan", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "11"} }, Field { name: "lower_bound", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "12"} }, Field { name: "upper_bound", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "13"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "9"} }), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "8"} }"#]], expect![[r#" - content: PrimitiveArray - [ - 0, - ], - path: (skipped), - length: (skipped), - partition_spec_id: PrimitiveArray - [ - 0, - ], - added_snapshot_id: PrimitiveArray - [ - 3055729675574597004, - ], - added_data_files_count: PrimitiveArray - [ - 1, - ], - existing_data_files_count: PrimitiveArray - [ - 1, - ], - deleted_data_files_count: PrimitiveArray - [ - 1, - ], - added_delete_files_count: PrimitiveArray - [ - 1, - ], - existing_delete_files_count: PrimitiveArray - [ - 1, - ], - deleted_delete_files_count: PrimitiveArray - [ - 1, - ], - partition_summaries: ListArray - [ - StructArray - -- validity: - [ - valid, - ] - [ - -- child 0: "contains_null" (Boolean) - BooleanArray - [ - false, - ] - -- child 1: "contains_nan" (Boolean) - BooleanArray - [ - false, - ] - -- child 2: "lower_bound" (Utf8) - StringArray - [ - 
"100", - ] - -- child 3: "upper_bound" (Utf8) - StringArray - [ - "300", - ] - ], - ]"#]], + +---------+-------------------+---------------------+------------------------+---------------------------+--------------------------+--------------------------+-----------------------------+----------------------------+-----------------------------------------------------------------------------------+ + | content | partition_spec_id | added_snapshot_id | added_data_files_count | existing_data_files_count | deleted_data_files_count | added_delete_files_count | existing_delete_files_count | deleted_delete_files_count | partition_summaries | + +---------+-------------------+---------------------+------------------------+---------------------------+--------------------------+--------------------------+-----------------------------+----------------------------+-----------------------------------------------------------------------------------+ + | 0 | 0 | 3055729675574597004 | 1 | 1 | 1 | 1 | 1 | 1 | [{contains_null: false, contains_nan: false, lower_bound: 100, upper_bound: 300}] | + +---------+-------------------+---------------------+------------------------+---------------------------+--------------------------+--------------------------+-----------------------------+----------------------------+-----------------------------------------------------------------------------------+"#]], &["path", "length"], + &[], Some("path"), ).await; } diff --git a/crates/iceberg/src/inspect/metadata_table.rs b/crates/iceberg/src/inspect/metadata_table.rs index 75dbc7472..20d75caa9 100644 --- a/crates/iceberg/src/inspect/metadata_table.rs +++ b/crates/iceberg/src/inspect/metadata_table.rs @@ -16,6 +16,7 @@ // under the License. use super::{ManifestsTable, SnapshotsTable}; +use crate::inspect::entries::EntriesTable; use crate::table::Table; /// Metadata table is used to inspect a table's history, snapshots, and other metadata as a table. 
@@ -33,6 +34,11 @@ impl<'a> MetadataTable<'a> { Self(table) } + /// Returns the current manifest file's entries. + pub fn entries(&self) -> EntriesTable { + EntriesTable::new(self.0) + } + /// Get the snapshots table. pub fn snapshots(&self) -> SnapshotsTable { SnapshotsTable::new(self.0) @@ -46,6 +52,11 @@ impl<'a> MetadataTable<'a> { #[cfg(test)] pub mod tests { + use std::sync::Arc; + + use arrow_array::{ArrayRef, RecordBatch, StructArray}; + use arrow_cast::pretty::pretty_format_batches; + use arrow_schema::{DataType, Field, FieldRef, Schema as ArrowSchema}; use expect_test::Expect; use futures::TryStreamExt; use itertools::Itertools; @@ -59,12 +70,14 @@ pub mod tests { /// or use rust-analyzer (see [video](https://github.com/rust-analyzer/expect-test)). /// Check the doc of [`expect_test`] for more details. /// - `ignore_check_columns`: Some columns are not stable, so we can skip them. + /// - `ignore_check_struct_fields`: Same as `ignore_check_columns` but for (top-level) struct fields. /// - `sort_column`: The order of the data might be non-deterministic, so we can sort it by a column. 
pub async fn check_record_batches( batch_stream: ArrowRecordBatchStream, expected_schema: Expect, expected_data: Expect, ignore_check_columns: &[&str], + ignore_check_struct_fields: &[&str], sort_column: Option<&str>, ) { let record_batches = batch_stream.try_collect::>().await.unwrap(); @@ -85,25 +98,48 @@ pub mod tests { .collect_vec(); } + // Filter columns + let (fields, columns): (Vec<_>, Vec<_>) = record_batch + .schema() + .fields + .iter() + .zip_eq(columns) + // Filter ignored columns + .filter(|(field, _)| !ignore_check_columns.contains(&field.name().as_str())) + // For struct fields, filter ignored struct fields + .map(|(field, column)| match field.data_type() { + DataType::Struct(fields) => { + let struct_array = column.as_any().downcast_ref::().unwrap(); + let filtered: Vec<(FieldRef, ArrayRef)> = fields + .iter() + .zip_eq(struct_array.columns().iter()) + .filter(|(f, _)| !ignore_check_struct_fields.contains(&f.name().as_str())) + .map(|(f, c)| (f.clone(), c.clone())) + .collect_vec(); + let filtered_struct_type: DataType = DataType::Struct( + filtered.iter().map(|(f, _)| f.clone()).collect_vec().into(), + ); + ( + Field::new(field.name(), filtered_struct_type, field.is_nullable()).into(), + Arc::new(StructArray::from(filtered)) as ArrayRef, + ) + } + _ => (field.clone(), column), + }) + .unzip(); + expected_schema.assert_eq(&format!( "{}", record_batch.schema().fields().iter().format(",\n") )); - expected_data.assert_eq(&format!( - "{}", - record_batch - .schema() - .fields() - .iter() - .zip_eq(columns) - .map(|(field, column)| { - if ignore_check_columns.contains(&field.name().as_str()) { - format!("{}: (skipped)", field.name()) - } else { - format!("{}: {:?}", field.name(), column) - } - }) - .format(",\n") - )); + expected_data.assert_eq( + &pretty_format_batches(&[RecordBatch::try_new( + Arc::new(ArrowSchema::new(fields)), + columns, + ) + .unwrap()]) + .unwrap() + .to_string(), + ); } } diff --git a/crates/iceberg/src/inspect/mod.rs 
b/crates/iceberg/src/inspect/mod.rs index b64420ea1..02a57e664 100644 --- a/crates/iceberg/src/inspect/mod.rs +++ b/crates/iceberg/src/inspect/mod.rs @@ -17,10 +17,12 @@ //! Metadata table APIs. +mod entries; mod manifests; mod metadata_table; mod snapshots; +pub use entries::EntriesTable; pub use manifests::ManifestsTable; pub use metadata_table::*; pub use snapshots::SnapshotsTable; diff --git a/crates/iceberg/src/inspect/snapshots.rs b/crates/iceberg/src/inspect/snapshots.rs index 1ee89963d..f5703b043 100644 --- a/crates/iceberg/src/inspect/snapshots.rs +++ b/crates/iceberg/src/inspect/snapshots.rs @@ -130,59 +130,14 @@ mod tests { Field { name: "manifest_list", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "summary", data_type: Map(Field { name: "entries", data_type: Struct([Field { name: "keys", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "values", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }"#]], expect![[r#" - committed_at: PrimitiveArray - [ - 2018-01-04T21:22:35.770+00:00, - 2019-04-12T20:29:15.770+00:00, - ], - snapshot_id: PrimitiveArray - [ - 3051729675574597004, - 3055729675574597004, - ], - parent_id: PrimitiveArray - [ - null, - 3051729675574597004, - ], - operation: StringArray - [ - "append", - "append", - ], - manifest_list: (skipped), - summary: MapArray - [ - StructArray - -- validity: - [ - ] - [ - -- child 0: "keys" (Utf8) - StringArray - [ - ] - -- child 1: "values" (Utf8) - StringArray - [ - ] - ], - StructArray - -- validity: - [ - ] - [ - -- child 0: "keys" (Utf8) - StringArray - [ - ] - -- child 1: "values" (Utf8) - StringArray - [ - ] - ], - ]"#]], + +--------------------------+---------------------+---------------------+-----------+---------+ + 
| committed_at | snapshot_id | parent_id | operation | summary | + +--------------------------+---------------------+---------------------+-----------+---------+ + | 2018-01-04T21:22:35.770Z | 3051729675574597004 | | append | {} | + | 2019-04-12T20:29:15.770Z | 3055729675574597004 | 3051729675574597004 | append | {} | + +--------------------------+---------------------+---------------------+-----------+---------+"#]], &["manifest_list"], + &[], Some("committed_at"), ).await; } diff --git a/crates/iceberg/src/scan.rs b/crates/iceberg/src/scan.rs index 30e45a074..7a5ae32b1 100644 --- a/crates/iceberg/src/scan.rs +++ b/crates/iceberg/src/scan.rs @@ -1123,8 +1123,11 @@ pub mod tests { use std::sync::Arc; use arrow_array::{ - ArrayRef, BooleanArray, Float64Array, Int32Array, Int64Array, RecordBatch, StringArray, + ArrayRef, BooleanArray, Date32Array, Decimal128Array, Float32Array, Float64Array, + Int32Array, Int64Array, LargeBinaryArray, RecordBatch, StringArray, + TimestampMicrosecondArray, TimestampNanosecondArray, }; + use arrow_schema::{DataType, TimeUnit}; use futures::{stream, TryStreamExt}; use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; use parquet::basic::Compression; @@ -1235,6 +1238,50 @@ pub mod tests { .record_count(1) .partition(Struct::from_iter([Some(Literal::long(100))])) .key_metadata(None) + // Note: + // The bounds below need to agree with the test data written below + // into the Parquet file. If not, tests that rely on filter scans + // fail because of wrong bounds. 
+ .lower_bounds(HashMap::from([ + (1, Datum::long(1)), + (2, Datum::long(2)), + (3, Datum::long(3)), + (4, Datum::string("Apache")), + (5, Datum::double(100)), + (6, Datum::int(100)), + (7, Datum::long(100)), + (8, Datum::bool(false)), + (9, Datum::float(100.0)), + // decimal values are not supported by schema::get_arrow_datum + // (10, Datum::decimal(Decimal(123, 2))), + (11, Datum::date(0)), + (12, Datum::timestamp_micros(0)), + (13, Datum::timestamptz_micros(0)), + // ns timestamps, uuid, fixed, binary are currently not + // supported in schema::get_arrow_datum + ])) + .upper_bounds(HashMap::from([ + (1, Datum::long(1)), + (2, Datum::long(5)), + (3, Datum::long(4)), + (4, Datum::string("Iceberg")), + (5, Datum::double(200)), + (6, Datum::int(200)), + (7, Datum::long(200)), + (8, Datum::bool(true)), + (9, Datum::float(200.0)), + // decimal values are not supported by schema::get_arrow_datum + // (10, Datum::decimal(Decimal(123, 2))), + (11, Datum::date(0)), + (12, Datum::timestamp_micros(0)), + (13, Datum::timestamptz_micros(0)), + // ns timestamps, uuid, fixed, binary are currently not + // supported in schema::get_arrow_datum + ])) + .column_sizes(HashMap::from([(1, 1u64), (2, 1u64)])) + .value_counts(HashMap::from([(1, 2u64), (2, 2u64)])) + .null_value_counts(HashMap::from([(1, 3u64), (2, 3u64)])) + .nan_value_counts(HashMap::from([(1, 4u64), (2, 4u64)])) .build() .unwrap(), ) @@ -1343,6 +1390,69 @@ pub mod tests { PARQUET_FIELD_ID_META_KEY.to_string(), "8".to_string(), )])), + arrow_schema::Field::new("float", arrow_schema::DataType::Float32, false) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "9".to_string(), + )])), + arrow_schema::Field::new( + "decimal", + arrow_schema::DataType::Decimal128(3, 2), + false, + ) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "10".to_string(), + )])), + arrow_schema::Field::new("date", arrow_schema::DataType::Date32, false) + .with_metadata(HashMap::from([( + 
PARQUET_FIELD_ID_META_KEY.to_string(), + "11".to_string(), + )])), + arrow_schema::Field::new( + "timestamp", + arrow_schema::DataType::Timestamp(TimeUnit::Microsecond, None), + false, + ) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "12".to_string(), + )])), + arrow_schema::Field::new( + "timestamptz", + arrow_schema::DataType::Timestamp( + TimeUnit::Microsecond, + Some("UTC".into()), + ), + false, + ) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "13".to_string(), + )])), + arrow_schema::Field::new( + "timestampns", + arrow_schema::DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "14".to_string(), + )])), + arrow_schema::Field::new( + "timestamptzns", + arrow_schema::DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + false, + ) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "15".to_string(), + )])), + arrow_schema::Field::new("binary", arrow_schema::DataType::LargeBinary, false) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "16".to_string(), + )])), ]; Arc::new(arrow_schema::Schema::new(fields)) }; @@ -1392,8 +1502,54 @@ pub mod tests { let values: BooleanArray = values.into(); let col8 = Arc::new(values) as ArrayRef; + // float: + let mut values = vec![100.0f32; 512]; + values.append(vec![150.0f32; 12].as_mut()); + values.append(vec![200.0f32; 500].as_mut()); + let col9 = Arc::new(Float32Array::from_iter_values(values)) as ArrayRef; + + // decimal: + let values = vec![123i128; 1024]; + let col10 = Arc::new( + Decimal128Array::from_iter_values(values) + .with_data_type(DataType::Decimal128(3, 2)), + ); + + // date: + let values = vec![0i32; 1024]; + let col11 = Arc::new(Date32Array::from_iter_values(values)); + + // timestamp: + let values = vec![0i64; 1024]; + let col12 = Arc::new(TimestampMicrosecondArray::from_iter_values(values)); + + 
// timestamptz: + let values = vec![0i64; 1024]; + let col13 = Arc::new( + TimestampMicrosecondArray::from_iter_values(values).with_data_type( + DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())), + ), + ); + + // timestampns: + let values = vec![0i64; 1024]; + let col14 = Arc::new(TimestampNanosecondArray::from_iter_values(values)); + + // timestamptzns: + let values = vec![0i64; 1024]; + let col15 = Arc::new( + TimestampNanosecondArray::from_iter_values(values).with_data_type( + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + ), + ); + + // binary: + let values = vec![[0u8; 8]; 1024]; + let col16 = Arc::new(LargeBinaryArray::from_iter_values(values)); + let to_write = RecordBatch::try_new(schema.clone(), vec![ - col1, col2, col3, col4, col5, col6, col7, col8, + col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, + col14, col15, col16, ]) .unwrap(); diff --git a/crates/iceberg/src/spec/datatypes.rs b/crates/iceberg/src/spec/datatypes.rs index c806d16ea..28eee93aa 100644 --- a/crates/iceberg/src/spec/datatypes.rs +++ b/crates/iceberg/src/spec/datatypes.rs @@ -632,6 +632,46 @@ impl NestedField { } } + /// Construct a required struct field using a builder. + pub fn required_struct(id: i32, name: impl ToString) -> NestedStructFieldBuilder { + Self::r#struct(id, name, true) + } + + /// Construct an optional struct field using a builder. + pub fn optional_struct(id: i32, name: impl ToString) -> NestedStructFieldBuilder { + Self::r#struct(id, name, false) + } + + /// Construct a struct field using a builder. + fn r#struct(id: i32, name: impl ToString, required: bool) -> NestedStructFieldBuilder { + NestedStructFieldBuilder { + id, + name: name.to_string(), + required, + fields: vec![], + } + } + + /// Construct a required list field using a builder. 
+ pub fn required_list(id: i32, name: impl ToString) -> NestedListFieldBuilder { + Self::list(id, name, true) + } + + /// Construct an optional list field using a builder. + pub fn optional_list(id: i32, name: impl ToString) -> NestedListFieldBuilder { + Self::list(id, name, false) + } + + /// Construct a list field using a builder. + fn list(id: i32, name: impl ToString, required: bool) -> NestedListFieldBuilder { + NestedListFieldBuilder { + id, + name: name.to_string(), + required, + element_field: None, + } + } + /// Construct list type's element field. pub fn list_element(id: i32, field_type: Type, required: bool) -> Self { if required { @@ -641,6 +681,27 @@ impl NestedField { } } + /// Construct a required map field using a builder. + pub fn required_map(id: i32, name: impl ToString) -> NestedMapFieldBuilder { + Self::map(id, name, true) + } + + /// Construct an optional map field using a builder. + pub fn optional_map(id: i32, name: impl ToString) -> NestedMapFieldBuilder { + Self::map(id, name, false) + } + + /// Construct a map field using a builder. + fn map(id: i32, name: impl ToString, required: bool) -> NestedMapFieldBuilder { + NestedMapFieldBuilder { + id, + name: name.to_string(), + required, + key: None, + value: None, + } + } + /// Construct map type's key field. pub fn map_key_element(id: i32, field_type: Type) -> Self { Self::required(id, MAP_KEY_FIELD_NAME, field_type) @@ -680,6 +741,110 @@ impl NestedField { } } +/// Builder for struct type. +pub struct NestedStructFieldBuilder { + id: i32, + name: String, + required: bool, + fields: Vec, +} + +impl NestedStructFieldBuilder { + /// Add a required field to the struct. + pub fn required(mut self, id: i32, name: impl ToString, field_type: Type) -> Self { + self.fields.push(Arc::new(NestedField::required( + id, + name.to_string(), + field_type, + ))); + self + } + + /// Add an optional field to the struct. 
+ pub fn optional(mut self, id: i32, name: impl ToString, field_type: Type) -> Self { + self.fields.push(Arc::new(NestedField::optional( + id, + name.to_string(), + field_type, + ))); + self + } + + /// Build a [NestedField] of the struct type. + pub fn build(self) -> NestedField { + NestedField::new( + self.id, + self.name, + Type::Struct(StructType::new(self.fields)), + self.required, + ) + } +} + +/// Builder for list type. +pub struct NestedListFieldBuilder { + id: i32, + name: String, + required: bool, + element_field: Option, +} + +impl NestedListFieldBuilder { + /// Set the element field of the list. + pub fn element_field(mut self, id: i32, field_type: Type, required: bool) -> Self { + self.element_field = Some(Arc::new(NestedField::list_element( + id, field_type, required, + ))); + self + } + + /// Build a [NestedField] of the list type. + pub fn build(self) -> NestedField { + NestedField::new( + self.id, + self.name, + Type::List(ListType::new(self.element_field.unwrap())), + self.required, + ) + } +} + +/// Builder for map type. +#[derive(Debug)] +pub struct NestedMapFieldBuilder { + id: i32, + name: String, + required: bool, + key: Option, + value: Option, +} + +impl NestedMapFieldBuilder { + /// Set the key field of the map. + pub fn key(mut self, id: i32, field_type: Type) -> Self { + self.key = Some(Arc::new(NestedField::map_key_element(id, field_type))); + self + } + + /// Set the value field of the map. + pub fn value(mut self, id: i32, field_type: Type, required: bool) -> Self { + self.value = Some(Arc::new(NestedField::map_value_element( + id, field_type, required, + ))); + self + } + + /// Build a new [NestedField] of the map type. 
+ pub fn build(self) -> NestedField { + NestedField::new( + self.id, + self.name, + Type::Map(MapType::new(self.key.unwrap(), self.value.unwrap())), + self.required, + ) + } +} + impl fmt::Display for NestedField { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}: ", self.id)?; diff --git a/crates/iceberg/src/spec/manifest.rs b/crates/iceberg/src/spec/manifest.rs index 856339ab1..6124130cd 100644 --- a/crates/iceberg/src/spec/manifest.rs +++ b/crates/iceberg/src/spec/manifest.rs @@ -1171,6 +1171,12 @@ impl ManifestEntry { self.sequence_number } + /// File sequence number. + #[inline] + pub fn file_sequence_number(&self) -> Option { + self.file_sequence_number + } + /// File size in bytes. #[inline] pub fn file_size_in_bytes(&self) -> u64 { diff --git a/crates/iceberg/src/spec/schema.rs b/crates/iceberg/src/spec/schema.rs index f290441aa..edde04382 100644 --- a/crates/iceberg/src/spec/schema.rs +++ b/crates/iceberg/src/spec/schema.rs @@ -868,6 +868,34 @@ impl PruneColumn { } } +/// Join two schemas by concatenating fields. Return [Error] if the schemas have different columns +/// with the same id. 
+pub fn join_schemas(left: &Schema, right: &Schema) -> Result { + let mut joined_fields: Vec = + left.as_struct().fields().iter().cloned().collect_vec(); + + for right_field in right.as_struct().fields() { + match left.field_by_id(right_field.id) { + None => { + joined_fields.push(right_field.clone()); + } + Some(left_field) => { + if left_field != right_field { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Schemas have different columns with the same id: {:?}, {:?}", + left_field, right_field + ), + )); + } + } + } + } + + Schema::builder().with_fields(joined_fields).build() +} + impl SchemaVisitor for PruneColumn { type T = Option; @@ -952,9 +980,9 @@ impl SchemaVisitor for PruneColumn { return Ok(Some(Type::List(list.clone()))); } else { return Err(Error::new( - ErrorKind::DataInvalid, - format!("Cannot explicitly project List or Map types, List element {} of type {} was selected", list.element_field.id, list.element_field.field_type), - )); + ErrorKind::DataInvalid, + format!("Cannot explicitly project List or Map types, List element {} of type {} was selected", list.element_field.id, list.element_field.field_type), + )); } } else if let Some(result) = value { Ok(Some(Type::List(PruneColumn::project_list(list, result)?))) @@ -983,9 +1011,9 @@ impl SchemaVisitor for PruneColumn { return Ok(Some(Type::Map(map.clone()))); } else { return Err(Error::new( - ErrorKind::DataInvalid, - format!("Cannot explicitly project List or Map types, Map value {} of type {} was selected", map.value_field.id, map.value_field.field_type), - )); + ErrorKind::DataInvalid, + format!("Cannot explicitly project List or Map types, Map value {} of type {} was selected", map.value_field.id, map.value_field.field_type), + )); } } else if let Some(value_result) = value { return Ok(Some(Type::Map(PruneColumn::project_map( diff --git a/crates/iceberg/testdata/example_table_metadata_v2.json b/crates/iceberg/testdata/example_table_metadata_v2.json index 17bbd7d99..8ac937751 
100644 --- a/crates/iceberg/testdata/example_table_metadata_v2.json +++ b/crates/iceberg/testdata/example_table_metadata_v2.json @@ -25,7 +25,15 @@ {"id": 5, "name": "dbl", "required": true, "type": "double"}, {"id": 6, "name": "i32", "required": true, "type": "int"}, {"id": 7, "name": "i64", "required": true, "type": "long"}, - {"id": 8, "name": "bool", "required": true, "type": "boolean"} + {"id": 8, "name": "bool", "required": true, "type": "boolean"}, + {"id": 9, "name": "float", "required": true, "type": "float"}, + {"id": 10, "name": "decimal", "required": true, "type": "decimal(3,2)"}, + {"id": 11, "name": "date", "required": true, "type": "date"}, + {"id": 12, "name": "timestamp", "required": true, "type": "timestamp"}, + {"id": 13, "name": "timestamptz", "required": true, "type": "timestamptz"}, + {"id": 14, "name": "timestampns", "required": true, "type": "timestamp_ns"}, + {"id": 15, "name": "timestamptzns", "required": true, "type": "timestamptz_ns"}, + {"id": 16, "name": "binary", "required": true, "type": "binary"} ] } ],