Commit 61c3e3c: upmerge

andygrove committed Feb 18, 2025
2 parents 70979cb + d61bcfc

Showing 24 changed files with 179 additions and 316 deletions.
5 changes: 0 additions & 5 deletions .github/actions/setup-spark-builder/action.yaml
@@ -26,10 +26,6 @@ inputs:
description: 'The Apache Spark version (e.g., 3.4.3) to build'
required: true
default: '3.4.3'
comet-version:
description: 'The Comet version to use for Spark'
required: true
default: '0.5.0-SNAPSHOT'
runs:
using: "composite"
steps:
@@ -46,7 +42,6 @@ runs:
run: |
cd apache-spark
git apply ../dev/diffs/${{inputs.spark-version}}.diff
../mvnw -nsu -q versions:set-property -Dproperty=comet.version -DnewVersion=${{inputs.comet-version}} -DgenerateBackupPoms=false
- name: Cache Maven dependencies
uses: actions/cache@v4
1 change: 0 additions & 1 deletion .github/workflows/spark_sql_test.yml
@@ -71,7 +71,6 @@ jobs:
with:
spark-version: ${{ matrix.spark-version.full }}
spark-short-version: ${{ matrix.spark-version.short }}
comet-version: '0.6.0-SNAPSHOT' # TODO: get this from pom.xml
- name: Run Spark tests
run: |
cd apache-spark
1 change: 0 additions & 1 deletion .github/workflows/spark_sql_test_ansi.yml
@@ -69,7 +69,6 @@ jobs:
with:
spark-version: ${{ matrix.spark-version.full }}
spark-short-version: ${{ matrix.spark-version.short }}
comet-version: '0.6.0-SNAPSHOT' # TODO: get this from pom.xml
- name: Run Spark tests
run: |
cd apache-spark
5 changes: 4 additions & 1 deletion Makefile
@@ -21,6 +21,9 @@ define spark_jvm_17_extra_args
$(shell ./mvnw help:evaluate -Dexpression=extraJavaTestArgs | grep -v '\[')
endef

# Build optional Comet native features (e.g. hdfs)
FEATURES_ARG := $(shell ! [ -z $(COMET_FEATURES) ] && echo '--features=$(COMET_FEATURES)')

all: core jvm

core:
@@ -95,7 +98,7 @@ release-linux: clean
cd native && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --release
./mvnw install -Prelease -DskipTests $(PROFILES)
release:
cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
cd native && RUSTFLAGS="$(RUSTFLAGS) -Ctarget-cpu=native" cargo build --release $(FEATURES_ARG)
./mvnw install -Prelease -DskipTests $(PROFILES)
release-nogit:
cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
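The new `FEATURES_ARG` only expands when `COMET_FEATURES` is set, so the default `release` build is unchanged. A rough sketch of how the two cases behave (the `hdfs` feature name comes from the Makefile comment above; the `-Pspark-3.4` profile is just an illustrative choice):

```shell
# COMET_FEATURES unset: FEATURES_ARG expands to nothing and cargo builds
# only the default features.
make release PROFILES="-Pspark-3.4"

# COMET_FEATURES set: FEATURES_ARG expands to --features=hdfs, which the
# release target appends to `cargo build --release`.
make release PROFILES="-Pspark-3.4" COMET_FEATURES=hdfs
```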
3 changes: 2 additions & 1 deletion common/pom.xml
@@ -26,7 +26,7 @@ under the License.
<parent>
<groupId>org.apache.datafusion</groupId>
<artifactId>comet-parent-spark${spark.version.short}_${scala.binary.version}</artifactId>
<version>0.6.0-SNAPSHOT</version>
<version>0.7.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

@@ -211,6 +211,7 @@ under the License.
<includes>
<include>libcomet.dylib</include>
<include>libcomet.so</include>
<include>comet.dll</include>
</includes>
<targetPath>org/apache/comet/${platform}/${arch}</targetPath>
</resource>
2 changes: 1 addition & 1 deletion dev/diffs/3.4.3.diff
@@ -7,7 +7,7 @@ index d3544881af1..26ab186c65d 100644
<ivy.version>2.5.1</ivy.version>
<oro.version>2.0.8</oro.version>
+ <spark.version.short>3.4</spark.version.short>
+ <comet.version>0.5.0-SNAPSHOT</comet.version>
+ <comet.version>0.7.0-SNAPSHOT</comet.version>
<!--
If you changes codahale.metrics.version, you also need to change
the link to metrics.dropwizard.io in docs/monitoring.md.
2 changes: 1 addition & 1 deletion dev/diffs/3.5.1.diff
@@ -7,7 +7,7 @@ index 0f504dbee85..430ec217e59 100644
<ivy.version>2.5.1</ivy.version>
<oro.version>2.0.8</oro.version>
+ <spark.version.short>3.5</spark.version.short>
+ <comet.version>0.5.0-SNAPSHOT</comet.version>
+ <comet.version>0.7.0-SNAPSHOT</comet.version>
<!--
If you changes codahale.metrics.version, you also need to change
the link to metrics.dropwizard.io in docs/monitoring.md.
2 changes: 1 addition & 1 deletion dev/diffs/4.0.0-preview1.diff
@@ -7,7 +7,7 @@ index a4b1b2c3c9f..6a532749978 100644
<ivy.version>2.5.2</ivy.version>
<oro.version>2.0.8</oro.version>
+ <spark.version.short>4.0</spark.version.short>
+ <comet.version>0.5.0-SNAPSHOT</comet.version>
+ <comet.version>0.7.0-SNAPSHOT</comet.version>
<!--
If you change codahale.metrics.version, you also need to change
the link to metrics.dropwizard.io in docs/monitoring.md.
5 changes: 5 additions & 0 deletions dev/release/README.md
@@ -272,6 +272,11 @@ it to GitHub Container Registry at https://github.com/apache/datafusion-comet/pk

Reply to the vote thread to close the vote and announce the release.

## Update released version number in documentation

- We provide direct links to the jar files in Maven
- The Kubernetes page needs updating once the Docker image has been published to GitHub Container Registry

## Post Release Admin

Register the release with the [Apache Reporter Service](https://reporter.apache.org/addrelease.html?datafusion) using
2 changes: 1 addition & 1 deletion docs/source/contributor-guide/debugging.md
@@ -130,7 +130,7 @@ Then build the Comet as [described](https://github.com/apache/arrow-datafusion-c
Start Comet with `RUST_BACKTRACE=1`

```console
RUST_BACKTRACE=1 $SPARK_HOME/spark-shell --jars spark/target/comet-spark-spark3.4_2.12-0.6.0-SNAPSHOT.jar --conf spark.plugins=org.apache.spark.CometPlugin --conf spark.comet.enabled=true --conf spark.comet.exec.enabled=true
RUST_BACKTRACE=1 $SPARK_HOME/spark-shell --jars spark/target/comet-spark-spark3.4_2.12-0.7.0-SNAPSHOT.jar --conf spark.plugins=org.apache.spark.CometPlugin --conf spark.comet.enabled=true --conf spark.comet.exec.enabled=true
```

Get the expanded exception details
78 changes: 78 additions & 0 deletions docs/source/user-guide/datasources.md
@@ -35,3 +35,81 @@ converted into Arrow format, allowing native execution to happen after that.

Comet does not provide native JSON scan, but when `spark.comet.convert.json.enabled` is enabled, data is immediately
converted into Arrow format, allowing native execution to happen after that.

# Supported Storages

## Local
In progress

## HDFS

The Apache DataFusion Comet native reader seamlessly scans files from remote HDFS for [supported formats](#supported-spark-data-sources).

### Using experimental native DataFusion reader
Unlike the native Comet reader, the DataFusion reader fully supports nested type processing. This reader is currently experimental.

To build Comet with the native DataFusion reader and remote HDFS support, a JDK must be installed.

Example: build Comet for `spark-3.4`, providing the JDK path in `JAVA_HOME` and the JRE linker path in `RUSTFLAGS`. The linker path can vary depending on the system; typically the JRE linker is part of the installed JDK.

```shell
export JAVA_HOME="/opt/homebrew/opt/openjdk@11"
make release PROFILES="-Pspark-3.4" COMET_FEATURES=hdfs RUSTFLAGS="-L $JAVA_HOME/libexec/openjdk.jdk/Contents/Home/lib/server"
```

Start Comet with the experimental reader and HDFS support as [described](installation.md/#run-spark-shell-with-comet-enabled)
and add the following additional parameters:

```shell
--conf spark.comet.scan.impl=native_datafusion \
--conf spark.hadoop.fs.defaultFS="hdfs://namenode:9000" \
--conf spark.hadoop.dfs.client.use.datanode.hostname=true \
--conf dfs.client.use.datanode.hostname=true
```

Query a struct type from remote HDFS:
```shell
spark.read.parquet("hdfs://namenode:9000/user/data").show(false)

root
|-- id: integer (nullable = true)
|-- first_name: string (nullable = true)
|-- personal_info: struct (nullable = true)
| |-- firstName: string (nullable = true)
| |-- lastName: string (nullable = true)
| |-- ageInYears: integer (nullable = true)

25/01/30 16:50:43 INFO core/src/lib.rs: Comet native library version 0.6.0 initialized
== Physical Plan ==
* CometColumnarToRow (2)
+- CometNativeScan: (1)


(1) CometNativeScan:
Output [3]: [id#0, first_name#1, personal_info#4]
Arguments: [id#0, first_name#1, personal_info#4]

(2) CometColumnarToRow [codegen id : 1]
Input [3]: [id#0, first_name#1, personal_info#4]


25/01/30 16:50:44 INFO fs-hdfs-0.1.12/src/hdfs.rs: Connecting to Namenode (hdfs://namenode:9000)
+---+----------+-----------------+
|id |first_name|personal_info |
+---+----------+-----------------+
|2 |Jane |{Jane, Smith, 34}|
|1 |John |{John, Doe, 28} |
+---+----------+-----------------+



```

Verify that the scan type shown in the plan is `CometNativeScan`.

More on [HDFS Reader](../../../native/hdfs/README.md)

## S3
In progress
18 changes: 9 additions & 9 deletions docs/source/user-guide/installation.md
@@ -49,14 +49,14 @@ Cloud Service Providers.
Comet jar files are available in [Maven Central](https://central.sonatype.com/namespace/org.apache.datafusion) for amd64 and arm64 architectures for Linux. For Apple OSX, it
is currently necessary to build from source.

Here are the direct links for downloading the Comet jar file.
Here are the direct links for downloading the Comet 0.6.0 jar file.

- [Comet plugin for Spark 3.3 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.3_2.12/0.4.0/comet-spark-spark3.3_2.12-0.4.0.jar)
- [Comet plugin for Spark 3.3 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.3_2.13/0.4.0/comet-spark-spark3.3_2.13-0.4.0.jar)
- [Comet plugin for Spark 3.4 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.4_2.12/0.4.0/comet-spark-spark3.4_2.12-0.4.0.jar)
- [Comet plugin for Spark 3.4 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.4_2.13/0.4.0/comet-spark-spark3.4_2.13-0.4.0.jar)
- [Comet plugin for Spark 3.5 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.12/0.4.0/comet-spark-spark3.5_2.12-0.4.0.jar)
- [Comet plugin for Spark 3.5 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.13/0.4.0/comet-spark-spark3.5_2.13-0.4.0.jar)
- [Comet plugin for Spark 3.3 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.3_2.12/0.6.0/comet-spark-spark3.3_2.12-0.6.0.jar)
- [Comet plugin for Spark 3.3 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.3_2.13/0.6.0/comet-spark-spark3.3_2.13-0.6.0.jar)
- [Comet plugin for Spark 3.4 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.4_2.12/0.6.0/comet-spark-spark3.4_2.12-0.6.0.jar)
- [Comet plugin for Spark 3.4 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.4_2.13/0.6.0/comet-spark-spark3.4_2.13-0.6.0.jar)
- [Comet plugin for Spark 3.5 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.12/0.6.0/comet-spark-spark3.5_2.12-0.6.0.jar)
- [Comet plugin for Spark 3.5 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.13/0.6.0/comet-spark-spark3.5_2.13-0.6.0.jar)

## Building from source

@@ -74,7 +74,7 @@ See the [Comet Kubernetes Guide](kubernetes.md) guide.
Make sure `SPARK_HOME` points to the same Spark version as Comet was built for.

```console
export COMET_JAR=spark/target/comet-spark-spark3.4_2.12-0.6.0-SNAPSHOT.jar
export COMET_JAR=spark/target/comet-spark-spark3.4_2.12-0.7.0-SNAPSHOT.jar

$SPARK_HOME/bin/spark-shell \
--jars $COMET_JAR \
@@ -130,7 +130,7 @@ explicitly contain Comet otherwise Spark may use a different class-loader for th
components which will then fail at runtime. For example:

```
--driver-class-path spark/target/comet-spark-spark3.4_2.12-0.6.0-SNAPSHOT.jar
--driver-class-path spark/target/comet-spark-spark3.4_2.12-0.7.0-SNAPSHOT.jar
```

Some cluster managers may require additional configuration, see <https://spark.apache.org/docs/latest/cluster-overview.html>
6 changes: 3 additions & 3 deletions docs/source/user-guide/kubernetes.md
@@ -65,13 +65,13 @@ metadata:
spec:
type: Scala
mode: cluster
image: ghcr.io/apache/datafusion-comet:spark-3.4-scala-2.12-0.2.0
image: ghcr.io/apache/datafusion-comet:spark-3.4-scala-2.12-0.5.0
imagePullPolicy: IfNotPresent
mainClass: org.apache.spark.examples.SparkPi
mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.4.2.jar
sparkConf:
"spark.executor.extraClassPath": "/opt/spark/jars/comet-spark-spark3.4_2.12-0.2.0.jar"
"spark.driver.extraClassPath": "/opt/spark/jars/comet-spark-spark3.4_2.12-0.2.0.jar"
"spark.executor.extraClassPath": "/opt/spark/jars/comet-spark-spark3.4_2.12-0.5.0.jar"
"spark.driver.extraClassPath": "/opt/spark/jars/comet-spark-spark3.4_2.12-0.5.0.jar"
"spark.plugins": "org.apache.spark.CometPlugin"
"spark.comet.enabled": "true"
"spark.comet.exec.enabled": "true"
2 changes: 1 addition & 1 deletion docs/source/user-guide/source.md
@@ -27,7 +27,7 @@ Official source releases can be downloaded from https://dist.apache.org/repos/di

```console
# Pick the latest version
export COMET_VERSION=0.4.0
export COMET_VERSION=0.6.0
# Download the tarball
curl -O "https://dist.apache.org/repos/dist/release/datafusion/datafusion-comet-$COMET_VERSION/apache-datafusion-comet-$COMET_VERSION.tar.gz"
# Unpack
6 changes: 3 additions & 3 deletions fuzz-testing/README.md
@@ -61,7 +61,7 @@ Set appropriate values for `SPARK_HOME`, `SPARK_MASTER`, and `COMET_JAR` environ
$SPARK_HOME/bin/spark-submit \
--master $SPARK_MASTER \
--class org.apache.comet.fuzz.Main \
target/comet-fuzz-spark3.4_2.12-0.6.0-SNAPSHOT-jar-with-dependencies.jar \
target/comet-fuzz-spark3.4_2.12-0.7.0-SNAPSHOT-jar-with-dependencies.jar \
data --num-files=2 --num-rows=200 --exclude-negative-zero --generate-arrays --generate-structs --generate-maps
```

@@ -77,7 +77,7 @@ Generate random queries that are based on the available test files.
$SPARK_HOME/bin/spark-submit \
--master $SPARK_MASTER \
--class org.apache.comet.fuzz.Main \
target/comet-fuzz-spark3.4_2.12-0.6.0-SNAPSHOT-jar-with-dependencies.jar \
target/comet-fuzz-spark3.4_2.12-0.7.0-SNAPSHOT-jar-with-dependencies.jar \
queries --num-files=2 --num-queries=500
```

@@ -99,7 +99,7 @@ $SPARK_HOME/bin/spark-submit \
--conf spark.driver.extraClassPath=$COMET_JAR \
--conf spark.executor.extraClassPath=$COMET_JAR \
--class org.apache.comet.fuzz.Main \
target/comet-fuzz-spark3.4_2.12-0.6.0-SNAPSHOT-jar-with-dependencies.jar \
target/comet-fuzz-spark3.4_2.12-0.7.0-SNAPSHOT-jar-with-dependencies.jar \
run --num-files=2 --filename=queries.sql
```

2 changes: 1 addition & 1 deletion fuzz-testing/pom.xml
@@ -25,7 +25,7 @@ under the License.
<parent>
<groupId>org.apache.datafusion</groupId>
<artifactId>comet-parent-spark${spark.version.short}_${scala.binary.version}</artifactId>
<version>0.6.0-SNAPSHOT</version>
<version>0.7.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

8 changes: 4 additions & 4 deletions native/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion native/Cargo.toml
@@ -21,7 +21,7 @@ members = ["core", "spark-expr", "proto", "hdfs"]
resolver = "2"

[workspace.package]
version = "0.6.0"
version = "0.7.0"
homepage = "https://datafusion.apache.org/comet"
repository = "https://github.com/apache/datafusion-comet"
authors = ["Apache DataFusion <dev@datafusion.apache.org>"]
14 changes: 4 additions & 10 deletions native/core/src/execution/planner.rs
@@ -74,7 +74,7 @@ use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctio

use crate::execution::shuffle::CompressionCodec;
use crate::execution::spark_plan::SparkPlan;
use crate::parquet::parquet_support::SparkParquetOptions;
use crate::parquet::parquet_support::{register_object_store, SparkParquetOptions};
use crate::parquet::schema_adapter::SparkSchemaAdapterFactory;
use datafusion::datasource::listing::PartitionedFile;
use datafusion::datasource::physical_plan::parquet::ParquetExecBuilder;
@@ -106,7 +106,6 @@ use datafusion_common::{
tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter},
JoinType as DFJoinType, ScalarValue,
};
use datafusion_execution::object_store::ObjectStoreUrl;
use datafusion_expr::type_coercion::other::get_coerce_type_for_case_expression;
use datafusion_expr::{
AggregateUDF, ReturnTypeArgs, ScalarUDF, WindowFrame, WindowFrameBound, WindowFrameUnits,
@@ -1165,12 +1164,9 @@ impl PhysicalPlanner {
))
});

let object_store = object_store::local::LocalFileSystem::new();
// register the object store with the runtime environment
let url = Url::try_from("file://").unwrap();
self.session_ctx
.runtime_env()
.register_object_store(&url, Arc::new(object_store));
// By default, the local FS object store is registered;
// if the `hdfs` feature is enabled, the HDFS object store is registered as well
let object_store_url = register_object_store(Arc::clone(&self.session_ctx))?;

// Generate file groups
let mut file_groups: Vec<Vec<PartitionedFile>> =
@@ -1229,8 +1225,6 @@

// TODO: I think we can remove partition_count in the future, but leave for testing.
assert_eq!(file_groups.len(), partition_count);

let object_store_url = ObjectStoreUrl::local_filesystem();
let partition_fields: Vec<Field> = partition_schema
.fields()
.iter()
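The planner now delegates object store registration to a shared helper instead of registering the local filesystem inline. A minimal sketch of what such a helper might look like, reconstructed from the removed lines above (the exact signature and module layout are assumptions; the real `register_object_store` in `parquet_support` also registers an HDFS object store when the `hdfs` feature is enabled):

```rust
use std::sync::Arc;

use datafusion::prelude::SessionContext;
use datafusion_common::Result as DFResult;
use datafusion_execution::object_store::ObjectStoreUrl;
use object_store::local::LocalFileSystem;
use url::Url;

/// Hypothetical sketch: register the default (local FS) object store with the
/// runtime environment and return the matching object store URL.
fn register_object_store(session_ctx: Arc<SessionContext>) -> DFResult<ObjectStoreUrl> {
    // Local filesystem is the default object store.
    let object_store = LocalFileSystem::new();
    let url = Url::try_from("file://").unwrap();
    session_ctx
        .runtime_env()
        .register_object_store(&url, Arc::new(object_store));
    Ok(ObjectStoreUrl::local_filesystem())
}
```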