Commit 61c3e3c: upmerge

andygrove committed Feb 18, 2025
2 parents 70979cb + d61bcfc

Showing 24 changed files with 179 additions and 316 deletions.
5 changes: 0 additions & 5 deletions .github/actions/setup-spark-builder/action.yaml
@@ -26,10 +26,6 @@ inputs:
description: 'The Apache Spark version (e.g., 3.4.3) to build'
required: true
default: '3.4.3'
comet-version:
description: 'The Comet version to use for Spark'
required: true
default: '0.5.0-SNAPSHOT'
runs:
using: "composite"
steps:
@@ -46,7 +42,6 @@ runs:
run: |
cd apache-spark
git apply ../dev/diffs/${{inputs.spark-version}}.diff
../mvnw -nsu -q versions:set-property -Dproperty=comet.version -DnewVersion=${{inputs.comet-version}} -DgenerateBackupPoms=false
- name: Cache Maven dependencies
uses: actions/cache@v4
1 change: 0 additions & 1 deletion .github/workflows/spark_sql_test.yml
@@ -71,7 +71,6 @@ jobs:
with:
spark-version: ${{ matrix.spark-version.full }}
spark-short-version: ${{ matrix.spark-version.short }}
comet-version: '0.6.0-SNAPSHOT' # TODO: get this from pom.xml
- name: Run Spark tests
run: |
cd apache-spark
1 change: 0 additions & 1 deletion .github/workflows/spark_sql_test_ansi.yml
@@ -69,7 +69,6 @@ jobs:
with:
spark-version: ${{ matrix.spark-version.full }}
spark-short-version: ${{ matrix.spark-version.short }}
comet-version: '0.6.0-SNAPSHOT' # TODO: get this from pom.xml
- name: Run Spark tests
run: |
cd apache-spark
5 changes: 4 additions & 1 deletion Makefile
@@ -21,6 +21,9 @@ define spark_jvm_17_extra_args
$(shell ./mvnw help:evaluate -Dexpression=extraJavaTestArgs | grep -v '\[')
endef

# Build optional Comet native features (e.g. hdfs)
FEATURES_ARG := $(shell ! [ -z $(COMET_FEATURES) ] && echo '--features=$(COMET_FEATURES)')

all: core jvm

core:
@@ -95,7 +98,7 @@ release-linux: clean
cd native && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --release
./mvnw install -Prelease -DskipTests $(PROFILES)
release:
cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
cd native && RUSTFLAGS="$(RUSTFLAGS) -Ctarget-cpu=native" cargo build --release $(FEATURES_ARG)
./mvnw install -Prelease -DskipTests $(PROFILES)
release-nogit:
cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
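The new `FEATURES_ARG` only expands when `COMET_FEATURES` is set, so the default `release` build is unchanged. A rough sketch of how the two cases behave (the `hdfs` feature name comes from the Makefile comment above; the `-Pspark-3.4` profile is just an illustrative choice):

```shell
# COMET_FEATURES unset: FEATURES_ARG expands to nothing and cargo builds
# only the default features.
make release PROFILES="-Pspark-3.4"

# COMET_FEATURES set: FEATURES_ARG expands to --features=hdfs, which the
# release target appends to `cargo build --release`.
make release PROFILES="-Pspark-3.4" COMET_FEATURES=hdfs
```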
3 changes: 2 additions & 1 deletion common/pom.xml
@@ -26,7 +26,7 @@ under the License.
<parent>
<groupId>org.apache.datafusion</groupId>
<artifactId>comet-parent-spark${spark.version.short}_${scala.binary.version}</artifactId>
<version>0.6.0-SNAPSHOT</version>
<version>0.7.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

@@ -211,6 +211,7 @@ under the License.
<includes>
<include>libcomet.dylib</include>
<include>libcomet.so</include>
<include>comet.dll</include>
</includes>
<targetPath>org/apache/comet/${platform}/${arch}</targetPath>
</resource>
2 changes: 1 addition & 1 deletion dev/diffs/3.4.3.diff
@@ -7,7 +7,7 @@ index d3544881af1..26ab186c65d 100644
<ivy.version>2.5.1</ivy.version>
<oro.version>2.0.8</oro.version>
+ <spark.version.short>3.4</spark.version.short>
+ <comet.version>0.5.0-SNAPSHOT</comet.version>
+ <comet.version>0.7.0-SNAPSHOT</comet.version>
<!--
If you changes codahale.metrics.version, you also need to change
the link to metrics.dropwizard.io in docs/monitoring.md.
2 changes: 1 addition & 1 deletion dev/diffs/3.5.1.diff
@@ -7,7 +7,7 @@ index 0f504dbee85..430ec217e59 100644
<ivy.version>2.5.1</ivy.version>
<oro.version>2.0.8</oro.version>
+ <spark.version.short>3.5</spark.version.short>
+ <comet.version>0.5.0-SNAPSHOT</comet.version>
+ <comet.version>0.7.0-SNAPSHOT</comet.version>
<!--
If you changes codahale.metrics.version, you also need to change
the link to metrics.dropwizard.io in docs/monitoring.md.
2 changes: 1 addition & 1 deletion dev/diffs/4.0.0-preview1.diff
@@ -7,7 +7,7 @@ index a4b1b2c3c9f..6a532749978 100644
<ivy.version>2.5.2</ivy.version>
<oro.version>2.0.8</oro.version>
+ <spark.version.short>4.0</spark.version.short>
+ <comet.version>0.5.0-SNAPSHOT</comet.version>
+ <comet.version>0.7.0-SNAPSHOT</comet.version>
<!--
If you change codahale.metrics.version, you also need to change
the link to metrics.dropwizard.io in docs/monitoring.md.
5 changes: 5 additions & 0 deletions dev/release/README.md
@@ -272,6 +272,11 @@ it to GitHub Container Registry at https://github.com/apache/datafusion-comet/pk

Reply to the vote thread to close the vote and announce the release.

## Update released version number in documentation

- We provide direct links to the jar files in Maven
- The Kubernetes page needs updating once the Docker image has been published to GitHub Container Registry

## Post Release Admin

Register the release with the [Apache Reporter Service](https://reporter.apache.org/addrelease.html?datafusion) using
2 changes: 1 addition & 1 deletion docs/source/contributor-guide/debugging.md
@@ -130,7 +130,7 @@ Then build the Comet as [described](https://github.com/apache/arrow-datafusion-c
Start Comet with `RUST_BACKTRACE=1`

```console
RUST_BACKTRACE=1 $SPARK_HOME/spark-shell --jars spark/target/comet-spark-spark3.4_2.12-0.6.0-SNAPSHOT.jar --conf spark.plugins=org.apache.spark.CometPlugin --conf spark.comet.enabled=true --conf spark.comet.exec.enabled=true
RUST_BACKTRACE=1 $SPARK_HOME/spark-shell --jars spark/target/comet-spark-spark3.4_2.12-0.7.0-SNAPSHOT.jar --conf spark.plugins=org.apache.spark.CometPlugin --conf spark.comet.enabled=true --conf spark.comet.exec.enabled=true
```

Get the expanded exception details
78 changes: 78 additions & 0 deletions docs/source/user-guide/datasources.md
@@ -35,3 +35,81 @@ converted into Arrow format, allowing native execution to happen after that.

Comet does not provide native JSON scan, but when `spark.comet.convert.json.enabled` is enabled, data is immediately
converted into Arrow format, allowing native execution to happen after that.

# Supported Storages

## Local
In progress

## HDFS

The Apache DataFusion Comet native reader seamlessly scans files from remote HDFS for [supported formats](#supported-spark-data-sources).

### Using experimental native DataFusion reader
Unlike the native Comet reader, the DataFusion reader fully supports nested type processing. This reader is currently experimental.

To build Comet with the native DataFusion reader and remote HDFS support, a JDK must be installed.

Example: build Comet for `spark-3.4`, providing the JDK path in `JAVA_HOME` and the JRE linker path in `RUSTFLAGS`. The linker path can vary depending on the system; typically the JRE linker is part of the installed JDK.

```shell
export JAVA_HOME="/opt/homebrew/opt/openjdk@11"
make release PROFILES="-Pspark-3.4" COMET_FEATURES=hdfs RUSTFLAGS="-L $JAVA_HOME/libexec/openjdk.jdk/Contents/Home/lib/server"
```

Start Comet with the experimental reader and HDFS support as [described](installation.md/#run-spark-shell-with-comet-enabled)
and add the following additional parameters:

```shell
--conf spark.comet.scan.impl=native_datafusion \
--conf spark.hadoop.fs.defaultFS="hdfs://namenode:9000" \
--conf spark.hadoop.dfs.client.use.datanode.hostname=true \
--conf dfs.client.use.datanode.hostname=true
```

Query a struct type from remote HDFS:
```shell
spark.read.parquet("hdfs://namenode:9000/user/data").show(false)

root
|-- id: integer (nullable = true)
|-- first_name: string (nullable = true)
|-- personal_info: struct (nullable = true)
| |-- firstName: string (nullable = true)
| |-- lastName: string (nullable = true)
| |-- ageInYears: integer (nullable = true)

25/01/30 16:50:43 INFO core/src/lib.rs: Comet native library version 0.6.0 initialized
== Physical Plan ==
* CometColumnarToRow (2)
+- CometNativeScan: (1)


(1) CometNativeScan:
Output [3]: [id#0, first_name#1, personal_info#4]
Arguments: [id#0, first_name#1, personal_info#4]

(2) CometColumnarToRow [codegen id : 1]
Input [3]: [id#0, first_name#1, personal_info#4]


25/01/30 16:50:44 INFO fs-hdfs-0.1.12/src/hdfs.rs: Connecting to Namenode (hdfs://namenode:9000)
+---+----------+-----------------+
|id |first_name|personal_info |
+---+----------+-----------------+
|2 |Jane |{Jane, Smith, 34}|
|1 |John |{John, Doe, 28} |
+---+----------+-----------------+



```

Verify that the scan type shown in the plan is `CometNativeScan`.

More on [HDFS Reader](../../../native/hdfs/README.md)

## S3
In progress
18 changes: 9 additions & 9 deletions docs/source/user-guide/installation.md
@@ -49,14 +49,14 @@ Cloud Service Providers.
Comet jar files are available in [Maven Central](https://central.sonatype.com/namespace/org.apache.datafusion) for amd64 and arm64 architectures for Linux. For Apple OSX, it
is currently necessary to build from source.

Here are the direct links for downloading the Comet jar file.
Here are the direct links for downloading the Comet 0.6.0 jar file.

- [Comet plugin for Spark 3.3 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.3_2.12/0.4.0/comet-spark-spark3.3_2.12-0.4.0.jar)
- [Comet plugin for Spark 3.3 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.3_2.13/0.4.0/comet-spark-spark3.3_2.13-0.4.0.jar)
- [Comet plugin for Spark 3.4 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.4_2.12/0.4.0/comet-spark-spark3.4_2.12-0.4.0.jar)
- [Comet plugin for Spark 3.4 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.4_2.13/0.4.0/comet-spark-spark3.4_2.13-0.4.0.jar)
- [Comet plugin for Spark 3.5 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.12/0.4.0/comet-spark-spark3.5_2.12-0.4.0.jar)
- [Comet plugin for Spark 3.5 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.13/0.4.0/comet-spark-spark3.5_2.13-0.4.0.jar)
- [Comet plugin for Spark 3.3 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.3_2.12/0.6.0/comet-spark-spark3.3_2.12-0.6.0.jar)
- [Comet plugin for Spark 3.3 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.3_2.13/0.6.0/comet-spark-spark3.3_2.13-0.6.0.jar)
- [Comet plugin for Spark 3.4 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.4_2.12/0.6.0/comet-spark-spark3.4_2.12-0.6.0.jar)
- [Comet plugin for Spark 3.4 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.4_2.13/0.6.0/comet-spark-spark3.4_2.13-0.6.0.jar)
- [Comet plugin for Spark 3.5 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.12/0.6.0/comet-spark-spark3.5_2.12-0.6.0.jar)
- [Comet plugin for Spark 3.5 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.13/0.6.0/comet-spark-spark3.5_2.13-0.6.0.jar)

## Building from source

@@ -74,7 +74,7 @@ See the [Comet Kubernetes Guide](kubernetes.md) guide.
Make sure `SPARK_HOME` points to the same Spark version as Comet was built for.

```console
export COMET_JAR=spark/target/comet-spark-spark3.4_2.12-0.6.0-SNAPSHOT.jar
export COMET_JAR=spark/target/comet-spark-spark3.4_2.12-0.7.0-SNAPSHOT.jar

$SPARK_HOME/bin/spark-shell \
--jars $COMET_JAR \
@@ -130,7 +130,7 @@ explicitly contain Comet otherwise Spark may use a different class-loader for th
components which will then fail at runtime. For example:

```
--driver-class-path spark/target/comet-spark-spark3.4_2.12-0.6.0-SNAPSHOT.jar
--driver-class-path spark/target/comet-spark-spark3.4_2.12-0.7.0-SNAPSHOT.jar
```

Some cluster managers may require additional configuration, see <https://spark.apache.org/docs/latest/cluster-overview.html>
6 changes: 3 additions & 3 deletions docs/source/user-guide/kubernetes.md
@@ -65,13 +65,13 @@ metadata:
spec:
type: Scala
mode: cluster
image: ghcr.io/apache/datafusion-comet:spark-3.4-scala-2.12-0.2.0
image: ghcr.io/apache/datafusion-comet:spark-3.4-scala-2.12-0.5.0
imagePullPolicy: IfNotPresent
mainClass: org.apache.spark.examples.SparkPi
mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.4.2.jar
sparkConf:
"spark.executor.extraClassPath": "/opt/spark/jars/comet-spark-spark3.4_2.12-0.2.0.jar"
"spark.driver.extraClassPath": "/opt/spark/jars/comet-spark-spark3.4_2.12-0.2.0.jar"
"spark.executor.extraClassPath": "/opt/spark/jars/comet-spark-spark3.4_2.12-0.5.0.jar"
"spark.driver.extraClassPath": "/opt/spark/jars/comet-spark-spark3.4_2.12-0.5.0.jar"
"spark.plugins": "org.apache.spark.CometPlugin"
"spark.comet.enabled": "true"
"spark.comet.exec.enabled": "true"
2 changes: 1 addition & 1 deletion docs/source/user-guide/source.md
@@ -27,7 +27,7 @@ Official source releases can be downloaded from https://dist.apache.org/repos/di

```console
# Pick the latest version
export COMET_VERSION=0.4.0
export COMET_VERSION=0.6.0
# Download the tarball
curl -O "https://dist.apache.org/repos/dist/release/datafusion/datafusion-comet-$COMET_VERSION/apache-datafusion-comet-$COMET_VERSION.tar.gz"
# Unpack
6 changes: 3 additions & 3 deletions fuzz-testing/README.md
@@ -61,7 +61,7 @@ Set appropriate values for `SPARK_HOME`, `SPARK_MASTER`, and `COMET_JAR` environ
$SPARK_HOME/bin/spark-submit \
--master $SPARK_MASTER \
--class org.apache.comet.fuzz.Main \
target/comet-fuzz-spark3.4_2.12-0.6.0-SNAPSHOT-jar-with-dependencies.jar \
target/comet-fuzz-spark3.4_2.12-0.7.0-SNAPSHOT-jar-with-dependencies.jar \
data --num-files=2 --num-rows=200 --exclude-negative-zero --generate-arrays --generate-structs --generate-maps
```

@@ -77,7 +77,7 @@ Generate random queries that are based on the available test files.
$SPARK_HOME/bin/spark-submit \
--master $SPARK_MASTER \
--class org.apache.comet.fuzz.Main \
target/comet-fuzz-spark3.4_2.12-0.6.0-SNAPSHOT-jar-with-dependencies.jar \
target/comet-fuzz-spark3.4_2.12-0.7.0-SNAPSHOT-jar-with-dependencies.jar \
queries --num-files=2 --num-queries=500
```

@@ -99,7 +99,7 @@ $SPARK_HOME/bin/spark-submit \
--conf spark.driver.extraClassPath=$COMET_JAR \
--conf spark.executor.extraClassPath=$COMET_JAR \
--class org.apache.comet.fuzz.Main \
target/comet-fuzz-spark3.4_2.12-0.6.0-SNAPSHOT-jar-with-dependencies.jar \
target/comet-fuzz-spark3.4_2.12-0.7.0-SNAPSHOT-jar-with-dependencies.jar \
run --num-files=2 --filename=queries.sql
```

2 changes: 1 addition & 1 deletion fuzz-testing/pom.xml
@@ -25,7 +25,7 @@ under the License.
<parent>
<groupId>org.apache.datafusion</groupId>
<artifactId>comet-parent-spark${spark.version.short}_${scala.binary.version}</artifactId>
<version>0.6.0-SNAPSHOT</version>
<version>0.7.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

8 changes: 4 additions & 4 deletions native/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion native/Cargo.toml
@@ -21,7 +21,7 @@ members = ["core", "spark-expr", "proto", "hdfs"]
resolver = "2"

[workspace.package]
version = "0.6.0"
version = "0.7.0"
homepage = "https://datafusion.apache.org/comet"
repository = "https://github.com/apache/datafusion-comet"
authors = ["Apache DataFusion <dev@datafusion.apache.org>"]
14 changes: 4 additions & 10 deletions native/core/src/execution/planner.rs
@@ -74,7 +74,7 @@ use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctio

use crate::execution::shuffle::CompressionCodec;
use crate::execution::spark_plan::SparkPlan;
use crate::parquet::parquet_support::SparkParquetOptions;
use crate::parquet::parquet_support::{register_object_store, SparkParquetOptions};
use crate::parquet::schema_adapter::SparkSchemaAdapterFactory;
use datafusion::datasource::listing::PartitionedFile;
use datafusion::datasource::physical_plan::parquet::ParquetExecBuilder;
@@ -106,7 +106,6 @@ use datafusion_common::{
tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter},
JoinType as DFJoinType, ScalarValue,
};
use datafusion_execution::object_store::ObjectStoreUrl;
use datafusion_expr::type_coercion::other::get_coerce_type_for_case_expression;
use datafusion_expr::{
AggregateUDF, ReturnTypeArgs, ScalarUDF, WindowFrame, WindowFrameBound, WindowFrameUnits,
@@ -1165,12 +1164,9 @@ impl PhysicalPlanner {
))
});

let object_store = object_store::local::LocalFileSystem::new();
// register the object store with the runtime environment
let url = Url::try_from("file://").unwrap();
self.session_ctx
.runtime_env()
.register_object_store(&url, Arc::new(object_store));
// By default, the local FS object store is registered;
// if the `hdfs` feature is enabled, the HDFS object store is registered as well
let object_store_url = register_object_store(Arc::clone(&self.session_ctx))?;

// Generate file groups
let mut file_groups: Vec<Vec<PartitionedFile>> =
@@ -1229,8 +1225,6 @@

// TODO: I think we can remove partition_count in the future, but leave for testing.
assert_eq!(file_groups.len(), partition_count);

let object_store_url = ObjectStoreUrl::local_filesystem();
let partition_fields: Vec<Field> = partition_schema
.fields()
.iter()
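The planner now delegates object store registration to a shared helper instead of registering the local filesystem inline. A minimal sketch of what such a helper might look like, reconstructed from the removed lines above (the exact signature and module layout are assumptions; the real `register_object_store` in `parquet_support` also registers an HDFS object store when the `hdfs` feature is enabled):

```rust
use std::sync::Arc;

use datafusion::prelude::SessionContext;
use datafusion_common::Result as DFResult;
use datafusion_execution::object_store::ObjectStoreUrl;
use object_store::local::LocalFileSystem;
use url::Url;

/// Hypothetical sketch: register the default (local FS) object store with the
/// runtime environment and return the matching object store URL.
fn register_object_store(session_ctx: Arc<SessionContext>) -> DFResult<ObjectStoreUrl> {
    // Local filesystem is the default object store.
    let object_store = LocalFileSystem::new();
    let url = Url::try_from("file://").unwrap();
    session_ctx
        .runtime_env()
        .register_object_store(&url, Arc::new(object_store));
    Ok(ObjectStoreUrl::local_filesystem())
}
```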