Skip to content

Commit

Permalink
Merge branch 'main' into spillread
Browse files Browse the repository at this point in the history
  • Loading branch information
jinchengchenghh authored Nov 27, 2024
2 parents 6471ecc + 2649fa7 commit 366c89c
Show file tree
Hide file tree
Showing 95 changed files with 918 additions and 501 deletions.
3 changes: 3 additions & 0 deletions .asf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ github:
- spark-sql
- vectorization
- velox
collaborators:
- majetideepak
- pedroerp
enabled_merge_buttons:
squash: true
merge: false
Expand Down
31 changes: 19 additions & 12 deletions .github/workflows/build_bundle_package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,8 @@ on:
jobs:
build-native-lib:
runs-on: ubuntu-20.04
container: apache/gluten:vcpkg-centos-7
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Get Ccache
uses: actions/cache/restore@v3
with:
Expand All @@ -50,39 +49,47 @@ jobs:
ccache-centos7-release-default
- name: Build Gluten velox third party
run: |
df -a
cd $GITHUB_WORKSPACE/
bash dev/ci-velox-buildstatic-centos-7.sh
docker run -v $GITHUB_WORKSPACE:/work -w /work apache/gluten:vcpkg-centos-7 bash -c "
df -a
cd /work
export CCACHE_DIR=/work/.ccache
bash dev/ci-velox-buildstatic-centos-7.sh
ccache -s
mkdir -p /work/.m2/repository/org/apache/arrow/
cp -r /root/.m2/repository/org/apache/arrow/* /work/.m2/repository/org/apache/arrow/
"
- name: Upload native libs
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v3
with:
path: ./cpp/build/releases/
name: velox-native-lib-${{github.sha}}
retention-days: 1
- name: Upload Artifact Arrow Jar
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v3
with:
path: /root/.m2/repository/org/apache/arrow/
name: velox-arrow-jar-centos-7-${{github.sha}}

build-bundle-package-centos7:
build-bundle-package-centos8:
needs: build-native-lib
runs-on: ubuntu-20.04
container: centos:7
container: centos:8
steps:
- uses: actions/checkout@v2
- name: Download All Artifacts
uses: actions/download-artifact@v2
uses: actions/download-artifact@v3
with:
name: velox-native-lib-${{github.sha}}
path: ./cpp/build/releases
- name: Download All Arrow Jar Artifacts
uses: actions/download-artifact@v2
uses: actions/download-artifact@v3
with:
name: velox-arrow-jar-centos-7-${{github.sha}}
path: /root/.m2/repository/org/apache/arrow/
- name: Setup java and maven
run: |
sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && \
yum update -y && yum install -y java-1.8.0-openjdk-devel wget && \
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz && \
tar -xvf apache-maven-3.8.8-bin.tar.gz && \
Expand All @@ -94,7 +101,7 @@ jobs:
export PATH=${PATH}:${MAVEN_HOME}/bin && \
mvn clean install -P${{ github.event.inputs.spark }} -Dhadoop.version=${{ github.event.inputs.hadoop }} -Pbackends-velox -Pceleborn -Puniffle -DskipTests -Dmaven.source.skip
- name: Upload bundle package
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v3
with:
name: gluten-velox-bundle-package
path: package/target/gluten-velox-bundle-*.jar
Expand Down
98 changes: 83 additions & 15 deletions .github/workflows/velox_backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,8 @@ concurrency:
jobs:
build-native-lib-centos-7:
runs-on: ubuntu-20.04
container: apache/gluten:vcpkg-centos-7
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Get Ccache
uses: actions/cache/restore@v3
with:
Expand All @@ -68,10 +67,17 @@ jobs:
ccache-centos7-release-default
- name: Build Gluten native libraries
run: |
df -a
cd $GITHUB_WORKSPACE/
bash dev/ci-velox-buildstatic-centos-7.sh
ccache -s
docker pull apache/gluten:vcpkg-centos-7
docker run -v $GITHUB_WORKSPACE:/work -w /work apache/gluten:vcpkg-centos-7 bash -c "
df -a
cd /work
export CCACHE_DIR=/work/.ccache
bash dev/ci-velox-buildstatic-centos-7.sh
ccache -s
mkdir -p /work/.m2/repository/org/apache/arrow/
cp -r /root/.m2/repository/org/apache/arrow/* /work/.m2/repository/org/apache/arrow/
"
- name: "Save ccache"
uses: actions/cache/save@v3
id: ccache
Expand All @@ -85,7 +91,7 @@ jobs:
- uses: actions/upload-artifact@v3
with:
name: arrow-jars-centos-7-${{github.sha}}
path: /root/.m2/repository/org/apache/arrow/
path: .m2/repository/org/apache/arrow/

run-tpc-test-ubuntu:
needs: build-native-lib-centos-7
Expand Down Expand Up @@ -158,12 +164,12 @@ jobs:
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1
run-tpc-test-centos:
run-tpc-test-centos8:
needs: build-native-lib-centos-7
strategy:
fail-fast: false
matrix:
os: [ "centos:7", "centos:8" ]
os: [ "centos:8" ]
spark: [ "spark-3.2", "spark-3.3", "spark-3.4", "spark-3.5" ]
java: [ "java-8", "java-11", "java-17" ]
# Spark supports JDK17 since 3.3 and later, see https://issues.apache.org/jira/browse/SPARK-33772
Expand Down Expand Up @@ -249,6 +255,68 @@ jobs:
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
--extra-conf=spark.gluten.ras.enabled=true
# Runs the TPC-H / TPC-DS comparison suites (with and without RAS) inside a
# centos:7 container started via `docker run` on the ubuntu-20.04 runner,
# using the native libs and Arrow jars produced by build-native-lib-centos-7.
run-tpc-test-centos7:
  needs: build-native-lib-centos-7
  strategy:
    fail-fast: false
    matrix:
      spark: [ "spark-3.2", "spark-3.3", "spark-3.4", "spark-3.5" ]
      # Only java-8 is listed: the container script below installs
      # java-1.8.0-openjdk-devel and points JAVA_HOME at it.
      java: [ "java-8" ]
      # Spark supports JDK17 since 3.3 and later, see https://issues.apache.org/jira/browse/SPARK-33772
  runs-on: ubuntu-20.04
  steps:
    - uses: actions/checkout@v4
    - name: Download All Native Artifacts
      uses: actions/download-artifact@v3
      with:
        name: velox-native-lib-centos-7-${{github.sha}}
        path: ./cpp/build/releases/
    - name: Download All Arrow Jar Artifacts
      uses: actions/download-artifact@v3
      with:
        name: arrow-jars-centos-7-${{github.sha}}
        # Workspace-relative; mounted into the container as /root/.m2 below.
        path: .m2/repository/org/apache/arrow/
    - name: Build and run TPCH/DS tests
      run: |
        docker pull centos:7
        # The script passed to `bash -c` is double-quoted, so this (host) shell
        # expands any unescaped $VAR before the container sees it. PATH is
        # therefore escaped so that it is resolved inside the container.
        # `set -e` makes the container script stop at the first failing command;
        # without it only the last command's exit status reaches this step.
        docker run -v $GITHUB_WORKSPACE:/work -v $GITHUB_WORKSPACE/.m2:/root/.m2/ -w /work \
          -e matrix.java=${{ matrix.java }} -e matrix.spark=${{ matrix.spark }} \
          centos:7 \
          bash -c "
            set -e
            sed -i -e 's|mirrorlist=|#mirrorlist=|g' /etc/yum.repos.d/CentOS-* || true
            sed -i -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* || true
            # Setup java and maven
            yum update -y && yum install -y java-1.8.0-openjdk-devel wget
            wget -nv https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
            tar -xvf apache-maven-3.8.8-bin.tar.gz && mv apache-maven-3.8.8 /usr/lib/maven
            export PATH=\${PATH}:/usr/lib/maven/bin
            # Set environment variables
            export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
            # Build gluten-it
            mvn -ntp clean install -P${{ matrix.spark }} -P${{ matrix.java }} -Pbackends-velox -DskipTests
            cd /work/tools/gluten-it
            mvn -ntp clean install -P${{ matrix.spark }} -P${{ matrix.java }}
            # Run TPC-H / TPC-DS
            GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
              --local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1
            GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
              --local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1
            # Run TPC-H / TPC-DS with RAS
            cd /work/tools/gluten-it
            GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
              --local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
              --extra-conf=spark.gluten.ras.enabled=true \
            && GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
              --local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
              --extra-conf=spark.gluten.ras.enabled=true
          "
run-tpc-test-ubuntu-oom:
needs: build-native-lib-centos-7
strategy:
Expand Down Expand Up @@ -956,12 +1024,12 @@ jobs:
df -a
bash dev/ci-velox-buildshared-centos-8.sh
ccache -s
- name: "Save ccache"
uses: actions/cache/save@v3
id: ccache
with:
path: '${{ env.CCACHE_DIR }}'
key: ccache-centos8-release-default-${{github.sha}}
# - name: "Save ccache"
# uses: actions/cache/save@v3
# id: ccache
# with:
# path: '${{ env.CCACHE_DIR }}'
# key: ccache-centos8-release-default-${{github.sha}}
- name: Run CPP unit test
run: |
cd ./cpp/build && ctest -V
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/velox_backend_cache.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@ concurrency:
jobs:
cache-native-lib-centos-7:
runs-on: ubuntu-20.04
container: apache/gluten:vcpkg-centos-7
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Get Ccache
uses: actions/cache/restore@v3
with:
Expand All @@ -43,8 +42,9 @@ jobs:
ccache-centos7-release-default
- name: Build Gluten native libraries
run: |
df -a
bash dev/ci-velox-buildstatic-centos-7.sh
docker run -v $GITHUB_WORKSPACE:/work -w /work apache/gluten:vcpkg-centos-7 bash -c "
bash dev/ci-velox-buildstatic-centos-7.sh
"
- name: Save Ccache
uses: actions/cache/save@v3
id: ccache
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import org.apache.gluten.columnarbatch.CHBatch
import org.apache.gluten.execution.WriteFilesExecTransformer
import org.apache.gluten.expression.WindowFunctionsBuilder
import org.apache.gluten.extension.ValidationResult
import org.apache.gluten.extension.columnar.transition.Convention
import org.apache.gluten.extension.columnar.transition.{Convention, ConventionFunc}
import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat
import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat._

Expand All @@ -34,6 +34,8 @@ import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec
import org.apache.spark.sql.execution.datasources.FileFormat
import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
Expand All @@ -45,10 +47,11 @@ import java.util.Locale
import scala.util.control.Breaks.{break, breakable}

class CHBackend extends SubstraitBackend {
import CHBackend._
override def name(): String = CHConf.BACKEND_NAME
override def defaultBatchType: Convention.BatchType = CHBatch
override def buildInfo(): Backend.BuildInfo =
Backend.BuildInfo("ClickHouse", CH_BRANCH, CH_COMMIT, "UNKNOWN")
override def convFuncOverride(): ConventionFunc.Override = new ConvFunc()
override def iteratorApi(): IteratorApi = new CHIteratorApi
override def sparkPlanExecApi(): SparkPlanExecApi = new CHSparkPlanExecApi
override def transformerApi(): TransformerApi = new CHTransformerApi
Expand All @@ -59,7 +62,17 @@ class CHBackend extends SubstraitBackend {
override def settings(): BackendSettingsApi = CHBackendSettings
}

object CHBackend {

  /**
   * Convention override for the ClickHouse backend: an [[AdaptiveSparkPlanExec]] whose
   * `supportsColumnar` is true is reported as producing [[CHBatch]]. The partial function is
   * undefined for every other plan.
   */
  private class ConvFunc() extends ConventionFunc.Override {
    override def batchTypeOf: PartialFunction[SparkPlan, Convention.BatchType] = {
      case aqe: AdaptiveSparkPlanExec if aqe.supportsColumnar => CHBatch
    }
  }
}

object CHBackendSettings extends BackendSettingsApi with Logging {
override def primaryBatchType: Convention.BatchType = CHBatch

private val GLUTEN_CLICKHOUSE_SEP_SCAN_RDD = "spark.gluten.sql.columnar.separate.scan.rdd.for.ch"
private val GLUTEN_CLICKHOUSE_SEP_SCAN_RDD_DEFAULT = "false"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package org.apache.gluten.backendsapi.clickhouse

import org.apache.gluten.GlutenConfig
import org.apache.gluten.backendsapi.RuleApi
import org.apache.gluten.columnarbatch.CHBatch
import org.apache.gluten.extension._
import org.apache.gluten.extension.columnar._
import org.apache.gluten.extension.columnar.MiscColumnarRules.{RemoveGlutenTableCacheColumnarToRow, RemoveTopmostColumnarToRow, RewriteSubqueryBroadcast}
Expand Down Expand Up @@ -114,7 +115,7 @@ object CHRuleApi {
intercept(
SparkPlanRules.extendedColumnarRule(c.glutenConf.extendedColumnarTransformRules)(
c.session)))
injector.injectPostTransform(c => InsertTransitions(c.outputsColumnar))
injector.injectPostTransform(c => InsertTransitions.create(c.outputsColumnar, CHBatch))

// Gluten columnar: Fallback policies.
injector.injectFallbackPolicy(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ object MetricsUtil extends Logging {
j.metricsUpdater(),
// must put the buildPlan first
Seq(treeifyMetricsUpdaters(j.buildPlan), treeifyMetricsUpdaters(j.streamedPlan)))
case t: TransformSupport if t.metricsUpdater() == MetricsUpdater.None =>
assert(t.children.size == 1, "MetricsUpdater.None can only be used on unary operator")
treeifyMetricsUpdaters(t.children.head)
case t: TransformSupport =>
MetricsUpdaterTree(t.metricsUpdater(), t.children.map(treeifyMetricsUpdaters))
case _ =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2192,7 +2192,7 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
}
}

test("GLUTEN-3135: Bug fix to_date") {
test("GLUTEN-3135/GLUTEN-7896: Bug fix to_date") {
val create_table_sql =
"""
| create table test_tbl_3135(id bigint, data string) using parquet
Expand All @@ -2209,13 +2209,27 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
|(7, '1970-01-01 00:00:00'),
|(8, '2024-3-2'),
|(9, '2024-03-2'),
|(10, '2024-03')
|(10, '2024-03'),
|(11, '2024-03-02 11:22:33')
|""".stripMargin
spark.sql(create_table_sql)
spark.sql(insert_data_sql)

val select_sql = "select id, to_date(data) from test_tbl_3135"
compareResultsAgainstVanillaSpark(select_sql, true, { _ => })

withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "corrected")) {
compareResultsAgainstVanillaSpark(
"select id, to_date('2024-03-2 11:22:33', 'yyyy-MM-dd') from test_tbl_3135 where id = 11",
true,
{ _ => })
}
withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "legacy")) {
compareResultsAgainstVanillaSpark(
"select id, to_date(data, 'yyyy-MM-dd') from test_tbl_3135 where id = 11",
true,
{ _ => })
}
spark.sql("drop table test_tbl_3135")
}

Expand Down
Loading

0 comments on commit 366c89c

Please sign in to comment.