Commit
Merge pull request #13773 from JohnSnowLabs/feature/SPARKNLP-819-Is-Spark-NLP-fully-compatible-with-Apache-Spark-3.4

SPARKNLP-819 Adding changes to make Apache Spark 3.4.0 the default version
maziyarpanahi authored May 10, 2023
2 parents 7b2674e + 5255538 commit 706ea4a
Showing 13 changed files with 67 additions and 23 deletions.
52 changes: 44 additions & 8 deletions .github/workflows/build_and_test.yml
@@ -31,13 +31,13 @@ on:
      - 'main'

jobs:
-  spark33:
+  spark34:
    if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')"
    runs-on: macos-latest
    env:
      TF_CPP_MIN_LOG_LEVEL: 3
      JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC"
-    name: Build and Test on Apache Spark 3.3.x
+    name: Build and Test on Apache Spark 3.4.x

    steps:
      - uses: actions/checkout@v3
@@ -54,19 +54,55 @@ jobs:
      - name: Install Python packages (Python 3.7)
        run: |
          python -m pip install --upgrade pip
-          pip install pyspark==3.3.1 numpy pytest
-      - name: Build Spark NLP on Apache Spark 3.3.0
+          pip install pyspark==3.4.0 numpy pytest
+      - name: Build Spark NLP on Apache Spark 3.4.0
        run: |
          brew install sbt
-          sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy
-      - name: Test Spark NLP in Scala - Apache Spark 3.3.x
+          sbt -mem 4096 -Dis_spark34=true clean assemblyAndCopy
+      - name: Test Spark NLP in Scala - Apache Spark 3.4.x
        run: |
          sbt -mem 4096 coverage test
      - name: Upload coverage data to Coveralls
        run: sbt coverageReport coveralls
        env:
          COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          COVERALLS_FLAG_NAME: Apache Spark 3.3.x - Scala 2.12
+          COVERALLS_FLAG_NAME: Apache Spark 3.4.x - Scala 2.12
+      - name: Test Spark NLP in Python - Apache Spark 3.4.x
+        run: |
+          cd python
+          python3.7 -m pytest -v -m fast
+  spark33:
+    if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')"
+    runs-on: macos-latest
+    env:
+      TF_CPP_MIN_LOG_LEVEL: 3
+      JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC"
+    name: Build and Test on Apache Spark 3.3.x
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-java@v3
+        with:
+          distribution: 'adopt'
+          java-version: '8'
+          cache: 'sbt'
+      - name: Install Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7.7
+          architecture: x64
+      - name: Install Python packages (Python 3.7)
+        run: |
+          python -m pip install --upgrade pip
+          pip install pyspark==3.3.1 numpy pytest
+      - name: Build Spark NLP on Apache Spark 3.3.1
+        run: |
+          brew install sbt
+          sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy
+      - name: Test Spark NLP in Scala - Apache Spark 3.3.x
+        run: |
+          sbt -mem 4096 test
+      - name: Test Spark NLP in Python - Apache Spark 3.3.x
+        run: |
+          cd python
@@ -99,7 +135,7 @@ jobs:
      - name: Build Spark NLP on Apache Spark 3.2.3
        run: |
          brew install sbt
-          sbt -mem 4096 clean assemblyAndCopy
+          sbt -mem 4096 -Dis_spark32=true clean assemblyAndCopy
      - name: Test Spark NLP in Scala - Apache Spark 3.2.x
        run: |
          sbt -mem 4096 test
15 changes: 10 additions & 5 deletions project/Dependencies.scala
@@ -4,11 +4,12 @@ object Dependencies {

  /** ------- Spark version start ------- */
  /* default spark version to base the APIS */
-  val spark33Ver = "3.3.1"
+  val spark34Ver = "3.4.0"
  /* only used in unit tests */
  val spark30Ver = "3.0.3"
  val spark31Ver = "3.1.3"
  val spark32Ver = "3.2.3"
+  val spark33Ver = "3.3.1"

  /* required for different hardware */
  val is_gpu: String = System.getProperty("is_gpu", "false")
@@ -20,9 +21,10 @@
  val is_spark30: String = System.getProperty("is_spark30", "false")
  val is_spark31: String = System.getProperty("is_spark31", "false")
  val is_spark32: String = System.getProperty("is_spark32", "false")
-  val is_spark33: String = System.getProperty("is_spark33", "true")
+  val is_spark33: String = System.getProperty("is_spark33", "false")
+  val is_spark34: String = System.getProperty("is_spark34", "true")

-  val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33)
+  val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33, is_spark34)

  /** ------- Spark version end ------- */
@@ -43,16 +45,19 @@
      is_spark30: String,
      is_spark31: String,
      is_spark32: String,
-      is_spark33: String): String = {
+      is_spark33: String,
+      is_spark34: String): String = {
    if (is_spark30.equals("true")) {
      spark30Ver
    } else if (is_spark31.equals("true")) {
      spark31Ver
    } else if (is_spark32.equals("true")) {
      spark32Ver
+    } else if (is_spark33.equals("true")) {
+      spark33Ver
    } else {
      /* default spark version */
-      spark33Ver
+      spark34Ver
    }
  }

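Taken together, the new selection logic in project/Dependencies.scala reads as below. This is a sketch reconstructed from the hunks above, not a verbatim copy of the file; the is_spark3x values come from the -Dis_spark3x=true flags the CI jobs pass to sbt:

    /* Reconstructed: the first flag set to "true" wins; with no flag,
       the build falls through to the new Spark 3.4 default. */
    def getSparkVersion(
        is_spark30: String,
        is_spark31: String,
        is_spark32: String,
        is_spark33: String,
        is_spark34: String): String = {
      if (is_spark30.equals("true")) spark30Ver // "3.0.3"
      else if (is_spark31.equals("true")) spark31Ver // "3.1.3"
      else if (is_spark32.equals("true")) spark32Ver // "3.2.3"
      else if (is_spark33.equals("true")) spark33Ver // "3.3.1"
      else spark34Ver // default, "3.4.0"
    }

For example, sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy builds against Spark 3.3.1, while a plain sbt clean assemblyAndCopy now targets 3.4.0.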
@@ -17,7 +17,7 @@
 package com.johnsnowlabs.ml.tensorflow.sentencepiece

 import com.johnsnowlabs.nlp.util.io.ResourceHelper
-import org.apache.commons.lang.SystemUtils
+import org.apache.commons.lang3.SystemUtils
 import org.apache.spark.SparkFiles
 import org.apache.spark.sql.SparkSession
 import org.tensorflow.TensorFlow
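The import swap from org.apache.commons.lang to org.apache.commons.lang3 recurs in several files below. The commons-lang 2.x artifact is no longer guaranteed on the classpath with Spark 3.4 as the baseline (that is the presumed motivation here), and for these utilities lang3 is a drop-in replacement. A minimal sketch of the equivalent lang3 call:

    import org.apache.commons.lang3.SystemUtils

    // Same OS checks as commons-lang 2.x, new package.
    val isWindows: Boolean = SystemUtils.IS_OS_WINDOWS
    val isMac: Boolean = SystemUtils.IS_OS_MAC_OSX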
@@ -18,7 +18,7 @@ package com.johnsnowlabs.nlp.annotators

 import com.johnsnowlabs.nlp.util.regex.RuleFactory
 import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate}
-import org.apache.commons.lang.time.DateUtils
+import org.apache.commons.lang3.time.DateUtils
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}

 import java.text.SimpleDateFormat
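The same migration applies to DateUtils. An illustrative lang3 call (the arguments are hypothetical, not taken from this annotator):

    import java.util.{Calendar, Date}
    import org.apache.commons.lang3.time.DateUtils

    // Truncate a timestamp to midnight of the same day.
    val startOfDay: Date = DateUtils.truncate(new Date(), Calendar.DAY_OF_MONTH)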
@@ -17,7 +17,7 @@
 package com.johnsnowlabs.nlp.annotators.ner.dl

 import com.johnsnowlabs.nlp.util.io.ResourceHelper
-import org.apache.commons.lang.SystemUtils
+import org.apache.commons.lang3.SystemUtils
 import org.apache.spark.SparkFiles
 import org.apache.spark.sql.SparkSession
 import org.tensorflow.TensorFlow
@@ -27,7 +27,7 @@ import com.johnsnowlabs.nlp.util.io.{OutputHelper, ResourceHelper}
 import com.johnsnowlabs.nlp.{AnnotatorApproach, AnnotatorType, ParamsAndFeaturesWritable}
 import com.johnsnowlabs.storage.HasStorageRef
 import org.apache.commons.io.IOUtils
-import org.apache.commons.lang.SystemUtils
+import org.apache.commons.lang3.SystemUtils
 import org.apache.spark.ml.PipelineModel
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
@@ -17,7 +17,7 @@
 package com.johnsnowlabs.nlp.annotators.tokenizer.bpe

 import com.johnsnowlabs.nlp.annotators.common.{IndexedToken, Sentence, TokenPiece}
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils

 import scala.collection.mutable
 import scala.collection.mutable.ListBuffer
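StringUtils follows the same pattern here and in the normalizer below. A short sketch of typical lang3 calls, chosen for illustration rather than taken from the tokenizer:

    import org.apache.commons.lang3.StringUtils

    StringUtils.isNotBlank("  token ") // true
    StringUtils.stripAccents("café")   // "cafe"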
@@ -16,7 +16,7 @@

 package com.johnsnowlabs.nlp.annotators.tokenizer.normalizer

-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils

 import scala.util.matching.Regex
3 changes: 2 additions & 1 deletion src/main/scala/com/johnsnowlabs/util/CoNLLGenerator.scala
@@ -20,7 +20,7 @@ import org.apache.spark.ml.PipelineModel
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

-import org.apache.commons.lang.StringEscapeUtils.escapeJava
+import org.apache.commons.text.StringEscapeUtils.escapeJava
 import scala.collection.mutable.ArrayBuffer
 import scala.util.Try
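escapeJava is a slightly different case: StringEscapeUtils was deprecated in commons-lang3 and now lives in the separate commons-text artifact, hence the new import. A quick sketch of the behavior (the example string is illustrative):

    import org.apache.commons.text.StringEscapeUtils.escapeJava

    // Control characters are escaped so a token cannot break the
    // one-token-per-line CoNLL layout.
    val escaped: String = escapeJava("one\ttwo") // "one\\ttwo"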

@@ -89,6 +89,7 @@ object CoNLLGenerator
     CoNLLDataset
       .coalesce(1)
       .write
+      .mode("overwrite")
       .format("com.databricks.spark.csv")
       .options(scala.collection.Map("delimiter" -> " ", "emptyValue" -> ""))
       .save(outputPath)
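The added .mode("overwrite") makes the export idempotent: Spark's default save mode is ErrorIfExists, so a second run against the same outputPath previously failed with an AnalysisException. Spelled out with the SaveMode enum (a sketch; df and outputPath are placeholders):

    import org.apache.spark.sql.{DataFrame, SaveMode}

    def save(df: DataFrame, outputPath: String): Unit =
      df.coalesce(1)
        .write
        .mode(SaveMode.Overwrite) // replace any existing output directory
        .option("delimiter", " ")
        .csv(outputPath)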
@@ -35,6 +35,8 @@ trait SparkSessionTest extends BeforeAndAfterAll { this: Suite =>
   val emptyDataSet: Dataset[_] = PipelineModels.dummyDataset
   val pipeline = new Pipeline()

+  println(s"Spark version: ${spark.version}")
+
   override def beforeAll(): Unit = {
     super.beforeAll()
@@ -168,7 +168,7 @@ trait NorvigSweetingBehaviors { this: AnyFlatSpec =>
         "Unknown exception. Please check Spark version for correct handling."
     }

-    assert(caught.getMessage == expectedErrorMessage)
+    assert(caught.getMessage.contains(expectedErrorMessage))
   }
 }
@@ -299,7 +299,7 @@ trait SymmetricDeleteBehaviors {
       spell.fit(trainDataSet)
     }

-    assert(caught.getMessage == expectedErrorMessage)
+    assert(caught.getMessage.contains(expectedErrorMessage))
   }
 }

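Both spell-checker suites relax an exact-match assertion to a substring check, presumably because exception wording can differ between Spark versions (the surrounding test already special-cases messages per version). A minimal ScalaTest sketch of the pattern; the exception type and message here are illustrative:

    import org.scalatest.flatspec.AnyFlatSpec

    class ExampleSpec extends AnyFlatSpec {
      "fit" should "report a helpful error" in {
        val caught = intercept[IllegalArgumentException] {
          throw new IllegalArgumentException("corpus not found (version-specific detail)")
        }
        // A substring match survives version-specific prefixes and suffixes.
        assert(caught.getMessage.contains("corpus not found"))
      }
    }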
@@ -168,7 +168,7 @@ class CoNLLGeneratorTestSpec extends AnyFlatSpec {
     assert(fileContents == testNERText)
   }

-  "The generator" should "work even if token metadata has non-ints" in {
+  "The generator" should "work even if token metadata has non-ints" taggedAs SlowTest in {
     val df = ResourceHelper.spark.read.load(
       "src/test/resources/conllgenerator/conllgenerator_nonint_token_metadata.parquet")
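Tagging the parquet-backed test as SlowTest keeps it out of the fast CI path. With ScalaTest, a tag is just an object naming a fully qualified string; a sketch of how such a tag is defined (the package here is an assumption, the repo's actual tag object may live elsewhere):

    import org.scalatest.Tag

    // Hypothetical definition matching the taggedAs usage above.
    object SlowTest extends Tag("com.johnsnowlabs.tags.SlowTest")

Tagged tests can then be excluded at run time, e.g. sbt "testOnly *CoNLLGeneratorTestSpec -- -l com.johnsnowlabs.tags.SlowTest".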
