SPARKNLP-819 Adding changes to make spark-nlp 3.4.0 default version #13773

Merged
52 changes: 44 additions & 8 deletions .github/workflows/build_and_test.yml
@@ -31,13 +31,13 @@ on:
       - 'main'
 
 jobs:
-  spark33:
+  spark34:
     if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')"
     runs-on: macos-latest
     env:
       TF_CPP_MIN_LOG_LEVEL: 3
       JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC"
-    name: Build and Test on Apache Spark 3.3.x
+    name: Build and Test on Apache Spark 3.4.x
 
     steps:
       - uses: actions/checkout@v3
@@ -54,19 +54,55 @@ jobs:
       - name: Install Python packages (Python 3.7)
         run: |
           python -m pip install --upgrade pip
-          pip install pyspark==3.3.1 numpy pytest
-      - name: Build Spark NLP on Apache Spark 3.3.0
+          pip install pyspark==3.4.0 numpy pytest
+      - name: Build Spark NLP on Apache Spark 3.4.0
         run: |
           brew install sbt
-          sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy
-      - name: Test Spark NLP in Scala - Apache Spark 3.3.x
+          sbt -mem 4096 -Dis_spark34=true clean assemblyAndCopy
+      - name: Test Spark NLP in Scala - Apache Spark 3.4.x
         run: |
           sbt -mem 4096 coverage test
       - name: Upload coverage data to Coveralls
         run: sbt coverageReport coveralls
         env:
           COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          COVERALLS_FLAG_NAME: Apache Spark 3.3.x - Scala 2.12
+          COVERALLS_FLAG_NAME: Apache Spark 3.4.x - Scala 2.12
+      - name: Test Spark NLP in Python - Apache Spark 3.4.x
+        run: |
+          cd python
+          python3.7 -m pytest -v -m fast
+
+  spark33:
+    if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')"
+    runs-on: macos-latest
+    env:
+      TF_CPP_MIN_LOG_LEVEL: 3
+      JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC"
+    name: Build and Test on Apache Spark 3.3.x
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-java@v3
+        with:
+          distribution: 'adopt'
+          java-version: '8'
+          cache: 'sbt'
+      - name: Install Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7.7
+          architecture: x64
+      - name: Install Python packages (Python 3.7)
+        run: |
+          python -m pip install --upgrade pip
+          pip install pyspark==3.3.1 numpy pytest
+      - name: Build Spark NLP on Apache Spark 3.3.1
+        run: |
+          brew install sbt
+          sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy
+      - name: Test Spark NLP in Scala - Apache Spark 3.3.x
+        run: |
+          sbt -mem 4096 test
       - name: Test Spark NLP in Python - Apache Spark 3.3.x
         run: |
           cd python
@@ -99,7 +135,7 @@ jobs:
       - name: Build Spark NLP on Apache Spark 3.2.3
         run: |
           brew install sbt
-          sbt -mem 4096 clean assemblyAndCopy
+          sbt -mem 4096 -Dis_spark32=true clean assemblyAndCopy
       - name: Test Spark NLP in Scala - Apache Spark 3.2.x
         run: |
           sbt -mem 4096 test
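Note: the new spark34 job mirrors the previous spark33 job (which is retained, pinned to PySpark 3.3.1) and takes over coverage reporting, while the Spark 3.2 job now sets its version flag explicitly instead of relying on the old default. The same sbt system properties reproduce these builds locally; a sketch, assuming sbt is installed (commands taken verbatim from the workflow above):

sbt -mem 4096 clean assemblyAndCopy                    # Spark 3.4.x, the new default
sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy  # build against Spark 3.3.x
sbt -mem 4096 -Dis_spark32=true clean assemblyAndCopy  # build against Spark 3.2.x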
15 changes: 10 additions & 5 deletions project/Dependencies.scala
@@ -4,11 +4,12 @@ object Dependencies {
 
   /** ------- Spark version start ------- */
   /* default spark version to base the APIS */
-  val spark33Ver = "3.3.1"
+  val spark34Ver = "3.4.0"
   /* only used in unit tests */
   val spark30Ver = "3.0.3"
   val spark31Ver = "3.1.3"
   val spark32Ver = "3.2.3"
+  val spark33Ver = "3.3.1"
 
   /* required for different hardware */
   val is_gpu: String = System.getProperty("is_gpu", "false")
@@ -20,9 +21,10 @@ object Dependencies {
   val is_spark30: String = System.getProperty("is_spark30", "false")
   val is_spark31: String = System.getProperty("is_spark31", "false")
   val is_spark32: String = System.getProperty("is_spark32", "false")
-  val is_spark33: String = System.getProperty("is_spark33", "true")
+  val is_spark33: String = System.getProperty("is_spark33", "false")
+  val is_spark34: String = System.getProperty("is_spark34", "true")
 
-  val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33)
+  val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33, is_spark34)
 
   /** ------- Spark version end ------- */
 
@@ -43,16 +45,19 @@
       is_spark30: String,
       is_spark31: String,
       is_spark32: String,
-      is_spark33: String): String = {
+      is_spark33: String,
+      is_spark34: String): String = {
     if (is_spark30.equals("true")) {
       spark30Ver
     } else if (is_spark31.equals("true")) {
       spark31Ver
     } else if (is_spark32.equals("true")) {
       spark32Ver
+    } else if (is_spark33.equals("true")) {
+      spark33Ver
     } else {
       /* default spark version */
-      spark33Ver
+      spark34Ver
     }
   }
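Note: getSparkVersion checks the flags in order and returns the first version whose flag is "true"; with no -D flag set, only is_spark34 defaults to "true", so the build now falls through to Spark 3.4.0. The resulting sparkVer then pins the provided Spark artifacts; a minimal sketch of that consumption (an illustrative assumption, the module list is not part of this diff):

// Hypothetical excerpt: how sparkVer typically feeds sbt dependency declarations.
import sbt._

val sparkCore  = "org.apache.spark" %% "spark-core"  % sparkVer % Provided
val sparkMLlib = "org.apache.spark" %% "spark-mllib" % sparkVer % Provided
val sparkSql   = "org.apache.spark" %% "spark-sql"   % sparkVer % Provided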
@@ -17,7 +17,7 @@
 package com.johnsnowlabs.ml.tensorflow.sentencepiece
 
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
-import org.apache.commons.lang.SystemUtils
+import org.apache.commons.lang3.SystemUtils
 import org.apache.spark.SparkFiles
 import org.apache.spark.sql.SparkSession
 import org.tensorflow.TensorFlow
@@ -18,7 +18,7 @@ package com.johnsnowlabs.nlp.annotators
 
 import com.johnsnowlabs.nlp.util.regex.RuleFactory
 import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate}
-import org.apache.commons.lang.time.DateUtils
+import org.apache.commons.lang3.time.DateUtils
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
 
 import java.text.SimpleDateFormat
@@ -17,7 +17,7 @@
 package com.johnsnowlabs.nlp.annotators.ner.dl
 
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
-import org.apache.commons.lang.SystemUtils
+import org.apache.commons.lang3.SystemUtils
 import org.apache.spark.SparkFiles
 import org.apache.spark.sql.SparkSession
 import org.tensorflow.TensorFlow
@@ -27,7 +27,7 @@ import com.johnsnowlabs.nlp.util.io.{OutputHelper, ResourceHelper}
 import com.johnsnowlabs.nlp.{AnnotatorApproach, AnnotatorType, ParamsAndFeaturesWritable}
 import com.johnsnowlabs.storage.HasStorageRef
 import org.apache.commons.io.IOUtils
-import org.apache.commons.lang.SystemUtils
+import org.apache.commons.lang3.SystemUtils
 import org.apache.spark.ml.PipelineModel
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
@@ -17,7 +17,7 @@
 package com.johnsnowlabs.nlp.annotators.tokenizer.bpe
 
 import com.johnsnowlabs.nlp.annotators.common.{IndexedToken, Sentence, TokenPiece}
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 
 import scala.collection.mutable
 import scala.collection.mutable.ListBuffer
@@ -16,7 +16,7 @@
 
 package com.johnsnowlabs.nlp.annotators.tokenizer.normalizer
 
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 
 import scala.util.matching.Regex
 
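Note: the six import changes above are one migration. Spark 3.4 no longer appears to pull the legacy commons-lang 2.x artifact onto the classpath, so org.apache.commons.lang.* must move to the commons-lang3 equivalents. For the helpers touched here the lang3 classes are drop-in replacements; a minimal sketch, assuming commons-lang3 is available (Spark provides it transitively):

import org.apache.commons.lang3.{StringUtils, SystemUtils}
import org.apache.commons.lang3.time.DateUtils
import java.util.{Calendar, Date}

object Lang3Sketch extends App {
  println(SystemUtils.IS_OS_WINDOWS)                             // OS checks, as in the TensorFlow loaders
  println(StringUtils.isNotBlank("  token  "))                   // string helpers, as in the tokenizers
  println(DateUtils.truncate(new Date(), Calendar.DAY_OF_MONTH)) // date helpers, as in the date annotators
}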
3 changes: 2 additions & 1 deletion src/main/scala/com/johnsnowlabs/util/CoNLLGenerator.scala
@@ -20,7 +20,7 @@ import org.apache.spark.ml.PipelineModel
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
 
-import org.apache.commons.lang.StringEscapeUtils.escapeJava
+import org.apache.commons.text.StringEscapeUtils.escapeJava
 import scala.collection.mutable.ArrayBuffer
 import scala.util.Try
 
@@ -89,6 +89,7 @@ object CoNLLGenerator {
     CoNLLDataset
       .coalesce(1)
       .write
+      .mode("overwrite")
       .format("com.databricks.spark.csv")
       .options(scala.collection.Map("delimiter" -> " ", "emptyValue" -> ""))
       .save(outputPath)
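Note: CoNLLGenerator picks up two fixes. escapeJava moves to commons-text, its current home, and the writer sets an explicit save mode so regenerating a file no longer fails when the output directory already exists (Spark's default mode, ErrorIfExists, throws in that case). A minimal sketch of the write behavior, with a hypothetical DataFrame df and output path:

import org.apache.spark.sql.{DataFrame, SaveMode}

// Hypothetical df and outputPath, for illustration only.
def writeConll(df: DataFrame, outputPath: String): Unit =
  df.coalesce(1)
    .write
    .mode(SaveMode.Overwrite) // same effect as .mode("overwrite") in the diff above
    .options(Map("delimiter" -> " ", "emptyValue" -> ""))
    .csv(outputPath)          // built-in CSV source; the diff keeps the legacy com.databricks.spark.csv name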
@@ -35,6 +35,8 @@ trait SparkSessionTest extends BeforeAndAfterAll { this: Suite =>
   val emptyDataSet: Dataset[_] = PipelineModels.dummyDataset
   val pipeline = new Pipeline()
 
+  println(s"Spark version: ${spark.version}")
+
   override def beforeAll(): Unit = {
     super.beforeAll()
 
@@ -168,7 +168,7 @@ trait NorvigSweetingBehaviors { this: AnyFlatSpec =>
           "Unknown exception. Please check Spark version for correct handling."
       }
 
-    assert(caught.getMessage == expectedErrorMessage)
+    assert(caught.getMessage.contains(expectedErrorMessage))
   }
 }
 
@@ -299,7 +299,7 @@ trait SymmetricDeleteBehaviors {
       spell.fit(trainDataSet)
     }
 
-    assert(caught.getMessage == expectedErrorMessage)
+    assert(caught.getMessage.contains(expectedErrorMessage))
   }
 }
 
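Note: both spell-checker suites relax their assertions from exact equality to a substring check. Exception texts vary between Spark releases (3.4 in particular tends to wrap messages with extra context such as error-class prefixes), so matching on the stable core of the message keeps the tests portable across the version matrix. The pattern, as a self-contained ScalaTest sketch (the exception and message here are made up for illustration):

import org.scalatest.flatspec.AnyFlatSpec

class MessageCheckSpec extends AnyFlatSpec {
  "a failing operation" should "report the expected cause on any Spark version" in {
    val caught = intercept[IllegalArgumentException] {
      throw new IllegalArgumentException("[SOME_ERROR_CLASS] expected core message, plus version-specific details")
    }
    // A substring check survives prefixes and suffixes that differ between versions:
    assert(caught.getMessage.contains("expected core message"))
  }
}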
@@ -168,7 +168,7 @@ class CoNLLGeneratorTestSpec extends AnyFlatSpec {
     assert(fileContents == testNERText)
   }
 
-  "The generator" should "work even if token metadata has non-ints" in {
+  "The generator" should "work even if token metadata has non-ints" taggedAs SlowTest in {
     val df = ResourceHelper.spark.read.load(
       "src/test/resources/conllgenerator/conllgenerator_nonint_token_metadata.parquet")
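Note: tagging the non-int-metadata case as SlowTest moves it out of the fast suite, so it runs only when slow tests are explicitly selected. SlowTest is a ScalaTest Tag defined in the project's test sources; a sketch of the mechanism, with the fully qualified tag name assumed:

import org.scalatest.Tag
import org.scalatest.flatspec.AnyFlatSpec

// Assumed definition; Spark NLP keeps a tag like this in its test utilities.
object SlowTest extends Tag("com.johnsnowlabs.tags.SlowTest")

class TaggingSketch extends AnyFlatSpec {
  "an expensive case" should "run only when slow tests are included" taggedAs SlowTest in {
    assert(1 + 1 == 2)
  }
}

// Excluded at the sbt prompt with ScalaTest's -l runner flag, e.g.:
//   testOnly *TaggingSketch -- -l com.johnsnowlabs.tags.SlowTest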