From a56ad800561f18b1d502c072d3382e3ee2fee061 Mon Sep 17 00:00:00 2001
From: Danilo Burbano
Date: Thu, 27 Apr 2023 08:36:00 -0500
Subject: [PATCH 1/2] SPARKNLP-819 Adding changes to make Spark 3.4.0 the
 default version

---
 project/Dependencies.scala                      | 15 ++++++++++-----
 .../sentencepiece/LoadSentencepiece.scala       |  2 +-
 .../nlp/annotators/MultiDateMatcher.scala       |  2 +-
 .../nlp/annotators/ner/dl/LoadsContrib.scala    |  2 +-
 .../nlp/annotators/ner/dl/NerDLApproach.scala   |  2 +-
 .../annotators/tokenizer/bpe/BpeTokenizer.scala |  2 +-
 .../normalizer/MosesPunctNormalizer.scala       |  2 +-
 .../com/johnsnowlabs/util/CoNLLGenerator.scala  |  3 ++-
 .../nlp/annotators/SparkSessionTest.scala       |  2 ++
 .../spell/norvig/NorvigSweetingBehaviors.scala  |  2 +-
 .../symmetric/SymmetricDeleteBehaviors.scala    |  2 +-
 .../nlp/util/CoNLLGeneratorTestSpec.scala       |  2 +-
 12 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/project/Dependencies.scala b/project/Dependencies.scala
index 26607629fcbd47..2956315905c7f9 100644
--- a/project/Dependencies.scala
+++ b/project/Dependencies.scala
@@ -4,11 +4,12 @@ object Dependencies {
 
   /** ------- Spark version start ------- */
   /* default spark version to base the APIS */
-  val spark33Ver = "3.3.1"
+  val spark34Ver = "3.4.0"
   /* only used in unit tests */
   val spark30Ver = "3.0.3"
   val spark31Ver = "3.1.3"
   val spark32Ver = "3.2.3"
+  val spark33Ver = "3.3.1"
 
   /* required for different hardware */
   val is_gpu: String = System.getProperty("is_gpu", "false")
@@ -20,9 +21,10 @@ object Dependencies {
   val is_spark30: String = System.getProperty("is_spark30", "false")
   val is_spark31: String = System.getProperty("is_spark31", "false")
   val is_spark32: String = System.getProperty("is_spark32", "false")
-  val is_spark33: String = System.getProperty("is_spark33", "true")
+  val is_spark33: String = System.getProperty("is_spark33", "false")
+  val is_spark34: String = System.getProperty("is_spark33", "true")
 
-  val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33)
+  val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33, is_spark34)
 
   /** ------- Spark version end ------- */
@@ -43,16 +45,19 @@ object Dependencies {
       is_spark30: String,
       is_spark31: String,
       is_spark32: String,
-      is_spark33: String): String = {
+      is_spark33: String,
+      is_spark34: String): String = {
     if (is_spark30.equals("true")) {
       spark30Ver
     } else if (is_spark31.equals("true")) {
       spark31Ver
     } else if (is_spark32.equals("true")) {
       spark32Ver
+    } else if (is_spark32.equals("true")) {
+      spark33Ver
     } else {
       /* default spark version */
-      spark33Ver
+      spark34Ver
     }
   }
 
diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/sentencepiece/LoadSentencepiece.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/sentencepiece/LoadSentencepiece.scala
index 815bb126449a02..4e089d7edecf3c 100644
--- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/sentencepiece/LoadSentencepiece.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/sentencepiece/LoadSentencepiece.scala
@@ -17,7 +17,7 @@
 package com.johnsnowlabs.ml.tensorflow.sentencepiece
 
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
-import org.apache.commons.lang.SystemUtils
+import org.apache.commons.lang3.SystemUtils
 import org.apache.spark.SparkFiles
 import org.apache.spark.sql.SparkSession
 import org.tensorflow.TensorFlow
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala
index a2c31e0ddb82cb..53561604988481 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala
@@ -18,7 +18,7 @@ package com.johnsnowlabs.nlp.annotators
 
 import com.johnsnowlabs.nlp.util.regex.RuleFactory
 import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate}
-import org.apache.commons.lang.time.DateUtils
+import org.apache.commons.lang3.time.DateUtils
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
 
 import java.text.SimpleDateFormat
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/LoadsContrib.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/LoadsContrib.scala
index 702e0dd0e9a991..af5e16fd751a11 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/LoadsContrib.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/LoadsContrib.scala
@@ -17,7 +17,7 @@
 package com.johnsnowlabs.nlp.annotators.ner.dl
 
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
-import org.apache.commons.lang.SystemUtils
+import org.apache.commons.lang3.SystemUtils
 import org.apache.spark.SparkFiles
 import org.apache.spark.sql.SparkSession
 import org.tensorflow.TensorFlow
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala
index 93b2ca2325a318..3dc6f1012656f5 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala
@@ -27,7 +27,7 @@ import com.johnsnowlabs.nlp.util.io.{OutputHelper, ResourceHelper}
 import com.johnsnowlabs.nlp.{AnnotatorApproach, AnnotatorType, ParamsAndFeaturesWritable}
 import com.johnsnowlabs.storage.HasStorageRef
 import org.apache.commons.io.IOUtils
-import org.apache.commons.lang.SystemUtils
+import org.apache.commons.lang3.SystemUtils
 import org.apache.spark.ml.PipelineModel
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
index c777d12f475a65..42d78d27687891 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
@@ -17,7 +17,7 @@
 package com.johnsnowlabs.nlp.annotators.tokenizer.bpe
 
 import com.johnsnowlabs.nlp.annotators.common.{IndexedToken, Sentence, TokenPiece}
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 
 import scala.collection.mutable
 import scala.collection.mutable.ListBuffer
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/normalizer/MosesPunctNormalizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/normalizer/MosesPunctNormalizer.scala
index 38ee5e8fb6df34..27d8df3158ed05 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/normalizer/MosesPunctNormalizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/normalizer/MosesPunctNormalizer.scala
@@ -16,7 +16,7 @@
 
 package com.johnsnowlabs.nlp.annotators.tokenizer.normalizer
 
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 
 import scala.util.matching.Regex
 
diff --git a/src/main/scala/com/johnsnowlabs/util/CoNLLGenerator.scala b/src/main/scala/com/johnsnowlabs/util/CoNLLGenerator.scala
index 48055f8db50668..188d954a6c8572 100644
--- a/src/main/scala/com/johnsnowlabs/util/CoNLLGenerator.scala
+++ b/src/main/scala/com/johnsnowlabs/util/CoNLLGenerator.scala
@@ -20,7 +20,7 @@ import org.apache.spark.ml.PipelineModel
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
 
-import org.apache.commons.lang.StringEscapeUtils.escapeJava
+import org.apache.commons.text.StringEscapeUtils.escapeJava
 
 import scala.collection.mutable.ArrayBuffer
 import scala.util.Try
@@ -89,6 +89,7 @@ object CoNLLGenerator {
     CoNLLDataset
       .coalesce(1)
       .write
+      .mode("overwrite")
       .format("com.databricks.spark.csv")
       .options(scala.collection.Map("delimiter" -> " ", "emptyValue" -> ""))
       .save(outputPath)
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/SparkSessionTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/SparkSessionTest.scala
index bb8f4d291ef18d..1053fffe4309f3 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/SparkSessionTest.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/SparkSessionTest.scala
@@ -35,6 +35,8 @@ trait SparkSessionTest extends BeforeAndAfterAll { this: Suite =>
   val emptyDataSet: Dataset[_] = PipelineModels.dummyDataset
   val pipeline = new Pipeline()
 
+  println(s"Spark version: ${spark.version}")
+
   override def beforeAll(): Unit = {
     super.beforeAll()
 
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala
index 9db12399c79392..dcba5ea98928cf 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala
@@ -168,7 +168,7 @@ trait NorvigSweetingBehaviors { this: AnyFlatSpec =>
             "Unknown exception. Please check Spark version for correct handling."
       }
 
-      assert(caught.getMessage == expectedErrorMessage)
+      assert(caught.getMessage.contains(expectedErrorMessage))
     }
   }
 
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteBehaviors.scala
index 2f5afaedd71d58..77bd4108f5b8ee 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteBehaviors.scala
@@ -299,7 +299,7 @@ trait SymmetricDeleteBehaviors {
         spell.fit(trainDataSet)
       }
 
-      assert(caught.getMessage == expectedErrorMessage)
+      assert(caught.getMessage.contains(expectedErrorMessage))
     }
   }
 
diff --git a/src/test/scala/com/johnsnowlabs/nlp/util/CoNLLGeneratorTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/util/CoNLLGeneratorTestSpec.scala
index 9a9c2f18134328..be01a7278b3ac8 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/util/CoNLLGeneratorTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/util/CoNLLGeneratorTestSpec.scala
@@ -168,7 +168,7 @@ class CoNLLGeneratorTestSpec extends AnyFlatSpec {
     assert(fileContents == testNERText)
   }
 
-  "The generator" should "work even if token metadata has non-ints" in {
+  "The generator" should "work even if token metadata has non-ints" taggedAs SlowTest in {
     val df = ResourceHelper.spark.read.load(
       "src/test/resources/conllgenerator/conllgenerator_nonint_token_metadata.parquet")

From 52555387b612c19939f6e6cd604f366e92c5d4c3 Mon Sep 17 00:00:00 2001
From: Maziyar Panahi
Date: Thu, 27 Apr 2023 15:49:26 +0200
Subject: [PATCH 2/2] Add unit test for Spark 3.4

- fix GA job without spark version
- fix spark34 references in build.sbt
---
 .github/workflows/build_and_test.yml | 52 +++++++++++++++++++++++-----
 project/Dependencies.scala           |  4 ++--
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 243191303fced0..07868e9b9ddaf8 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -31,13 +31,13 @@ on:
       - 'main'
 
 jobs:
-  spark33:
+  spark34:
     if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')"
     runs-on: macos-latest
     env:
       TF_CPP_MIN_LOG_LEVEL: 3
       JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC"
-    name: Build and Test on Apache Spark 3.3.x
+    name: Build and Test on Apache Spark 3.4.x
 
     steps:
       - uses: actions/checkout@v3
@@ -44,29 +44,65 @@ jobs:
       - uses: actions/setup-java@v3
         with:
           distribution: 'adopt'
           java-version: '8'
           cache: 'sbt'
       - name: Install Python 3.7
         uses: actions/setup-python@v2
         with:
           python-version: 3.7.7
           architecture: x64
       - name: Install Python packages (Python 3.7)
         run: |
           python -m pip install --upgrade pip
-          pip install pyspark==3.3.1 numpy pytest
+          pip install pyspark==3.4.0 numpy pytest
-      - name: Build Spark NLP on Apache Spark 3.3.0
+      - name: Build Spark NLP on Apache Spark 3.4.0
         run: |
           brew install sbt
-          sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy
+          sbt -mem 4096 -Dis_spark34=true clean assemblyAndCopy
-      - name: Test Spark NLP in Scala - Apache Spark 3.3.x
+      - name: Test Spark NLP in Scala - Apache Spark 3.4.x
         run: |
           sbt -mem 4096 coverage test
       - name: Upload coverage data to Coveralls
         run: sbt coverageReport coveralls
         env:
           COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          COVERALLS_FLAG_NAME: Apache Spark 3.3.x - Scala 2.12
+          COVERALLS_FLAG_NAME: Apache Spark 3.4.x - Scala 2.12
+      - name: Test Spark NLP in Python - Apache Spark 3.4.x
+        run: |
+          cd python
+          python3.7 -m pytest -v -m fast
+
+  spark33:
+    if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')"
+    runs-on: macos-latest
+    env:
+      TF_CPP_MIN_LOG_LEVEL: 3
+      JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC"
+    name: Build and Test on Apache Spark 3.3.x
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-java@v3
+        with:
+          distribution: 'adopt'
+          java-version: '8'
+          cache: 'sbt'
+      - name: Install Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7.7
+          architecture: x64
+      - name: Install Python packages (Python 3.7)
+        run: |
+          python -m pip install --upgrade pip
+          pip install pyspark==3.3.1 numpy pytest
+      - name: Build Spark NLP on Apache Spark 3.3.1
+        run: |
+          brew install sbt
+          sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy
+      - name: Test Spark NLP in Scala - Apache Spark 3.3.x
+        run: |
+          sbt -mem 4096 test
       - name: Test Spark NLP in Python - Apache Spark 3.3.x
         run: |
           cd python
@@ -99,7 +135,7 @@ jobs:
       - name: Build Spark NLP on Apache Spark 3.2.3
         run: |
           brew install sbt
-          sbt -mem 4096 clean assemblyAndCopy
+          sbt -mem 4096 -Dis_spark32=true clean assemblyAndCopy
       - name: Test Spark NLP in Scala - Apache Spark 3.2.x
         run: |
           sbt -mem 4096 test
diff --git a/project/Dependencies.scala b/project/Dependencies.scala
index 2956315905c7f9..f36d7f528d3c54 100644
--- a/project/Dependencies.scala
+++ b/project/Dependencies.scala
@@ -22,7 +22,7 @@ object Dependencies {
   val is_spark31: String = System.getProperty("is_spark31", "false")
   val is_spark32: String = System.getProperty("is_spark32", "false")
   val is_spark33: String = System.getProperty("is_spark33", "false")
-  val is_spark34: String = System.getProperty("is_spark33", "true")
+  val is_spark34: String = System.getProperty("is_spark34", "true")
 
   val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33, is_spark34)
 
@@ -53,7 +53,7 @@ object Dependencies {
       spark31Ver
     } else if (is_spark32.equals("true")) {
       spark32Ver
-    } else if (is_spark32.equals("true")) {
+    } else if (is_spark33.equals("true")) {
       spark33Ver
     } else {
       /* default spark version */