
Commit

Merge pull request #14112 from JohnSnowLabs/release/521-release-candidate

Release/521 release candidate
maziyarpanahi authored Dec 27, 2023
2 parents e9099b0 + 3a9234b commit 02ef1a8
Showing 1,522 changed files with 25,058 additions and 5,544 deletions.
108 changes: 18 additions & 90 deletions .github/workflows/build_and_test.yml
@@ -71,14 +71,13 @@ jobs:
        run: |
          cd python
          python3.7 -m pytest -v -m fast
-  spark33:
+  spark35:
     if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')"
     runs-on: macos-latest
     env:
       TF_CPP_MIN_LOG_LEVEL: 3
       JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC"
-    name: Build and Test on Apache Spark 3.3.x
+    name: Build and Test on Apache Spark 3.5.x

     steps:
       - uses: actions/checkout@v3
@@ -87,34 +86,34 @@ jobs:
          distribution: 'adopt'
          java-version: '8'
          cache: 'sbt'
-      - name: Install Python 3.7
+      - name: Install Python 3.10
        uses: actions/setup-python@v2
        with:
-          python-version: 3.7.7
+          python-version: 3.10.12
          architecture: x64
-      - name: Install Python packages (Python 3.7)
+      - name: Install Python packages (Python 3.10)
        run: |
          python -m pip install --upgrade pip
-          pip install pyspark==3.3.1 numpy pytest
-      - name: Build Spark NLP on Apache Spark 3.3.1
+          pip install pyspark==3.5.0 numpy pytest
+      - name: Build Spark NLP on Apache Spark 3.5.0
        run: |
          brew install sbt
-          sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy
-      - name: Test Spark NLP in Scala - Apache Spark 3.3.x
+          sbt -mem 4096 -Dis_spark35=true clean assemblyAndCopy
+      - name: Test Spark NLP in Scala - Apache Spark 3.5.x
        run: |
          sbt -mem 4096 test
-      - name: Test Spark NLP in Python - Apache Spark 3.3.x
+      - name: Test Spark NLP in Python - Apache Spark 3.5.x
        run: |
          cd python
-          python3.7 -m pytest -v -m fast
+          python3.10 -m pytest -v -m fast
-  spark32:
+  spark33:
     if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')"
     runs-on: macos-latest
     env:
       TF_CPP_MIN_LOG_LEVEL: 3
       JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC"
-    name: Build and Test on Apache Spark 3.2.x
+    name: Build and Test on Apache Spark 3.3.x

     steps:
       - uses: actions/checkout@v3
@@ -131,87 +130,16 @@ jobs:
      - name: Install Python packages (Python 3.7)
        run: |
          python -m pip install --upgrade pip
-          pip install pyspark==3.2.3 numpy pytest
-      - name: Build Spark NLP on Apache Spark 3.2.3
+          pip install pyspark==3.3.1 numpy pytest
+      - name: Build Spark NLP on Apache Spark 3.3.1
        run: |
          brew install sbt
-          sbt -mem 4096 -Dis_spark32=true clean assemblyAndCopy
-      - name: Test Spark NLP in Scala - Apache Spark 3.2.x
+          sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy
+      - name: Test Spark NLP in Scala - Apache Spark 3.3.x
        run: |
          sbt -mem 4096 test
-      - name: Test Spark NLP in Python - Apache Spark 3.2.x
+      - name: Test Spark NLP in Python - Apache Spark 3.3.x
        run: |
          cd python
          python3.7 -m pytest -v -m fast
-#  spark31:
-#    if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')"
-#    runs-on: macos-latest
-#    env:
-#      TF_CPP_MIN_LOG_LEVEL: 3
-#      JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC"
-#    name: Build and Test on Apache Spark 3.1.x
-
-#    steps:
-#      - uses: actions/checkout@v3
-#      - uses: actions/setup-java@v3
-#        with:
-#          distribution: 'adopt'
-#          java-version: '8'
-#          cache: 'sbt'
-#      - name: Install Python 3.7
-#        uses: actions/setup-python@v2
-#        with:
-#          python-version: 3.7.7
-#          architecture: x64
-#      - name: Install Python packages (Python 3.7)
-#        run: |
-#          python -m pip install --upgrade pip
-#          pip install pyspark==3.1.3 numpy pytest
-#      - name: Build Spark NLP on Apache Spark 3.1.x
-#        run: |
-#          brew install sbt
-#          sbt -mem 4096 -Dis_spark31=true clean assemblyAndCopy
-#      - name: Test Spark NLP in Scala - Apache Spark 3.1.x
-#        run: |
-#          sbt -mem 4096 test
-#      - name: Test Spark NLP in Python - Apache Spark 3.1.x
-#        run: |
-#          cd python
-#          python3.7 -m pytest -v -m fast
-
-#  spark30:
-#    if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')"
-#    runs-on: macos-latest
-#    env:
-#      TF_CPP_MIN_LOG_LEVEL: 3
-#      JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC"
-#    name: Build and Test on Apache Spark 3.0.x
-
-#    steps:
-#      - uses: actions/checkout@v3
-#      - uses: actions/setup-java@v3
-#        with:
-#          distribution: 'adopt'
-#          java-version: '8'
-#          cache: 'sbt'
-#      - name: Install Python 3.7
-#        uses: actions/setup-python@v2
-#        with:
-#          python-version: 3.7.7
-#          architecture: x64
-#      - name: Install Python packages (Python 3.7)
-#        run: |
-#          python -m pip install --upgrade pip
-#          pip install pyspark==3.0.3 numpy pytest
-#      - name: Build Spark NLP on Apache Spark 3.0.x
-#        run: |
-#          brew install sbt
-#          sbt -mem 4096 -Dis_spark30=true clean assemblyAndCopy
-#      - name: Test Spark NLP in Scala - Apache Spark 3.0.x
-#        run: |
-#          sbt -mem 4096 test
-#      - name: Test Spark NLP in Python - Apache Spark 3.0.x
-#        run: |
-#          cd python
-#          python3.7 -m pytest -v -m fast
24 changes: 24 additions & 0 deletions CHANGELOG
@@ -1,3 +1,27 @@
========
5.2.1
========
----------------
New Features & Enhancements
----------------
* Add support for the Apache Spark and PySpark 3.5 major release
* Support Databricks Runtimes 14.0, 14.1, 14.2, 14.0 ML, 14.1 ML, 14.2 ML, 14.0 GPU, 14.1 GPU, and 14.2 GPU
* **NEW:** Introducing the `BGEEmbeddings` annotator, which integrates BERT-based `BGE` models into Spark NLP. It generates dense vectors suited to `retrieval`, `classification`, `clustering`, and `semantic search`, and is compatible with the `vector databases` used with `Large Language Models (LLMs)` (see the usage sketch after this list)
* **NEW:** Introducing support for ONNX Runtime in the DeBertaForTokenClassification annotator
* **NEW:** Introducing support for ONNX Runtime in the DeBertaForSequenceClassification annotator (see the import sketch after this list)
* **NEW:** Introducing support for ONNX Runtime in the DeBertaForQuestionAnswering annotator
* Add a new notebook showing how to import any model from the `T5` family into Spark NLP in TensorFlow format
* Add a new notebook showing how to import any model from the `T5` family into Spark NLP in ONNX format
* Add a new notebook showing how to import any model from the `MarianNMT` family into Spark NLP in ONNX format
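A minimal, untested sketch of the two headline items above: starting Spark NLP 5.2.1 on PySpark 3.5 and running the new `BGEEmbeddings` annotator in a small pipeline. The pretrained model name `"bge_base"` and the output column names are assumptions for illustration; substitute any BGE model published on the Models Hub.

```python
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import BGEEmbeddings
from pyspark.ml import Pipeline

# Start a Spark NLP session; with 5.2.1 this should also work on PySpark 3.5.x.
spark = sparknlp.start()

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# "bge_base" is an assumed pretrained model name, used here only for illustration.
embeddings = BGEEmbeddings.pretrained("bge_base", "en") \
    .setInputCols(["document"]) \
    .setOutputCol("bge_embeddings")

pipeline = Pipeline(stages=[document_assembler, embeddings])

data = spark.createDataFrame(
    [["Spark NLP 5.2.1 adds BGE embeddings for retrieval and semantic search."]]
).toDF("text")

# Fit and transform, then inspect the dense vectors produced per document.
result = pipeline.fit(data).transform(data)
result.selectExpr("explode(bge_embeddings.embeddings) AS embedding").show(truncate=80)
```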
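The ONNX Runtime entries above concern model import: DeBerta classifiers exported to ONNX (for example with Hugging Face Optimum) can now back these annotators. The sketch below is hedged: the export path is hypothetical, and the `loadSavedModel` usage follows the pattern of the import notebooks referenced above rather than a verified listing of the 5.2.1 API.

```python
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, DeBertaForSequenceClassification
from pyspark.ml import Pipeline

spark = sparknlp.start()

# Hypothetical path to a DeBERTa sequence classifier exported to ONNX;
# adjust to point at your own export directory.
onnx_export_path = "./onnx_models/deberta_v3_base_sequence_classifier"

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# loadSavedModel imports the exported model; the assumption here is that an
# ONNX export in the folder is picked up and served via ONNX Runtime.
classifier = DeBertaForSequenceClassification \
    .loadSavedModel(onnx_export_path, spark) \
    .setInputCols(["document", "token"]) \
    .setOutputCol("class")

pipeline = Pipeline(stages=[document_assembler, tokenizer, classifier])
```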


----------------
Bug Fixes
----------------
* Fix a serialization issue that prevented the `DocumentTokenSplitter` annotator from being saved and loaded in a Pipeline
* Fix a serialization issue that prevented the `DocumentCharacterTextSplitter` annotator from being saved and loaded in a Pipeline


========
5.2.0
========
