diff --git a/examples/README.md b/examples/README.md index a9004f047a855d..40b40b0d306f14 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,10 +1,71 @@ # Spark NLP Examples -Under construction +This is the directory for examples on how to use Spark NLP in various environments. -Required maintained examples +These include examples for Python, Scala, Java and Docker. -- Python -- Scala -- Java -- Docker +For an introduction to using Spark NLP, take a look at the [Quick +Start](python/quick_start.ipynb). If you are planning to use Spark NLP on Google Colab, +see [Quick Start on Google Colab](python/quick_start_google_colab.ipynb). The notebook +[Spark NLP Basics](python/annotation/text/english/spark-nlp-basics) covers the basics of +Spark NLP. + +For more use-cases and advanced examples, take a look at the following table of contents. + +## Table Of Contents + +- [Python Examples](python) + - [Using Annotators](python/annotation) + - [Audio Processing](python/annotation/audio) + - [Image Processing](python/annotation/image) + - [Text Processing](python/annotation/text) + - [Chinese](python/annotation/text/chinese) + - [English](python/annotation/text/english) + - [Assembling Documents](python/annotation/text/english/document-assembler) + - [Assembling Tokens to Documents](python/annotation/text/english/token-assembler) + - [Chunking](python/annotation/text/english/chunking) + - [Co-reference Resolution](python/annotation/text/english/coreference-resolution) + - [Document Normalization](python/annotation/text/english/document-normalizer) + - [Embeddings](python/annotation/text/english/embeddings) + - [Graph Extraction](python/annotation/text/english/graph-extraction) + - [Keyword Extraction](python/annotation/text/english/keyword-extraction) + - [Language Detection](python/annotation/text/english/language-detection) + - [Matching Text Using Regex](python/annotation/text/english/regex-matcher) + - [Model
Downloader](python/annotation/text/english/model-downloader) + - [Named Entity Recognition](python/annotation/text/english/named-entity-recognition) + - [Pretrained Pipelines](python/annotation/text/english/pretrained-pipelines) + - [Question Answering](python/annotation/text/english/question-answering) + - [Sentence Detection](python/annotation/text/english/sentence-detection) + - [Sentiment Detection](python/annotation/text/english/sentiment-detection) + - [Stemming](python/annotation/text/english/stemmer) + - [Stop Words Cleaning](python/annotation/text/english/stop-words) + - [Text Matching](python/annotation/text/english/text-matcher-pipeline) + - [Text Similarity](python/annotation/text/english/text-similarity) + - [Tokenization Using Regex](python/annotation/text/english/regex-tokenizer) + - [French](python/annotation/text/french) + - [German](python/annotation/text/german) + - [Italian](python/annotation/text/italian) + - [Multilingual](python/annotation/text/multilingual) + - [Portuguese](python/annotation/text/portuguese) + - [Spanish](python/annotation/text/spanish) + - [Training Annotators](python/training) + - [Chinese](python/training/chinese) + - [English](python/training/english) + - [Document Embeddings with Doc2Vec](python/training/english/doc2vec) + - [Matching Entities with EntityRuler](python/training/english/entity-ruler) + - [Named Entity Recognition with CRF](python/training/english/crf-ner) + - [Named Entity Recognition with Deep Learning](python/training/english/dl-ner) + - [Creating NerDL Graphs](python/training/english/dl-ner/nerdl-graph) + - [Sentiment Analysis](python/training/english/sentiment-detection) + - [Text Classification](python/training/english/classification) + - [Word Embeddings with Word2Vec](python/training/english/word2vec) + - [French](python/training/french) + - [Italian](python/training/italian) + - [Transformers in Spark NLP](python/transformers) + - [Logging](python/logging) +- [Scala Examples](scala) + - [Training
Annotators](scala/training) + - [Using Annotators](scala/annotation) +- [Java Examples](java) +- [Spark NLP Setup with Docker](docker) +- [Utilities](util) diff --git a/examples/docker/README.md b/examples/docker/README.md new file mode 100644 index 00000000000000..353ecdc90176b5 --- /dev/null +++ b/examples/docker/README.md @@ -0,0 +1,95 @@ +# Running Spark NLP in Docker + +These example Dockerfiles can get you started with using Spark NLP in a Docker +container. + +The following examples set up Jupyter and Scala shells. If you want to run a shell +inside the containers instead, you can specify `bash` at the end of the `docker run` +commands. + +## Jupyter Notebook (CPU) + +The Dockerfile [SparkNLP-CPU.Dockerfile](SparkNLP-CPU.Dockerfile) sets up a docker +container with Jupyter Notebook. It is based on the official [Jupyter Docker +Images](https://jupyter-docker-stacks.readthedocs.io/en/latest/). To run the notebook on +the default port 8888, we can run + +```bash +# Build the Docker Image +docker build -f SparkNLP-CPU.Dockerfile -t sparknlp:latest . + +# Run the container and mount the current directory +docker run -it --name sparknlp-container \ + -p 8888:8888 \ + -v "${PWD}":/home/johnsnow/work \ + sparknlp:latest +``` + +### With GPU Support + +If you have a compatible NVIDIA GPU, you can use it to leverage better performance on our +machine learning models. Docker provides support for GPU accelerated containers with +[nvidia-docker](https://github.com/NVIDIA/nvidia-docker). The linked repository contains +instructions on how to set it up for your system. (Note that on Windows, using WSL 2 +with Docker is +[recommended](https://www.docker.com/blog/wsl-2-gpu-support-for-docker-desktop-on-nvidia-gpus/)) + +After setting it up, we can use the Dockerfile +[SparkNLP-GPU.Dockerfile](SparkNLP-GPU.Dockerfile) to create an image with CUDA +support. Containers based on this image will then have access to Spark NLP with GPU +acceleration.
+ +The commands to set it up could look like this: + +```bash +# Build the image +docker build -f SparkNLP-GPU.Dockerfile -t sparknlp-gpu:latest . + +# Start a container with GPU support and mount the current folder +docker run -it --init --name sparknlp-gpu-container \ + -p 8888:8888 \ + -v "${PWD}":/home/johnsnow/work \ + --gpus all \ + --ipc=host \ + sparknlp-gpu:latest +``` + +*NOTE*: After running the container, don't forget to start Spark NLP with +`sparknlp.start(gpu=True)`! This will set up the right dependencies in Spark. + +## Scala Spark Shell + +To run Spark NLP in a Scala Spark Shell, we can use the same Dockerfile from Section +[Jupyter Notebook (CPU)](#jupyter-notebook-cpu). However, instead of using the default +entrypoint, we can specify the spark-shell as the command: + +```bash +# Run the container, mount the current directory and run spark-shell with Spark NLP +docker run -it --name sparknlp-container \ + -v "${PWD}":/home/johnsnow/work \ + sparknlp:latest \ + /usr/local/spark/bin/spark-shell \ + --conf "spark.driver.memory"="4g" \ + --conf "spark.serializer"="org.apache.spark.serializer.KryoSerializer" \ + --conf "spark.kryoserializer.buffer.max"="2000M" \ + --conf "spark.driver.maxResultSize"="0" \ + --packages "com.johnsnowlabs.nlp:spark-nlp_2.12:4.3.1" +``` + +To run the shell with GPU support, we use the image from [Jupyter Notebook with GPU +support](#with-gpu-support) and specify the correct package (`spark-nlp-gpu`). 
+ +```bash +# Run the container, mount the current directory and run spark-shell with Spark NLP GPU +docker run -it --name sparknlp-container \ + -v "${PWD}":/home/johnsnow/work \ + --gpus all \ + --ipc=host \ + sparknlp-gpu:latest \ + /usr/local/bin/spark-shell \ + --conf "spark.driver.memory"="4g" \ + --conf "spark.serializer"="org.apache.spark.serializer.KryoSerializer" \ + --conf "spark.kryoserializer.buffer.max"="2000M" \ + --conf "spark.driver.maxResultSize"="0" \ + --packages "com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.3.1" +``` diff --git a/examples/docker/SparkNLP-CPU.Dockerfile b/examples/docker/SparkNLP-CPU.Dockerfile new file mode 100644 index 00000000000000..881f5ecb6921d5 --- /dev/null +++ b/examples/docker/SparkNLP-CPU.Dockerfile @@ -0,0 +1,11 @@ +FROM jupyter/pyspark-notebook:java-11.0.15 + +ARG SPARKNLP_VERSION=4.3.1 +RUN pip install --no-cache-dir spark-nlp==${SPARKNLP_VERSION} + +# Create a new user +ENV NB_USER=johnsnow +ENV CHOWN_HOME=yes +ENV CHOWN_HOME_OPTS="-R" + +WORKDIR /home/${NB_USER} diff --git a/examples/docker/SparkNLP-GPU.Dockerfile b/examples/docker/SparkNLP-GPU.Dockerfile new file mode 100644 index 00000000000000..9cd55495812ecb --- /dev/null +++ b/examples/docker/SparkNLP-GPU.Dockerfile @@ -0,0 +1,39 @@ +FROM tensorflow/tensorflow:2.7.4-gpu + +# Fetch keys for apt +RUN rm /etc/apt/sources.list.d/cuda.list && \ + apt-key del 7fa2af80 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub + +# Install Java Dependency +RUN apt-get update && \ + apt-get -y --no-install-recommends install openjdk-8-jre \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install Spark NLP and dependencies +ARG SPARKNLP_VERSION=4.3.1 +ARG PYSPARK_VERSION=3.3.0 +RUN pip install --no-cache-dir \ + pyspark==${PYSPARK_VERSION} spark-nlp==${SPARKNLP_VERSION} pandas numpy jupyterlab + +# Create Local User +ENV NB_USER johnsnow +ENV NB_UID 1000 + +RUN adduser --disabled-password \ + 
--gecos "Default user" \ + --uid ${NB_UID} \ + ${NB_USER} + +ENV HOME /home/${NB_USER} +RUN chown -R ${NB_UID} ${HOME} + +ENV PYSPARK_PYTHON=python3 +ENV PYSPARK_DRIVER_PYTHON=python3 + +USER ${NB_USER} +WORKDIR ${HOME} + +EXPOSE 8888 +CMD ["jupyter", "lab", "--ip", "0.0.0.0"] diff --git a/examples/python/annotation/audio/asr-wav2vec2/Automatic_Speech_Recognition_Wav2Vec2_(Wav2Vec2ForCTC).ipynb b/examples/python/annotation/audio/asr-wav2vec2/Automatic_Speech_Recognition_Wav2Vec2_(Wav2Vec2ForCTC).ipynb index ffefca4117d4a3..874498e61b05b7 100644 --- a/examples/python/annotation/audio/asr-wav2vec2/Automatic_Speech_Recognition_Wav2Vec2_(Wav2Vec2ForCTC).ipynb +++ b/examples/python/annotation/audio/asr-wav2vec2/Automatic_Speech_Recognition_Wav2Vec2_(Wav2Vec2ForCTC).ipynb @@ -2,15 +2,14 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/asr-wav2vec2/Automatic_Speech_Recognition_Wav2Vec2_(Wav2Vec2ForCTC).ipynb)" - ], - "metadata": { - "id": "g_w-gysjj7Jz" - } + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/audio/asr-wav2vec2/Automatic_Speech_Recognition_Wav2Vec2_(Wav2Vec2ForCTC).ipynb)" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "# Automatic Speech Recognition in Spark NLP\n", "## Wav2Vec2 (Wav2Vec2ForCTC)\n", @@ -23,26 +22,16 @@ "\n", "- List of all available ASR [models](https://nlp.johnsnowlabs.com/models?task=Automatic+Speech+Recognition&type=model)\n", "- List of all available ASR [pipelines](https://nlp.johnsnowlabs.com/models?task=Automatic+Speech+Recognition&type=pipeline)" - ], - "metadata": { - "id": "9bXYSe5Mkjvq" - } + ] }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "tags": 
[], - "id": "ZJQS_XWcwn-r", - "outputId": "7841b324-247f-4742-f7d7-6db66d050f49", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "--2022-12-23 14:10:21-- https://setup.johnsnowlabs.com/colab.sh\n", "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", @@ -66,7 +55,7 @@ } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash\n", "\n", "# to process audio files\n", @@ -75,24 +64,14 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "jupyter": { - "source_hidden": true - }, - "tags": [], - "id": "kiarOJWswn-v", - "outputId": "bffbc8a5-e76e-4f28-b3c4-6819b84913d0", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "4.2.6\n" + "4.3.1\n" ] } ], @@ -106,9 +85,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-9hMpwKxwn-x" - }, + "metadata": {}, "source": [ "# Spark NLP ASR Pipeline & Model\n", "## Wav2Vec2 \n", @@ -117,73 +94,53 @@ }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Let's download a sample Wav file" - ], - "metadata": { - "id": "nlHtgU57zmpP" - } + ] }, { "cell_type": "code", - "source": [ - "!wget https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/audio/samples/wavs/ngm_12484_01067234848.wav" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "MfP_NRN9zUq1", - "outputId": "693362c3-b9fd-4c87-f700-2efa2f2b0307" - }, - "execution_count": 26, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", 
"text": [ - "--2022-12-23 14:10:33-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/audio/samples/wavs/ngm_12484_01067234848.wav\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.85.190, 52.217.106.62, 52.217.73.38, ...\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.85.190|:443... connected.\n", + "--2023-02-17 15:55:34-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/audio/samples/wavs/ngm_12484_01067234848.wav\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.108.254, 52.217.163.192, 52.216.144.93, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.108.254|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 417836 (408K) [audio/wav]\n", - "Saving to: ‘ngm_12484_01067234848.wav.1’\n", + "Saving to: ‘ngm_12484_01067234848.wav’\n", "\n", - "ngm_12484_010672348 100%[===================>] 408.04K 1.15MB/s in 0.3s \n", + "ngm_12484_010672348 100%[===================>] 408,04K 857KB/s in 0,5s \n", "\n", - "2022-12-23 14:10:33 (1.15 MB/s) - ‘ngm_12484_01067234848.wav.1’ saved [417836/417836]\n", + "2023-02-17 15:55:35 (857 KB/s) - ‘ngm_12484_01067234848.wav’ saved [417836/417836]\n", "\n" ] } + ], + "source": [ + "!wget https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/audio/samples/wavs/ngm_12484_01067234848.wav" ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Let's listen to the audio" - ], - "metadata": { - "id": "A7qV7LOfzl_c" - } + ] }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 76 - }, - "id": "EsyMeFHDwn-y", - "outputId": "f0bc6cee-d669-4ea6-b1ed-003cc732dbf5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "" - ], "text/html": [ "\n", " \n", " " + ], + 
"text/plain": [ + "" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 27 + "output_type": "execute_result" } ], "source": [ @@ -205,19 +166,15 @@ }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We will use `librosa` library to load/resample our Wav file" - ], - "metadata": { - "id": "3tKbKC8uzuCC" - } + ] }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "id": "oMDEb-hJwn-z" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import librosa\n", @@ -228,28 +185,19 @@ }, { "cell_type": "markdown", + "metadata": {}, "source": [ "This is how we can create PySpark DataFrame from the `librosa` results" - ], - "metadata": { - "id": "8FBb6Brdz7rJ" - } + ] }, { "cell_type": "code", - "execution_count": 29, - "metadata": { - "tags": [], - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ntjq5hAmwn-z", - "outputId": "fcbc9b04-e428-4b0b-b652-d5f22f571c93" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "root\n", " |-- audio_content: array (nullable = true)\n", @@ -284,71 +232,46 @@ }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Simplest and fastest way is to use a pre-trained [pipeline for ASR](https://nlp.johnsnowlabs.com/models?task=Automatic+Speech+Recognition&type=pipeline):\n", "\n", "\n", "\n" - ], - "metadata": { - "id": "d23Up3Cy1cGQ" - } + ] }, { "cell_type": "code", - "source": [ - "import sparknlp\n", - "from sparknlp.pretrained import PretrainedPipeline\n", - "# Download a pre-trained pipeline\n", - "pipeline = PretrainedPipeline('pipeline_asr_wav2vec2_base_960h', lang='en')\n", - "\n", - "pipelineDF = pipeline.transform(spark_df)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "vWGJvKOW1hD7", - "outputId": "d0a6674e-4048-49dc-b6d8-fdad17676a62" - }, - "execution_count": 30, + "execution_count": null, + "metadata": {}, 
"outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "pipeline_asr_wav2vec2_base_960h download started this may take some time.\n", "Approx size to download 217 MB\n", "[OK!]\n" ] } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.pretrained import PretrainedPipeline\n", + "# Download a pre-trained pipeline\n", + "pipeline = PretrainedPipeline('pipeline_asr_wav2vec2_base_960h', lang='en')\n", + "\n", + "pipelineDF = pipeline.transform(spark_df)" ] }, { "cell_type": "code", - "source": [ - "\n", - "# let's see what's inside out-of-the-box\n", - "pipelineDF.printSchema()\n", - "\n", - "pipelineDF.select(\"text.result\").show(1, False)\n", - "\n", - "pipelineDF.select(\"text.metadata\").show(1, False)\n", - "\n", - "pipelineDF.select(\"text\").show(1, False)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ahMQ7Eal1hBF", - "outputId": "a7d54d02-549d-45ab-ebf4-b1432bcff11a" - }, - "execution_count": 31, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "root\n", " |-- audio_content: array (nullable = true)\n", @@ -394,33 +317,35 @@ "\n" ] } + ], + "source": [ + "\n", + "# let's see what's inside out-of-the-box\n", + "pipelineDF.printSchema()\n", + "\n", + "pipelineDF.select(\"text.result\").show(1, False)\n", + "\n", + "pipelineDF.select(\"text.metadata\").show(1, False)\n", + "\n", + "pipelineDF.select(\"text\").show(1, False)" ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Custom Pipeline\n", "You can also construct your own custom Pipeline by using Spark NLP pretrained Models. 
This way you have more control and flexibility over the entire pipeline.\n" - ], - "metadata": { - "id": "KCAlZjsU2jv0" - } + ] }, { "cell_type": "code", - "execution_count": 32, - "metadata": { - "tags": [], - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wYLzw9Qdwn-0", - "outputId": "4cd93234-56c6-48de-b4f0-a44ef64be258" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "asr_wav2vec2_base_960h download started this may take some time.\n", "Approximate size to download 217 MB\n", @@ -451,34 +376,19 @@ }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Let's have a look:" - ], - "metadata": { - "id": "rv3i-Y0L01Ye" - } + ] }, { "cell_type": "code", - "source": [ - "pipelineDF.select(\"text.result\").show(1, False)\n", - "\n", - "pipelineDF.select(\"text.metadata\").show(1, False)\n", - "\n", - "pipelineDF.select(\"text\").show(1, False)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "TwOuRO6j0wY8", - "outputId": "ccaf2ee6-c88a-49d6-de50-bee8e809cf8f" - }, - "execution_count": 33, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+-----------------------------------------------+\n", "|result |\n", @@ -500,13 +410,18 @@ "\n" ] } + ], + "source": [ + "pipelineDF.select(\"text.result\").show(1, False)\n", + "\n", + "pipelineDF.select(\"text.metadata\").show(1, False)\n", + "\n", + "pipelineDF.select(\"text\").show(1, False)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "0ln8fzQCwn-1" - }, + "metadata": {}, "source": [ "# Spark NLP ASR-NER Pipeline\n", "## Wav2Vec2, OntoNotes NER, and BERT" @@ -514,55 +429,39 @@ }, { "cell_type": "code", - "source": [ - "!wget https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/audio/samples/1664116679869-voicemaker.in-speech.mp3" - ], - "metadata": { - 
"colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "mhMsm3k0GF8r", - "outputId": "0d6bac61-9e4a-4167-ec12-0dcc3f1f1ea3" - }, - "execution_count": 34, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "--2022-12-23 14:11:13-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/audio/samples/1664116679869-voicemaker.in-speech.mp3\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.172.64, 52.216.60.240, 54.231.170.152, ...\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.172.64|:443... connected.\n", + "--2023-02-17 15:58:03-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/audio/samples/1664116679869-voicemaker.in-speech.mp3\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.135.80, 52.216.108.237, 52.216.162.13, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.135.80|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 40221 (39K) [audio/mp3]\n", - "Saving to: ‘1664116679869-voicemaker.in-speech.mp3.1’\n", + "Saving to: ‘1664116679869-voicemaker.in-speech.mp3’\n", "\n", - "1664116679869-voice 100%[===================>] 39.28K --.-KB/s in 0.09s \n", + "1664116679869-voice 100%[===================>] 39,28K --.-KB/s in 0,1s \n", "\n", - "2022-12-23 14:11:13 (458 KB/s) - ‘1664116679869-voicemaker.in-speech.mp3.1’ saved [40221/40221]\n", + "2023-02-17 15:58:04 (336 KB/s) - ‘1664116679869-voicemaker.in-speech.mp3’ saved [40221/40221]\n", "\n" ] } + ], + "source": [ + "!wget https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/audio/samples/1664116679869-voicemaker.in-speech.mp3" ] }, { "cell_type": "code", - "execution_count": 35, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 76 - }, - "id": "KhPLDw3gwn-1", - "outputId": "820b4eff-b5ce-4f41-9f67-5d430e662051" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "" - ], "text/html": [ "\n", " \n", " " + ], + "text/plain": [ + "" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 35 + "output_type": "execute_result" } ], "source": [ @@ -584,24 +487,9 @@ }, { "cell_type": "code", - "execution_count": 36, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wwftXRMywn-2", - "outputId": "4bb3ed6e-8837-4e90-b4ff-ea5521be59df" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.8/dist-packages/librosa/core/audio.py:165: UserWarning: PySoundFile failed. Trying audioread instead.\n", - " warnings.warn(\"PySoundFile failed. 
Trying audioread instead.\")\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "data,sampleing_rate = librosa.load(FILE_PATH, sr=16000)\n", "data=[float(x) for x in data]" @@ -609,11 +497,8 @@ }, { "cell_type": "code", - "execution_count": 37, - "metadata": { - "tags": [], - "id": "dftc1l5fwn-2" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#Create PySpark DataFrame from Pandas\n", @@ -633,19 +518,12 @@ }, { "cell_type": "code", - "execution_count": 38, - "metadata": { - "tags": [], - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Yo12TVd5wn-3", - "outputId": "dff08f73-6fc5-4235-8111-ebf911a28da0" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "asr_wav2vec2_base_960h download started this may take some time.\n", "Approximate size to download 217 MB\n", @@ -708,18 +586,12 @@ }, { "cell_type": "code", - "execution_count": 39, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "-uXT4sqpwn-4", - "outputId": "de68827b-509e-435e-b771-84670c050e13" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------------+\n", "|result |\n", @@ -760,29 +632,59 @@ }, { "cell_type": "markdown", + "metadata": {}, "source": [ "# Spark NLP ASR pipeline and model\n", "## HuggingFace Datasets\n", "\n", "Let's create a DataFrame from HuggingFace Datasets library" - ], - "metadata": { - "id": "snaQGhHkxBUl" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!pip install -q datasets" - ], - "metadata": { - "id": "Cy-J2g-ZxD1s" - }, - "execution_count": 40, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading builder script: 100%|██████████| 5.16k/5.16k [00:00<00:00, 1.63MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading and preparing dataset librispeech_asr_dummy/clean to /home/root/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr_dummy/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 3214.03it/s]\n", + "Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 308.20it/s]\n", + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset librispeech_asr_dummy downloaded and prepared to /home/root/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr_dummy/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc. 
Subsequent calls will reuse this data.\n" + ] + } + ], "source": [ "import pandas as pd\n", "import librosa\n", @@ -792,51 +694,16 @@ "ds = load_dataset(\"patrickvonplaten/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n", "pandas_dataframe = pd.DataFrame(ds['audio'])\n", "pandas_dataframe['array'] = pandas_dataframe['array'].apply(lambda row : [float(value) for value in row ])" - ], - "metadata": { - "id": "6aHJIGq7xDHm", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "90b395b0-3eaa-497a-8275-33e186242ee6" - }, - "execution_count": 41, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "WARNING:datasets.builder:Found cached dataset librispeech_asr_dummy (/root/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr_dummy/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)\n" - ] - } ] }, { "cell_type": "code", - "source": [ - "#Create PySpark DataFrame from Pandas\n", - "from pyspark.sql.types import *\n", - "import pyspark.sql.functions as F\n", - "\n", - "schema = StructType([StructField(\"path\", StringType()), \n", - " StructField(\"audio_content\", ArrayType(FloatType())),\n", - " StructField(\"sampling_rate\", LongType())])\n", - "spark_df=spark.createDataFrame(pandas_dataframe, schema)\n", - "spark_df.printSchema()\n", - "spark_df.show()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "sNYwZjC-eXL_", - "outputId": "bda465e8-b12a-42bd-d066-875ddb7b75c0" - }, - "execution_count": 42, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "root\n", " |-- path: string (nullable = true)\n", @@ -847,135 +714,129 @@ "+--------------------+--------------------+-------------+\n", "| path| audio_content|sampling_rate|\n", "+--------------------+--------------------+-------------+\n", - "|/root/.cache/hugg...|[-4.8828125E-4, -...| 16000|\n", - 
"|/root/.cache/hugg...|[2.746582E-4, 0.0...| 16000|\n", - "|/root/.cache/hugg...|[-4.8828125E-4, -...| 16000|\n", - "|/root/.cache/hugg...|[-0.001739502, -0...| 16000|\n", - "|/root/.cache/hugg...|[6.1035156E-4, 3....| 16000|\n", - "|/root/.cache/hugg...|[0.0059814453, 0....| 16000|\n", - "|/root/.cache/hugg...|[0.002532959, 0.0...| 16000|\n", - "|/root/.cache/hugg...|[-4.272461E-4, -3...| 16000|\n", - "|/root/.cache/hugg...|[-7.019043E-4, -3...| 16000|\n", - "|/root/.cache/hugg...|[2.4414062E-4, 2....| 16000|\n", - "|/root/.cache/hugg...|[-1.5258789E-4, -...| 16000|\n", - "|/root/.cache/hugg...|[-8.239746E-4, -4...| 16000|\n", - "|/root/.cache/hugg...|[0.0029907227, 0....| 16000|\n", - "|/root/.cache/hugg...|[3.0517578E-5, -1...| 16000|\n", - "|/root/.cache/hugg...|[1.8310547E-4, 5....| 16000|\n", - "|/root/.cache/hugg...|[-0.0026550293, -...| 16000|\n", - "|/root/.cache/hugg...|[-9.460449E-4, -0...| 16000|\n", - "|/root/.cache/hugg...|[-5.79834E-4, -3....| 16000|\n", - "|/root/.cache/hugg...|[-6.713867E-4, -8...| 16000|\n", - "|/root/.cache/hugg...|[-0.00491333, 8.2...| 16000|\n", + "|/home/root/.cach...|[-1.8310547E-4, -...| 16000|\n", + "|/home/root/.cach...|[-0.0013427734, -...| 16000|\n", + "|/home/root/.cach...|[-3.9672852E-4, -...| 16000|\n", + "|/home/root/.cach...|[-0.006164551, -0...| 16000|\n", + "|/home/root/.cach...|[-0.001373291, -0...| 16000|\n", + "|/home/root/.cach...|[-0.004852295, 2....| 16000|\n", + "|/home/root/.cach...|[0.0011291504, 5....| 16000|\n", + "|/home/root/.cach...|[-0.0027160645, 0...| 16000|\n", + "|/home/root/.cach...|[0.002380371, 0.0...| 16000|\n", + "|/home/root/.cach...|[-0.0033874512, 0...| 16000|\n", + "|/home/root/.cach...|[-9.1552734E-4, -...| 16000|\n", + "|/home/root/.cach...|[2.4414062E-4, 0....| 16000|\n", + "|/home/root/.cach...|[-0.0029907227, -...| 16000|\n", + "|/home/root/.cach...|[0.0013122559, -6...| 16000|\n", + "|/home/root/.cach...|[4.272461E-4, 0.0...| 16000|\n", + "|/home/root/.cach...|[3.6621094E-4, 
-9...| 16000|\n", + "|/home/root/.cach...|[-0.0010986328, -...| 16000|\n", + "|/home/root/.cach...|[-0.006225586, -0...| 16000|\n", + "|/home/root/.cach...|[7.324219E-4, 0.0...| 16000|\n", + "|/home/root/.cach...|[-0.0010681152, -...| 16000|\n", "+--------------------+--------------------+-------------+\n", "only showing top 20 rows\n", "\n" ] } + ], + "source": [ + "#Create PySpark DataFrame from Pandas\n", + "from pyspark.sql.types import *\n", + "import pyspark.sql.functions as F\n", + "\n", + "schema = StructType([StructField(\"path\", StringType()), \n", + " StructField(\"audio_content\", ArrayType(FloatType())),\n", + " StructField(\"sampling_rate\", LongType())])\n", + "spark_df=spark.createDataFrame(pandas_dataframe, schema)\n", + "spark_df.printSchema()\n", + "spark_df.show()" ] }, { "cell_type": "code", - "source": [ - "import sparknlp\n", - "from sparknlp.pretrained import PretrainedPipeline\n", - "# Download a pre-trained pipeline\n", - "pipeline = PretrainedPipeline('pipeline_asr_wav2vec2_base_960h', lang='en')\n", - "\n", - "pipelineDF = pipeline.transform(spark_df)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Fy_RKA8ReaX7", - "outputId": "89efbe8c-55bf-4083-eeed-326ea232a40e" - }, - "execution_count": 43, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "pipeline_asr_wav2vec2_base_960h download started this may take some time.\n", "Approx size to download 217 MB\n", "[OK!]\n" ] } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.pretrained import PretrainedPipeline\n", + "# Download a pre-trained pipeline\n", + "pipeline = PretrainedPipeline('pipeline_asr_wav2vec2_base_960h', lang='en')\n", + "\n", + "pipelineDF = pipeline.transform(spark_df)" ] }, { "cell_type": "code", - "source": [ - "pipelineDF.select(\"text.result\").show(5, False)\n", - "\n", - "pipelineDF.select(\"text.metadata\").show(5, False)\n", 
- "\n", - "pipelineDF.select(\"text\").show(5, False)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Bw1zxpggehQS", - "outputId": "e50219f8-5a0e-4d6e-f750-7184de70a020" - }, - "execution_count": 44, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|result |\n", - "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[A MAN SAID TO THE UNIVERSE SIR I EXIST ] |\n", - "|[SWEAT COVERED BRION'S BODY TRICKLING INTO THE TIGHT LOWING CLOTH THAT WAS THE ONLY GARMENT HE WORE ] |\n", - "|[THE CUT ON HIS CHEST STIL DRIPING BLOD THE ACHE OF HIS OVERSTRAINED EYES EVEN THE SOARING ARENA AROUND HIM WITH THOUSANDS OF SPECTATORS WERE TRIVIALITIES NOT WORTH THINKING ABOUT ]|\n", - "|[HIS INSTANCT PANIC WAS FOLOWED BY A SMAL SHARP BLOW HIGH ON HIS CHEST ] |\n", - "|[ONE MINUTE A VOICE SAID AND THE TIMEBUZ ARE SOUNDED ] |\n", - "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+--------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|result |\n", + "+--------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[BECAUSE YOU ARE SLEPING INSTEAD OF CONQUERING THE LOVELY ROSE PRINCES HAS BECOME A FIDLE WITHOUT A BAW WHILE POR SHAGY SITS THERE A COING DOVE ] |\n", + "|[HE HAS 
GONE AND GONE FOR GOD ANSWERED POLYCHROME WHO HAD MANAGED TO SQUEZE INTO THE ROM BESIDE THE DRAGON AND HAD WITNESED THE OCURENCES WITH MUCH INTEREST ]|\n", + "|[I HAVE REMAINED A PRISONER ONLY BECAUSE I WISHED TO BE ONE AND WITH THIS HE STEPED FORWARD AND BURST THE STOUT CHAINS AS EASILY AS IF THEY HAD BEN THREADS ] |\n", + "|[THE LITLE GIRL HAD BEN ASLEP BUT SHE HEARD THE RAPS AND OPENED THE DOR ] |\n", + "|[THE KING HAS FLODIN DISGRACE AND YOUR FRIENDS ARE ASKING FOR YOU ] |\n", + "+--------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "only showing top 5 rows\n", "\n", "+-----------------------------------------------+\n", "|metadata |\n", "+-----------------------------------------------+\n", - "|[{audio -> 0, sentence -> 0, length -> 74400}] |\n", - "|[{audio -> 0, sentence -> 0, length -> 104560}]|\n", - "|[{audio -> 0, sentence -> 0, length -> 213360}]|\n", - "|[{audio -> 0, sentence -> 0, length -> 86720}] |\n", - "|[{audio -> 0, sentence -> 0, length -> 81440}] |\n", + "|[{audio -> 0, sentence -> 0, length -> 174160}]|\n", + "|[{audio -> 0, sentence -> 0, length -> 178080}]|\n", + "|[{audio -> 0, sentence -> 0, length -> 183600}]|\n", + "|[{audio -> 0, sentence -> 0, length -> 76080}] |\n", + "|[{audio -> 0, sentence -> 0, length -> 67600}] |\n", "+-----------------------------------------------+\n", "only showing top 5 rows\n", "\n", - "+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|text |\n", - 
"+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[{document, 0, 38, A MAN SAID TO THE UNIVERSE SIR I EXIST , {audio -> 0, sentence -> 0, length -> 74400}, []}] |\n", - "|[{document, 0, 98, SWEAT COVERED BRION'S BODY TRICKLING INTO THE TIGHT LOWING CLOTH THAT WAS THE ONLY GARMENT HE WORE , {audio -> 0, sentence -> 0, length -> 104560}, []}] |\n", - "|[{document, 0, 178, THE CUT ON HIS CHEST STIL DRIPING BLOD THE ACHE OF HIS OVERSTRAINED EYES EVEN THE SOARING ARENA AROUND HIM WITH THOUSANDS OF SPECTATORS WERE TRIVIALITIES NOT WORTH THINKING ABOUT , {audio -> 0, sentence -> 0, length -> 213360}, []}]|\n", - "|[{document, 0, 69, HIS INSTANCT PANIC WAS FOLOWED BY A SMAL SHARP BLOW HIGH ON HIS CHEST , {audio -> 0, sentence -> 0, length -> 86720}, []}] |\n", - "|[{document, 0, 51, ONE MINUTE A VOICE SAID AND THE TIMEBUZ ARE SOUNDED , {audio -> 0, sentence -> 0, length -> 81440}, []}] |\n", - "+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|text |\n", + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{document, 0, 142, BECAUSE YOU ARE SLEPING INSTEAD OF CONQUERING THE LOVELY ROSE PRINCES HAS BECOME A FIDLE WITHOUT A BAW WHILE 
POR SHAGY SITS THERE A COING DOVE , {audio -> 0, sentence -> 0, length -> 174160}, []}] |\n", + "|[{document, 0, 155, HE HAS GONE AND GONE FOR GOD ANSWERED POLYCHROME WHO HAD MANAGED TO SQUEZE INTO THE ROM BESIDE THE DRAGON AND HAD WITNESED THE OCURENCES WITH MUCH INTEREST , {audio -> 0, sentence -> 0, length -> 178080}, []}]|\n", + "|[{document, 0, 154, I HAVE REMAINED A PRISONER ONLY BECAUSE I WISHED TO BE ONE AND WITH THIS HE STEPED FORWARD AND BURST THE STOUT CHAINS AS EASILY AS IF THEY HAD BEN THREADS , {audio -> 0, sentence -> 0, length -> 183600}, []}] |\n", + "|[{document, 0, 70, THE LITLE GIRL HAD BEN ASLEP BUT SHE HEARD THE RAPS AND OPENED THE DOR , {audio -> 0, sentence -> 0, length -> 76080}, []}] |\n", + "|[{document, 0, 64, THE KING HAS FLODIN DISGRACE AND YOUR FRIENDS ARE ASKING FOR YOU , {audio -> 0, sentence -> 0, length -> 67600}, []}] |\n", + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "only showing top 5 rows\n", "\n" ] } + ], + "source": [ + "pipelineDF.select(\"text.result\").show(5, False)\n", + "\n", + "pipelineDF.select(\"text.metadata\").show(5, False)\n", + "\n", + "pipelineDF.select(\"text\").show(5, False)" ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "Cn1FR5YkeqVk" - }, - "execution_count": 44, - "outputs": [] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -988,11 +849,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/image/ViTForImageClassification.ipynb 
b/examples/python/annotation/image/ViTForImageClassification.ipynb index e9b2e8876097ee..84478d8a011818 100644 --- a/examples/python/annotation/image/ViTForImageClassification.ipynb +++ b/examples/python/annotation/image/ViTForImageClassification.ipynb @@ -2,22 +2,17 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "v8xIEZ07QpRM", - "outputId": "b5f5db4b-bce4-4b62-883f-3b3e90a3f1cd" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/ViTForImageClassification.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/image/ViTForImageClassification.ipynb)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "mz6G5fxae3HW" - }, + "metadata": {}, "outputs": [], "source": [ "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.1 -s 4.1.0" @@ -25,38 +20,29 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "6KvNW4MU5rrF", - "outputId": "36cf722b-f3a6-4566-8217-615cc58dc549" - }, + "metadata": {}, "source": [ "## ViTForImageClassification Annotator" ] }, { "cell_type": "markdown", - "metadata": { - "id": "BshxwBPTe3Hc" - }, + "metadata": {}, "source": [ "In this notebok we are going to classify images using spark-nlp." 
] }, { "cell_type": "markdown", - "metadata": { - "id": "FaN1OWV0NQ5T" - }, + "metadata": {}, "source": [ "### Downloading Images" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "jEHkswUjUfaU" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!wget -q https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/images/images.zip" @@ -64,10 +50,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "k9F8WstLNXnS" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import shutil\n", @@ -76,19 +60,15 @@ }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Start Spark Session" - ], - "metadata": { - "id": "3a_shOYHfpOn" - } + ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "XLNO3Z9r6HgR" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -99,10 +79,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "4JfeD8Rj-as2" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "spark = sparknlp.start()" @@ -110,38 +88,28 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "99AqJEThSBuT" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "data_df = spark.read.format(\"image\").option(\"dropInvalid\", value = True).load(path=\"/content/images/images/\")" + "data_df = spark.read.format(\"image\").option(\"dropInvalid\", value = True).load(path=\"images/images/\")" ] }, { "cell_type": "markdown", - "metadata": { - "id": "J86YU794UYEG" - }, + "metadata": {}, "source": [ "### Pipeline with ViTForImageClassification" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tRyju8D-6XJ1", - "outputId": "ad8658bb-8170-488a-f9a1-680c63ad0f80" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - 
"output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "image_classifier_vit_base_patch16_224 download started this may take some time.\n", "Approximate size to download 309.7 MB\n", @@ -167,10 +135,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "XIYjEhW3O_Uc" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "model = pipeline.fit(data_df)" @@ -178,18 +144,12 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gIZFLaUOPBnd", - "outputId": "a8cfe0c5-fe6a-4f0b-a4c1-e9cf5d1f22c0" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+\n", "| image| image_assembler| class|\n", @@ -216,18 +176,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Rfp5MK1UxoNt" - }, + "metadata": {}, "source": [ "### Light Pipeline" ] }, { "cell_type": "markdown", - "metadata": { - "id": "-_6VJPS9xvfV" - }, + "metadata": {}, "source": [ "To use light pipeline in ViT transformer, we need to use the new method `fullAnnotateImage`, which can receive 3 kind of inputs:\n", "1. 
A path to a single image\n", @@ -236,24 +192,18 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "XDQ6PrgbSJ8W", - "outputId": "a2b3159d-f929-429b-d7be-fe119470fea4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "dict_keys(['image_assembler', 'class'])" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 11 + "output_type": "execute_result" } ], "source": [ @@ -264,31 +214,9 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "73PV--LdSU5-", - "outputId": "4a5f8730-f515-413d-ed0a-010b98d2d844" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "annotator_type: image\n", - "origin: images/images/hippopotamus.JPEG\n", - "height: 333\n", - "width: 500\n", - "nChannels: 3\n", - "mode: 16\n", - "result size: 499500\n", - "metadata: Map()\n", - "[Annotation(category, 0, 55, hippopotamus, hippo, river horse, Hippopotamus amphibius, Map(nChannels -> 3, Some(lumbermill, sawmill) -> 7.2882756E-8, Some(beer glass) -> 9.0488925E-8, image -> 0, Some(damselfly) -> 1.9379786E-7, Some(turnstile) -> 6.8434524E-8, Some(cockroach, roach) -> 1.6622849E-7, height -> 333, Some(bulbul) -> 1.6930231E-7, Some(sea snake) -> 8.89582E-8, origin -> images/images/hippopotamus.JPEG, Some(mixing bowl) -> 1.2995402E-7, mode -> 16, None -> 1.3814622E-7, Some(whippet) -> 3.894023E-8, width -> 500, Some(buckle) -> 1.0061492E-7))]\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "for result in annotations_result:\n", " image_assembler = result['image_assembler'][0]\n", @@ -305,33 +233,25 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "V37k8GQFySRW" - }, + "metadata": {}, "source": [ "To send a list of images, we just difine a set of images" ] }, { 
"cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "asf3MZGzyXl5", - "outputId": "03db32ad-2ac2-4bb9-dd38-7c06c5d6a4b8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "dict_keys(['image_assembler', 'class'])" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 14 + "output_type": "execute_result" } ], "source": [ @@ -342,18 +262,12 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "dfby3MJlymNV", - "outputId": "ef63a544-c995-429e-e965-302bc8781851" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "[Annotation(category, 0, 7, bluetick, Map(nChannels -> 3, Some(lumbermill, sawmill) -> 1.3846728E-6, Some(beer glass) -> 1.1807944E-6, image -> 0, Some(damselfly) -> 3.6875622E-7, Some(turnstile) -> 2.023695E-6, Some(cockroach, roach) -> 6.2982855E-7, height -> 500, Some(bulbul) -> 5.417509E-7, Some(sea snake) -> 5.7421556E-7, origin -> images/images/bluetick.jpg, Some(mixing bowl) -> 5.4001305E-7, mode -> 16, None -> 4.5454306E-7, Some(whippet) -> 1.2101438E-6, width -> 333, Some(buckle) -> 1.1306514E-6))]\n", "[Annotation(category, 0, 5, palace, Map(nChannels -> 3, Some(lumbermill, sawmill) -> 6.3918545E-5, Some(beer glass) -> 8.879939E-6, image -> 0, Some(damselfly) -> 9.565577E-6, Some(turnstile) -> 6.315168E-5, Some(cockroach, roach) -> 1.125408E-5, height -> 334, Some(bulbul) -> 3.321073E-5, Some(sea snake) -> 1.0886038E-5, origin -> images/images/palace.JPEG, Some(mixing bowl) -> 2.6202975E-5, mode -> 16, None -> 2.6134943E-5, Some(whippet) -> 1.3805137E-5, width -> 500, Some(buckle) -> 3.121459E-5))]\n", @@ -386,8 +300,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - 
"pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/chinese/word_segmentation/words_segmenter_demo.ipynb b/examples/python/annotation/text/chinese/word_segmentation/words_segmenter_demo.ipynb index 783f07ed3171ec..b8517d9265de93 100644 --- a/examples/python/annotation/text/chinese/word_segmentation/words_segmenter_demo.ipynb +++ b/examples/python/annotation/text/chinese/word_segmentation/words_segmenter_demo.ipynb @@ -2,27 +2,21 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "cpYpeEfnmWKd" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "xl3k8bt-mZIc" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/chinese/word_segmentation/words_segmenter_demo.ipynb)\n" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/chinese/word_segmentation/words_segmenter_demo.ipynb)\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "xluzxinzKK-L" - }, + "metadata": {}, "source": [ "# [Word Segmenter](https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/main/scala/com/johnsnowlabs/nlp/annotators/ws/WordSegmenterModel.scala)\n", "\n", @@ -32,22 +26,18 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "MdE588BiY3z1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", "execution_count": 
null, - "metadata": { - "id": "SBtn9YsW0eHz" - }, + "metadata": {}, "outputs": [], "source": [ "import os\n", @@ -63,21 +53,15 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hJFV80wXyXiQ", - "outputId": "c1c1ef34-8604-482d-d845-11ed44d48275" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "wordseg_gsd_ud_trad download started this may take some time.\n", - "Approximate size to download 1.3 MB\n", + "Approximate size to download 1.2 MB\n", "[OK!]\n", "+----------------------------+\n", "| result|\n", @@ -89,14 +73,14 @@ } ], "source": [ - "import pandas as pd \n", + "import pandas as pd\n", "document_assembler = DocumentAssembler()\\\n", " .setInputCol(\"text\")\\\n", " .setOutputCol(\"document\")\n", "\n", "word_segmenter = WordSegmenterModel.pretrained(\"wordseg_gsd_ud_trad\", \"zh\")\\\n", " .setInputCols([\"document\"])\\\n", - " .setOutputCol(\"words_segmented\") \n", + " .setOutputCol(\"words_segmented\")\n", "\n", "\n", "pipeline = Pipeline(stages=[document_assembler, word_segmenter])\n", @@ -107,15 +91,6 @@ "result.select('words_segmented.result').show()\n", "\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "06z9uTcD1RU8" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -139,8 +114,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb b/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb index b1c562b9e560b5..7dbb59f18e17d5 100644 --- a/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb +++ b/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb @@ 
-1,20 +1,28 @@ { "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "1abcab5d", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb)\n", + "\n", + "\n", + "# **Matching Dates**" + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "875519af", - "metadata": { - "id": "875519af", - "outputId": "1dff204a-0440-4609-a4a5-4a0247605fb4", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "--2022-12-23 14:46:21-- http://setup.johnsnowlabs.com/colab.sh\n", "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", @@ -38,124 +46,63 @@ "\n", "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 48 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 54.5 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 46.8 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" + "\u001b[K |████████████████████████████████| 281.5 MB 48 kB/s \n", + "\u001b[K |████████████████████████████████| 453 kB 54.5 MB/s \n", + "\u001b[K |████████████████████████████████| 199 kB 46.8 MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" ] } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 2, - "id": "d1a9947b", - "metadata": { - "id": "d1a9947b" - }, - "outputs": [], - "source": [ - "from pyspark import *\n", - "import sparknlp" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "1d695f9d", - "metadata": { - "id": "1d695f9d" - }, - "outputs": [], - "source": [ - "spark = sparknlp.start()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "6edb5c48", - "metadata": { - "id": "6edb5c48" - }, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "b072abfa", - "metadata": { - "id": "b072abfa", - "outputId": "10c42d89-441c-4e0f-94fe-a84f1d7a1cba", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } - }, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ - "'4.2.6'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } + "'4.3.1'" + ] }, + "execution_count": null, "metadata": {}, - "execution_count": 5 + "output_type": "execute_result" } ], "source": [ + "import sparknlp\n", + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "from pyspark import *\n", + "from pyspark.sql.types import StringType\n", + "\n", + "spark = sparknlp.start()\n", "sparknlp.version()" ] }, - { - "cell_type": "code", - "execution_count": 6, - "id": "84dc2c25", - "metadata": { - "id": "84dc2c25" - }, - "outputs": [], - "source": [ - "from pyspark.sql.types import StringType" - ] - }, { "cell_type": "markdown", "id": "8657d04e", - "metadata": { - "id": "8657d04e" - }, + "metadata": {}, "source": [ "## 
English formatted dates matching examples" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "6ee10683", - "metadata": { - "id": "6ee10683", - "outputId": "ffdcc110-4ae4-42ab-aadf-2ad79dab5aa0", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -175,24 +122,18 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "c2fd3c80", - "metadata": { - "id": "c2fd3c80", - "outputId": "bd8a00c3-84fd-4524-e22c-de28261ad781", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------+\n", "|date |\n", "+--------------------------------------------------------------------------------------------------+\n", - "|[{date, 14, 22, 05/13/2018, {sentence -> 0}, []}, {date, 40, 48, 05/18/2020, {sentence -> 0}, []}]|\n", + "|[[date, 14, 22, 05/13/2018, [sentence -> 0], []], [date, 40, 48, 05/18/2020, [sentence -> 0], []]]|\n", "+--------------------------------------------------------------------------------------------------+\n", "\n" ] @@ -216,28 +157,20 @@ { "cell_type": "markdown", "id": "ab36411f", - "metadata": { - "id": "ab36411f" - }, + "metadata": {}, "source": [ "## English unformatted dates matching examples" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "8b688e34", - "metadata": { - "id": "8b688e34", - "outputId": "57bfb122-9aa1-405f-c189-cd0bf51e5e8f", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -257,24 +190,18 
@@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "559e5288", - "metadata": { - "id": "559e5288", - "outputId": "56f75595-2a4c-474e-8c13-d77f754f1a16", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------+\n", "|date |\n", "+--------------------------------------------------------------------------------------------------+\n", - "|[{date, 10, 17, 12/30/2022, {sentence -> 0}, []}, {date, 32, 39, 12/29/2022, {sentence -> 0}, []}]|\n", + "|[[date, 10, 17, 02/24/2023, [sentence -> 0], []], [date, 32, 39, 02/23/2023, [sentence -> 0], []]]|\n", "+--------------------------------------------------------------------------------------------------+\n", "\n" ] @@ -294,40 +221,12 @@ "assembled = document_assembler.transform(df)\n", "date_matcher.transform(assembled).select(\"date\").show(10, False)" ] - }, - { - "cell_type": "markdown", - "id": "b7e767c2", - "metadata": { - "id": "b7e767c2" - }, - "source": [ - "# A short guide to language support extension\n", - "\n", - "## In order to extend the date matchers language support for new languages, please follow the instructions below:\n", - "\n", - "1. Add the new dictionary into src/main/resources/date-matcher/translation-dictionaries/dynamic folder of the spark-nlp project\n", - "2. Add the same dictionary base of the other languages\n", - " * Add tests for the dictionary\n", - "3. Add other eventual specific expressions to the base\n", - " * Add tests for those specific expressions to avoid syntactic conflicts in parsing\n", - "4. Add a notebook like this one to show how to use the language extension\n", - "\n", - "Thank you for contributing! 
:)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "b8059be2", - "metadata": { - "id": "b8059be2" - }, - "outputs": [], - "source": [] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -342,11 +241,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/SpacyToAnnotation_Tutorial.ipynb b/examples/python/annotation/text/english/SpacyToAnnotation_Tutorial.ipynb index f5a543f9966e8d..353843da2f8824 100644 --- a/examples/python/annotation/text/english/SpacyToAnnotation_Tutorial.ipynb +++ b/examples/python/annotation/text/english/SpacyToAnnotation_Tutorial.ipynb @@ -2,18 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "dPbeRnw27_xs" - }, + "metadata": {}, "source": [ "This notebook shows how to export spaCy tokens and sentences to Spark NLP using SpacyToAnnotation component" ] }, { "cell_type": "markdown", - "metadata": { - "id": "m3d6rZ6uiV7c" - }, + "metadata": {}, "source": [ "### Exporting Spacy Tokens/Sentences" ] @@ -21,13 +17,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "50H8y_CbikAD", - "outputId": "f18c7fe1-0d6d-4c9e-8d29-4528d68afc47" - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -46,9 +36,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "Sy6kWGTVilaE" - }, + "metadata": {}, "outputs": [], "source": [ "nlp = spacy.load(\"en_core_web_sm\")\n", @@ -61,9 +49,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "cdrUKcJIkb5p" - }, + "metadata": {}, "source": [ "Create a dictionary with the data and export to JSON file" ] @@ -71,9 +57,7 @@ { "cell_type": "code", "execution_count": null, - 
"metadata": { - "id": "EqZvYR_jkSa6" - }, + "metadata": {}, "outputs": [], "source": [ "import json\n", @@ -95,13 +79,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hf2r3fOikmSg", - "outputId": "9eae615b-4b4e-4ae3-c8ce-678564b7911e" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -132,16 +110,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "HL7dLz15XTGr", - "outputId": "1cb63f4c-e59f-49dc-8cc1-b5dea82989f8", - "pycharm": { - "is_executing": true - } - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -155,14 +124,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 219 - }, - "id": "DhM6c4ON8UHg", - "outputId": "166cda08-e449-407f-a0f1-a317a9ffe82e" - }, + "metadata": {}, "outputs": [ { "data": { @@ -193,7 +155,7 @@ "" ] }, - "execution_count": 11, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -206,14 +168,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "id": "Pf-m9E9NmHNW", - "outputId": "8233206d-b76e-4159-ec9b-22764b334de7" - }, + "metadata": {}, "outputs": [ { "data": { @@ -224,7 +179,7 @@ "'3.2.1'" ] }, - "execution_count": 12, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -236,9 +191,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "DXVydy4LXbLY" - }, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.training import SpacyToAnnotation\n", @@ -249,9 +202,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "X42PFLpOxqp8" - }, + "metadata": {}, "outputs": [], "source": [ "result = nlp_reader.readJsonFile(spark, \"./multi_doc_tokens.json\")" @@ -260,13 
+211,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "xtLzA0Hl6Dng", - "outputId": "f9177c6e-8cfc-408c-a7a8-c4abc5116142" - }, + "metadata": {}, "outputs": [ { "data": { @@ -274,7 +219,7 @@ "DataFrame[document: array,embeddings:array>>, sentence: array,embeddings:array>>, token: array,embeddings:array>>]" ] }, - "execution_count": 15, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -286,13 +231,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "DxI83Pif40k7", - "outputId": "39e2df98-5a59-4b0e-bb75-fc61ba947eb5" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -343,13 +282,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kaQa02F040fV", - "outputId": "ee986c23-acd6-4d76-a623-8d0908fc6eec" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -387,8 +320,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/Text_Preprocessing_with_SparkNLP.ipynb b/examples/python/annotation/text/english/Text_Preprocessing_with_SparkNLP.ipynb index f109425ace44e0..1f8c1890603fee 100644 --- a/examples/python/annotation/text/english/Text_Preprocessing_with_SparkNLP.ipynb +++ b/examples/python/annotation/text/english/Text_Preprocessing_with_SparkNLP.ipynb @@ -7,7 +7,7 @@ "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/Text_Preprocessing_with_SparkNLP.ipynb)\n", "\n", "\n", "# **Text Preprocessing with Spark NLP**" @@ -34,7 +34,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q pyspark==3.3.0 spark-nlp==4.3.0" + "!pip install -q pyspark==3.3.0 spark-nlp==4.3.1" ] }, { @@ -46,7 +46,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 4.3.0\n", + "Spark NLP version: 4.3.1\n", "Apache Spark version: 3.3.0\n" ] }, @@ -2207,7 +2207,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - " for more examples : https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/document-normalizer/document_normalizer_notebook.ipynb" + " for more examples : https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb" ] }, { @@ -4971,7 +4971,7 @@ }, "gpuClass": "standard", "kernelspec": { - "display_name": "base", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -4985,11 +4985,6 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" - }, - "vscode": { - "interpreter": { - "hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf" - } } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/chunking/Chunk_Extraction_with_Chunker.ipynb b/examples/python/annotation/text/english/chunking/Chunk_Extraction_with_Chunker.ipynb index 71ea3d546cc28a..8f750909625db9 100644 ---
a/examples/python/annotation/text/english/chunking/Chunk_Extraction_with_Chunker.ipynb +++ b/examples/python/annotation/text/english/chunking/Chunk_Extraction_with_Chunker.ipynb @@ -7,7 +7,7 @@ "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/example/python/annotation/text/english/chunking/Chunk_Extraction_with_Chunker.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/chunking/Chunk_Extraction_with_Chunker.ipynb)\n", "\n", "\n", "# **Chunk Extraction with Chunker**" @@ -34,7 +34,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q pyspark==3.3.0 spark-nlp==4.3.0" + "!pip install -q pyspark==3.3.0 spark-nlp==4.3.1" ] }, { @@ -415,7 +415,7 @@ }, "gpuClass": "standard", "kernelspec": { - "display_name": "sparknlp", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -429,11 +429,6 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" - }, - "vscode": { - "interpreter": { - "hash": "8b81146af0a3e5653a315622171ee30f7af15821bda096dcb17032694ac0d21c" - } } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/chunking/NgramGenerator.ipynb b/examples/python/annotation/text/english/chunking/NgramGenerator.ipynb index 92061ec526efed..53a642aeb031cb 100644 --- a/examples/python/annotation/text/english/chunking/NgramGenerator.ipynb +++ b/examples/python/annotation/text/english/chunking/NgramGenerator.ipynb @@ -2,14 +2,11 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "TUZwF8vlj9rC" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open 
In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/chunking/NgramGenerator.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/chunking/NgramGenerator.ipynb)\n", "\n", "## 0. Colab Setup" ] @@ -17,26 +14,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 187 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 81303, - "status": "ok", - "timestamp": 1589248273385, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "NHv8XYz5krHH", - "outputId": "07ac2d90-b64a-4324-bd04-b6fff5675606" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -45,27 +23,24 @@ "openjdk version \"1.8.0_252\"\n", "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 62kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 51.0MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 122kB 9.6MB/s \n", - "\u001B[?25hopenjdk version \"1.8.0_252\"\n", + "\u001b[K |████████████████████████████████| 215.7MB 62kB/s \n", + "\u001b[K |████████████████████████████████| 204kB 51.0MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[K |████████████████████████████████| 122kB 9.6MB/s \n", + "\u001b[?25hopenjdk version \"1.8.0_252\"\n", "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n" ] } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "AIJKrwrTj9rE" - }, + "metadata": {}, "source": [ "\n", "### NGramGenerator\n", @@ -104,11 +79,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "kNdjlHq1j9rF" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -120,33 +91,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 94125, - "status": "ok", - "timestamp": 1589248286229, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "vW9z6v2aj9rJ", - "outputId": "c75e1cb2-7c2e-4197-d745-bb4d8e4a45b7" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 2.5.0\n", - "Apache Spark version: 2.4.4\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -160,11 +112,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "wr4OCNfSj9rN" - }, + "metadata": {}, "outputs": [], "source": [ "dfTest = spark.createDataFrame([\n", @@ -176,16 +124,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "B_6e50hIj9rR" - }, + "metadata": {}, "outputs": 
[], "source": [ "document_assembler = DocumentAssembler() \\\n", " .setInputCol(\"text\")\n", - " \n", + "\n", "tokenizer = Tokenizer() \\\n", " .setInputCols([\"document\"]) \\\n", " .setOutputCol(\"token\")\n", @@ -198,11 +142,11 @@ "trigrams_cum = NGramGenerator() \\\n", " .setInputCols([\"token\"]) \\\n", " .setOutputCol(\"trigrams\") \\\n", - " .setN(3) \n", + " .setN(3)\n", "\n", "pipeline = Pipeline(stages=[\n", - " document_assembler, \n", - " tokenizer, \n", + " document_assembler,\n", + " tokenizer,\n", " bigrams,\n", " trigrams_cum\n", "])\n" @@ -210,10 +154,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "2LItphNIj9rU" - }, + "metadata": {}, "source": [ "#### Use the Pipeline in Spark (DataFrame)" ] @@ -221,11 +162,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "xp8cn-jqj9rV" - }, + "metadata": {}, "outputs": [], "source": [ "model = pipeline.fit(dfTest)\n", @@ -235,26 +172,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 101823, - "status": "ok", - "timestamp": 1589248294762, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "ol2OGJiBj9rY", - "outputId": "0fe4d6d1-b3b8-494a-defb-c902a20d97a2" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -277,26 +195,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 101922, - "status": "ok", - "timestamp": 1589248295073, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "NJKU1UTGj9re", - "outputId": 
"fd4583c5-d24f-4d9d-b7bd-bcdd2769f1f2" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -318,10 +217,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "bTia_S89j9rk" - }, + "metadata": {}, "source": [ "#### Use the Pipeline in Python (string)" ] @@ -329,11 +225,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "F7ZqgxOIj9rk" - }, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.base import LightPipeline\n", @@ -344,11 +236,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "hcCmT6FDj9ro" - }, + "metadata": {}, "outputs": [], "source": [ "result = LightPipeline(model).annotate(text)" @@ -357,26 +245,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 100824, - "status": "ok", - "timestamp": 1589248295490, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "shGbYmXWj9rv", - "outputId": "335e5983-6d2e-426c-c5fb-304d860f7557" - }, + "metadata": {}, "outputs": [ { "data": { @@ -384,10 +253,8 @@ "['document', 'token', 'bigrams', 'trigrams']" ] }, - "execution_count": 12, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" } ], @@ -398,26 +265,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 100496, - "status": "ok", - "timestamp": 1589248295492, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "VzNPa3wDj9rz", - "outputId": 
"35f1e30a-e153-402c-d212-b9a52580f863" - }, + "metadata": {}, "outputs": [ { "data": { @@ -430,10 +278,8 @@ " 'manufacturing companies']" ] }, - "execution_count": 13, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" } ], @@ -444,26 +290,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 102 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 99968, - "status": "ok", - "timestamp": 1589248295493, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "81_n11fej9r4", - "outputId": "d803ebff-4d0c-4347-9a9a-a1c6ddf967de" - }, + "metadata": {}, "outputs": [ { "data": { @@ -475,27 +302,14 @@ " 'major manufacturing companies']" ] }, - "execution_count": 14, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" } ], "source": [ "result['trigrams']" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "P2RK55a1j9r8" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -518,8 +332,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/coreference-resolution/Coreference_Resolution_SpanBertCorefModel.ipynb b/examples/python/annotation/text/english/coreference-resolution/Coreference_Resolution_SpanBertCorefModel.ipynb index e946597ca610a0..c158fc85039ee6 100644 --- a/examples/python/annotation/text/english/coreference-resolution/Coreference_Resolution_SpanBertCorefModel.ipynb +++ b/examples/python/annotation/text/english/coreference-resolution/Coreference_Resolution_SpanBertCorefModel.ipynb @@ -2,16 +2,16 @@ "cells": [ { 
"cell_type": "markdown", + "id": "Dm865JXIqAQ9", + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/coreference-resolution/Coreference_Resolution_SpanBertCorefModel.ipynb)" - ], - "metadata": { - "id": "Dm865JXIqAQ9" - }, - "id": "Dm865JXIqAQ9" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/coreference-resolution/Coreference_Resolution_SpanBertCorefModel.ipynb)" + ] }, { "cell_type": "markdown", + "id": "ThzZq5KVsGcw", + "metadata": {}, "source": [ "# Coreference Resolution with SpanBertCorefModel\n", "\n", @@ -20,95 +20,75 @@ "the model will link \"he\" to \"John\" and \"her\" to \"Mary\".\n", "\n", "This example will show how to use a pretrained model." - ], - "metadata": { - "id": "ThzZq5KVsGcw" - }, - "id": "ThzZq5KVsGcw" + ] }, { "cell_type": "markdown", + "id": "s5--DnBP3Spa", + "metadata": {}, "source": [ "## 0. Colab Setup\n", "\n", "The following cell will install Spark NLP in a Colab notebook. If this notebook is run locally it should be skipped." 
- ], - "metadata": { - "id": "s5--DnBP3Spa" - }, - "id": "s5--DnBP3Spa" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "qrCJxuFts9nF", + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash\n", "\n", "# to process audio files\n", "!pip install -q pyspark librosa" - ], - "metadata": { - "id": "qrCJxuFts9nF" - }, - "id": "qrCJxuFts9nF", - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "id": "zQ2JdVlT32iX", + "metadata": {}, "source": [ "Let's start a Spark NLP session:" - ], - "metadata": { - "id": "zQ2JdVlT32iX" - }, - "id": "zQ2JdVlT32iX" + ] }, { "cell_type": "code", - "source": [ - "import sparknlp\n", - "# let's start Spark with Spark NLP\n", - "spark = sparknlp.start()\n", - "\n", - "print(sparknlp.version())" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "n88cWKtEtD0-", - "outputId": "8bfbd519-ab36-4c56-a663-d580654912b0" - }, + "execution_count": null, "id": "n88cWKtEtD0-", - "execution_count": 3, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "4.2.0\n" + "4.3.1\n" ] } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(sparknlp.version())" ] }, { "cell_type": "markdown", + "id": "8dEhKuzb3X3E", + "metadata": {}, "source": [ "## 1. 
Using a pretrained `SpanBertCorefModel` in a Pipeline" - ], - "metadata": { - "id": "8dEhKuzb3X3E" - }, - "id": "8dEhKuzb3X3E" + ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "ee0af780-5560-45fe-8d57-0ff2eb188b0e", - "metadata": { - "id": "ee0af780-5560-45fe-8d57-0ff2eb188b0e" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -118,29 +98,21 @@ }, { "cell_type": "markdown", + "id": "m57FA0xU3_AP", + "metadata": {}, "source": [ "SpanBertCorefModel requires `DOCUMENT` and `TOKEN` type annotations. these are extracted first before being fed to the pretrained model for classification." - ], - "metadata": { - "id": "m57FA0xU3_AP" - }, - "id": "m57FA0xU3_AP" + ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "8bf4c5ca-fda9-41b9-aaaf-833bde7ffeef", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8bf4c5ca-fda9-41b9-aaaf-833bde7ffeef", - "outputId": "fe4398df-b20a-4800-b26d-9b6d3667e767" - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "spanbert_base_coref download started this may take some time.\n", "Approximate size to download 540.1 MB\n", @@ -176,53 +148,41 @@ }, { "cell_type": "markdown", + "id": "UJTUrmVs4K2R", + "metadata": {}, "source": [ "Let's create some data so we can test the pipeline:" - ], - "metadata": { - "id": "UJTUrmVs4K2R" - }, - "id": "UJTUrmVs4K2R" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "jid-XQAe39MO", + "metadata": {}, + "outputs": [], "source": [ "data = spark.createDataFrame([\n", " [\"John loves Mary because she knows how to treat him. She is also fond of him. 
John said something to Mary but she didn't respond to him.\"],\n", "]).toDF(\"text\")" - ], - "metadata": { - "id": "jid-XQAe39MO" - }, - "id": "jid-XQAe39MO", - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "id": "0nylNATd4RiE", + "metadata": {}, "source": [ "The data is then fit to the pipeline and we can extract the coreferences with an example query like so" - ], - "metadata": { - "id": "0nylNATd4RiE" - }, - "id": "0nylNATd4RiE" + ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "dcc442a4-98b1-49a3-9c47-62d42f4daa07", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "dcc442a4-98b1-49a3-9c47-62d42f4daa07", - "outputId": "1e970acf-d031-440d-efc7-9e30c1474fe3" - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+-----+------------------------------------------------------------------------------------+\n", "|token|metadata |\n", @@ -250,19 +210,13 @@ " .selectExpr(\"coref.result as token\", \"coref.metadata\") \\\n", " .show(truncate=False)" ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "CrFFcdbEwBdt" - }, - "id": "CrFFcdbEwBdt", - "execution_count": null, - "outputs": [] } ], "metadata": { + "colab": { + "collapsed_sections": [], + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -277,12 +231,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.13" - }, - "colab": { - "provenance": [], - "collapsed_sections": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/document-assembler/Loading_Documents_With_DocumentAssembler.ipynb b/examples/python/annotation/text/english/document-assembler/Loading_Documents_With_DocumentAssembler.ipynb index 3e7e991417d362..228a5fa14cd55a 100644 --- 
a/examples/python/annotation/text/english/document-assembler/Loading_Documents_With_DocumentAssembler.ipynb +++ b/examples/python/annotation/text/english/document-assembler/Loading_Documents_With_DocumentAssembler.ipynb @@ -7,7 +7,7 @@ "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/example/python/annotation/text/english/document-assembler/Loading_Documents_With_DocumentAssembler.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-assembler/Loading_Documents_With_DocumentAssembler.ipynb)\n", "\n", "\n", "# **Loading Documents with DocumentAssembler**" @@ -34,7 +34,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q pyspark==3.3.0 spark-nlp==4.3.0" + "!pip install -q pyspark==3.3.0 spark-nlp==4.3.1" ] }, { @@ -335,7 +335,7 @@ }, "gpuClass": "standard", "kernelspec": { - "display_name": "base", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -349,11 +349,6 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" - }, - "vscode": { - "interpreter": { - "hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf" - } } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/document-assembler/Loading_Multiple_Documents.ipynb b/examples/python/annotation/text/english/document-assembler/Loading_Multiple_Documents.ipynb index 0bcfcf46e38be5..cd13b79c96f750 100644 --- a/examples/python/annotation/text/english/document-assembler/Loading_Multiple_Documents.ipynb +++ b/examples/python/annotation/text/english/document-assembler/Loading_Multiple_Documents.ipynb @@ -7,7 +7,7 @@ "source": [ 
"![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/example/python/annotation/text/english/document-assembler/Loading_Multiple_Documents.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-assembler/Loading_Multiple_Documents.ipynb)\n", "\n", "# Loading Multiple Documents with MultiDocumentAssembler\n", "\n", @@ -27,7 +27,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q pyspark==3.3.0 spark-nlp==4.3.0" + "!pip install -q pyspark==3.3.0 spark-nlp==4.3.1" ] }, { @@ -39,7 +39,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version 4.3.0\n", + "Spark NLP version 4.3.1\n", "Apache Spark version: 3.3.0\n" ] }, @@ -200,7 +200,7 @@ }, "gpuClass": "standard", "kernelspec": { - "display_name": "base", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -213,13 +213,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12 (main, Apr 5 2022, 06:56:58) \n[GCC 7.5.0]" - }, - "vscode": { - "interpreter": { - "hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf" - } + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb b/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb index 4c483b02c4d141..99066ae0e9f096 100644 --- a/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb +++ b/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb @@ -1,180 +1,108 
@@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "gilhjL-xtel5" - }, + "metadata": {}, "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb)\n", + "\n", + "\n", "# Document Normalizer annotator notebook" ] }, { "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "a7c65f64-07d6-4355-97a0-0a371d83116c", - "showTitle": false, - "title": "" - }, - "id": "a9z0Sk-wtel7" - }, + "metadata": {}, "source": [ "# Set up Colab environment" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "XJd1FYZEtel7", - "outputId": "76c387aa-a5f2-48a0-edda-1a4b2cc26f60", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-24 15:21:32-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-24 15:21:33-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 
302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-24 15:21:34-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-24 15:21:34 (61.0 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 49 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 59.8 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 46.0 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!wget http://ckl-it.de/wp-content/uploads/2022/12/docs.zip\n", - "!unzip docs.zip" - ], - "metadata": { - "id": "fQGr5EBmuUbh", - "outputId": "177b45fc-2934-402f-f465-ec526791d2f2", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 2, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-24 15:22:28-- http://ckl-it.de/wp-content/uploads/2022/12/docs.zip\n", - "Resolving ckl-it.de (ckl-it.de)... 217.160.0.108, 2001:8d8:100f:f000::209\n", - "Connecting to ckl-it.de (ckl-it.de)|217.160.0.108|:80... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 16670 (16K) [application/zip]\n", - "Saving to: ‘docs.zip’\n", - "\n", - "docs.zip 100%[===================>] 16.28K 65.6KB/s in 0.2s \n", - "\n", - "2022-12-24 15:22:29 (65.6 KB/s) - ‘docs.zip’ saved [16670/16670]\n", - "\n", - "Archive: docs.zip\n", - " inflating: html-docs/sample0.html \n", - " inflating: html-docs/sample1.html \n", - " inflating: html-docs/sample2.html \n", - " inflating: json-docs/sample0.json \n", - " inflating: xml-docs/C-CDAsample.xml \n" - ] - } + "!unzip -o docs.zip" ] }, { "cell_type": "markdown", - "metadata": { - "id": "l5sJmpLPtel8" - }, + "metadata": {}, "source": [ "# Start Spark NLP session" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "nKadS7-5tel8" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Import Spark NLP\n", "from sparknlp.base import *\n", "from sparknlp.annotator import *\n", - "import sparknlp \n", + "import sparknlp\n", "\n", "spark = sparknlp.start()" ] }, { "cell_type": "markdown", - "metadata": { - "id": "y460nHLitel8" - }, + "metadata": {}, "source": [ "# Document Normalizer annotator overview" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The DocumentNormalizer is an annotator that can be used after the DocumentAssembler to\n", + "normalize documents once they have been processed and indexed.\n", + "\n", + "It takes as input annotated documents of type `Array[AnnotatorType]` (DOCUMENT) and gives\n", + "as output annotated documents of type AnnotatorType.DOCUMENT.\n", + "\n", + "Parameters are:\n", + "- inputCol: input column name string which targets a column of type\n", + " Array(AnnotatorType.DOCUMENT).\n", + "- outputCol: output column name string which targets a column of type\n", + " AnnotatorType.DOCUMENT.\n", + "- action: action string to perform applying regex patterns, i.e.
(clean | extract).\n", + " Default is \"clean\".\n", + "- cleanupPatterns: normalization regex patterns which match will be removed from\n", + " document. Default is \"<[^>]*>\" (e.g., it removes all HTML tags).\n", + "- replacement: replacement string to apply when regexes match. Default is \" \".\n", + "- lowercase: whether to convert strings to lowercase. Default is False.\n", + "- removalPolicy: removalPolicy to remove patterns from text with a given policy. Valid\n", + " policy values are: \"all\", \"pretty_all\", \"first\", \"pretty_first\". Default is\n", + " \"pretty_all\".\n", + "- encoding: file encoding to apply on normalized documents. Supported encodings are:\n", + " UTF_8, UTF_16, US_ASCII, ISO-8859-1, UTF-16BE, UTF-16LE. Default is \"UTF-8\"." + ] + }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "b4efb61f-6011-4ba1-a0ad-6c229f69e3d9", - "showTitle": true, - "title": "DocumentNormalizer overview and parameters" - }, - "id": "gicU4xN_tel9" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "# The DocumentNormalizer is an annotator that can be used after the DocumentAssembler to narmalize documents once that they have been processed and indexed .\n", - "# It takes in input annotated documents of type Array[AnnotatorType](DOCUMENT) and gives as output annotated document of type AnnotatorType.DOCUMENT .\n", - "#\n", - "# Parameters are:\n", - "# - inputCol: input column name string which targets a column of type Array(AnnotatorType.DOCUMENT).\n", - "# - outputCol: output column name string which targets a column of type AnnotatorType.DOCUMENT.\n", - "# - action: action string to perform applying regex patterns, i.e. (clean | extract). Default is \"clean\".\n", - "# - cleanupPatterns: normalization regex patterns which match will be removed from document.
Default is \"<[^>]*>\" (e.g., it removes all HTML tags).\n", - "# - replacement: replacement string to apply when regexes match. Default is \" \".\n", - "# - lowercase: whether to convert strings to lowercase. Default is False.\n", - "# - removalPolicy: removalPolicy to remove patterns from text with a given policy. Valid policy values are: \"all\", \"pretty_all\", \"first\", \"pretty_first\". Defaults is \"pretty_all\".\n", - "# - encoding: file encoding to apply on normalized documents. Supported encodings are: UTF_8, UTF_16, US_ASCII, ISO-8859-1, UTF-16BE, UTF-16LE. Default is \"UTF-8\".\n", - "\n", - "\n", "documentAssembler = DocumentAssembler() \\\n", " .setInputCol('text') \\\n", " .setOutputCol('document')\n", @@ -201,38 +129,24 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_7QB7zgrtel9" - }, + "metadata": {}, "source": [ "# Data loading" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "58874c76-fc17-4d9e-9b4d-4e3db38cca95", - "showTitle": false, - "title": "" - }, - "id": "zBtpXZWZtel9", - "outputId": "92d3554b-2976-4f8d-a7e2-e2387067ade6", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", "+--------------------+\n", - "|\\r...|\n", + "|\\n...|\n", "|" + " You and John PER prefer the morning flight through Denver LOC" ], "text/plain": [ "" @@ -404,27 +292,21 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "HSvNig972xXC" - }, + "metadata": {}, "source": [ "# Graph Extraction" ] }, { "cell_type": "markdown", - "metadata": { - "id": "QkW7uQ4_cqAQ" - }, + "metadata": {}, "source": [ "We can leverage the output of Dependency Parser and NER to extract paths from a dependency tree to find relevant relationships between words and entities." 
] }, { "cell_type": "markdown", - "metadata": { - "id": "976yIqRDCy6W" - }, + "metadata": {}, "source": [ "Using the parameter *setRelationshipTypes* we can set a list of token-ENTITY relationships we want to extract paths from. Following the Dependency Parser tree depicted above, we can extract paths for the following pair of tokens-ENTITIES:\n", "\n", @@ -439,23 +321,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "0Dms9keFa7K0" - }, + "metadata": {}, "source": [ "Here for example, we want to find the path between the token i.e.node *prefer* and all nodes that are tagged as LOC entity" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "LRpKY22pAqlL", - "outputId": "a0f5f431-8656-47ab-d940-73e0fe65d68d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -483,7 +357,7 @@ " .setInputCols([\"document\", \"token\", \"ner\"]) \\\n", " .setOutputCol(\"graph\") \\\n", " .setRelationshipTypes([\"prefer-LOC\"])\n", - " \n", + "\n", "graph_pipeline = Pipeline().setStages([document_assembler, tokenizer,\n", " word_embeddings, ner_tagger, pos_tagger,\n", " dep_parser, typed_dep_parser,\n", @@ -492,33 +366,25 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "lJV6x-Nqw442" - }, + "metadata": {}, "source": [ "The result dataset has a *graph* column with the paths between prefer,LOC relationship " ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Kh78KBe-63Dn", - "outputId": "c374ead7-c7eb-454a-ef83-31595a66ff4c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "+-----------------------------------------------------------------------------------------------------------------+\n", - "|graph |\n", - 
"+-----------------------------------------------------------------------------------------------------------------+\n", - "|[{node, 13, 18, prefer, {relationship -> prefer,LOC, path1 -> prefer,nsubj,morning,flat,flight,flat,Denver}, []}]|\n", - "+-----------------------------------------------------------------------------------------------------------------+\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|graph |\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{node, 0, 2, You, {entities -> PER,LOC, left_path -> You,appos,John, right_path -> You,appos,John,parataxis,prefer,nsubj,morning,flat,flight,flat,Denver}, []}]|\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } @@ -531,28 +397,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Sy1Vz82kwvvy" - }, + "metadata": {}, "source": [ "**Graph Finisher**" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Ypd6GyMyxEvB" - }, + "metadata": {}, "source": [ "Graph Finisher annotator outputs the paths in a more generic format. 
In this case RDF triples" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "52fvrnW-TBK2" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "graph_finisher = GraphFinisher() \\\n", @@ -565,24 +425,18 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "FYGtVzbe3Qz2", - "outputId": "82bde0eb-e1eb-4e9c-a55e-b8a793bf72fe" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "+-----------------------------------------------------+-----------------------------------------------------------------------+\n", - "|text |finisher |\n", - "+-----------------------------------------------------+-----------------------------------------------------------------------+\n", - "|You and John prefer the morning flight through Denver|[[(prefer,nsubj,morning), (morning,flat,flight), (flight,flat,Denver)]]|\n", - "+-----------------------------------------------------+-----------------------------------------------------------------------+\n", + "+-----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------+\n", + "|text |finisher |\n", + "+-----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------+\n", + "|You and John prefer the morning flight through Denver|[[(You,appos,John)], [(You,appos,John), (John,parataxis,prefer), (prefer,nsubj,morning), (morning,flat,flight), (flight,flat,Denver)]]|\n", + "+-----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } @@ -600,7 +454,7 @@ 
"provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -613,10 +467,9 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" + "pygments_lexer": "ipython3" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/examples/python/annotation/text/english/graph-extraction/graph_extraction_roots_paths.ipynb b/examples/python/annotation/text/english/graph-extraction/graph_extraction_roots_paths.ipynb index 9bee31c097e226..321cb1cb257dec 100644 --- a/examples/python/annotation/text/english/graph-extraction/graph_extraction_roots_paths.ipynb +++ b/examples/python/annotation/text/english/graph-extraction/graph_extraction_roots_paths.ipynb @@ -11,22 +11,16 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "UyjADbwO-kj7", - "outputId": "480cbf82-ae00-432e-d02a-ebd28a75495e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[K |████████████████████████████████| 281.3 MB 39 kB/s \n", - "\u001B[K |████████████████████████████████| 198 kB 59.2 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + "\u001b[K |████████████████████████████████| 281.3 MB 39 kB/s \n", + "\u001b[K |████████████████████████████████| 198 kB 59.2 MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Processing ./spark_nlp-4.2.7-py2.py3-none-any.whl\n", "Installing collected packages: spark-nlp\n", @@ -41,14 +35,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "mxJniPtV_gqj", - "outputId": "1c039acc-e4f7-4785-d6a7-38ef86395757" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -69,14 +57,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zzy3PziR_654", - "outputId": "59d5a684-010e-4052-b0b5-ab017737fede" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -98,14 +80,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "iCGGFS7c74gG", - "outputId": "ae2afd53-4519-492b-bd8b-4bae22d62d40" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -130,23 +106,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "CWIVz7CM9jKP" - }, + "metadata": {}, "source": [ "Graph Extraction requires POS, DependencyParsers and NER to extract information from a Dependency Tree. Check this [introductory notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/graph-extraction/graph_extraction_intro.ipynb)." 
] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "VVFs6NDBlWsN", - "outputId": "5ff90889-6cba-48f2-929a-de9fb303234e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -183,18 +151,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "R-ZUgIhKCAjf" - }, + "metadata": {}, "source": [ "# Graph Extraction Default Values" ] }, { "cell_type": "markdown", - "metadata": { - "id": "QkW7uQ4_cqAQ" - }, + "metadata": {}, "source": [ "Graph Extraction by default will merge and explode entities. Which means:\n", "\n", @@ -206,10 +170,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "JfJZF5Xf770b" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "graph_extraction = GraphExtraction() \\\n", @@ -219,10 +181,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "XxqysCFDg1aP" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "graph_pipeline = Pipeline().setStages([document_assembler, tokenizer,\n", @@ -232,14 +192,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "LRpKY22pAqlL", - "outputId": "dc5673da-7dd0-4882-936e-b08c0c7ffb80" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -270,18 +224,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "a9LIAcw9Bz9S" - }, + "metadata": {}, "source": [ "## Entity Types" ] }, { "cell_type": "markdown", - "metadata": { - "id": "w89Wvi3jEGp6" - }, + "metadata": {}, "source": [ "**entitTypes** parameter allow us to find paths between a pair of entities. The pair of entities must be separated by hyphen. 
So, we must use this format:\n", "\n", @@ -290,10 +240,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "zu5F-xX_CFvb" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "graph_extraction = GraphExtraction() \\\n", @@ -311,14 +259,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tq3P8a8XCY1f", - "outputId": "e9d5a49a-26f3-46f2-aa75-33da7083b57c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -340,18 +282,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "3b8tNUxTNgfQ" - }, + "metadata": {}, "source": [ "## Modifying Root Token" ] }, { "cell_type": "markdown", - "metadata": { - "id": "HaOsJlMsR_3e" - }, + "metadata": {}, "source": [ "We can set a different root. For that we need to check which words can be defined as root. Visualizing the first level of the dependency tree in [this notebook](https://colab.research.google.com/drive/1BbLeRBjHxqIvcz8812ckwk5gNc2es383?usp=sharing), besides `born` those could be: `Peter`, `was`, `.` and `man`. However, some of those won't return a relationship.\n", "\n", @@ -374,19 +312,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "wVceQZyVqnoP" - }, + "metadata": {}, "source": [ "Now, if we let things by default. 
It won't output anything as we can see below:" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "NTnhdiFOrIa1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "graph_extraction = GraphExtraction() \\\n", @@ -404,14 +338,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "iE-FybUyrKOC", - "outputId": "26abf363-d6a7-4dd6-ea77-2c2f1871d60d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -437,37 +365,29 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "se6B3ZYnrV7v" - }, + "metadata": {}, "source": [ "The output is empty, because under `man` we only have `Mexico` as an entity. NER does not identify any other entity. So, `Mexico` does not have another pair to show a path. But, we can use `relationshipTypes` parameter to find a path between and unlabeled token and a labeled token, as we can see in the example below:" ] }, { "cell_type": "markdown", - "metadata": { - "id": "lgboHa-NzG9U" - }, + "metadata": {}, "source": [ "## Relationship Types" ] }, { "cell_type": "markdown", - "metadata": { - "id": "IWg_mkR7YjvG" - }, + "metadata": {}, "source": [ "**relationshipTypes** allows us to find a path between an unlabeled token and a labeled token. 
To use this parameter, we need to set **explodEntities** parameter to `false`" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "fenXqNoXR_Cn" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "graph_extraction = GraphExtraction() \\\n", @@ -486,14 +406,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "UQO4HtRaSkLi", - "outputId": "f945dbb9-aa37-4556-95be-5e1a3218d4b1" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -515,9 +429,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "VXvcMXbjwfoY" - }, + "metadata": {}, "source": [ "Currently, it searchs deep which means it will find relationships from the defined root to its labeled descendants. This means that if for example we set a relationship like `setRelationshipTypes([\"successful-LOC\"])` it won't output a path. \n", "\n", @@ -526,32 +438,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "dtpgB1z0zPwL" - }, + "metadata": {}, "source": [ "## More Entities more Relations" ] }, { "cell_type": "markdown", - "metadata": { - "id": "W-uI2-Tcp7ki" - }, + "metadata": {}, "source": [ "Following the example above, we can set a root token and let other parameters as default to get an output. However, we need a different sentence that produces a deeper dependency tree with descendants that have labeled tokens. 
If we tweak the sentence as shown below, we can make it work:" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "xXtmSwGgzg2z", - "outputId": "3e4edbbf-75f9-40f3-da14-d4427fdf21b3" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -574,19 +476,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "n3TD7mTyzlXU" - }, + "metadata": {}, "source": [ "As we can see in this [visualization notebook ](https://colab.research.google.com/drive/1BbLeRBjHxqIvcz8812ckwk5gNc2es383?usp=sharing), now we have a labeled token (`Queens`) at a deeper level. So, we can use it safely to get a path from another root." ] }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "id": "kNBPEUbM0He2" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "graph_extraction = GraphExtraction() \\\n", @@ -604,14 +502,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "E97fajo20bl4", - "outputId": "013848b0-9db8-4fd8-da3d-48c9a4a87a7e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -650,8 +542,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/keyword-extraction/Keyword_Extraction_YAKE.ipynb b/examples/python/annotation/text/english/keyword-extraction/Keyword_Extraction_YAKE.ipynb index 228ba41ff9e30d..855467f91aa733 100644 --- a/examples/python/annotation/text/english/keyword-extraction/Keyword_Extraction_YAKE.ipynb +++ b/examples/python/annotation/text/english/keyword-extraction/Keyword_Extraction_YAKE.ipynb @@ -12,7 +12,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/example/python/annotation/text/english/keyword-extraction/Keyword_Extraction_YAKE.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/keyword-extraction/Keyword_Extraction_YAKE.ipynb)" ] }, { @@ -29,7 +29,8 @@ "metadata": {}, "outputs": [], "source": [ - "! pip install -q pyspark==3.3.0 spark-nlp==4.3.0" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { @@ -56,7 +57,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version 4.3.0\n", + "Spark NLP version 4.3.1\n", "Apache Spark version: 3.3.0\n" ] }, @@ -70,7 +71,7 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", @@ -86,7 +87,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": null, @@ -205,13 +206,6 @@ "execution_count": null, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Before _validateStagesInputCols\n" - ] - }, { "data": { "text/plain": [ @@ -1132,13 +1126,6 @@ " display(HTML(r.highlighted_keywords))\n", " print(\"\\n\\n\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -1147,7 +1134,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "sparknlp", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -1161,11 +1148,6 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" - }, - "vscode": { - "interpreter": { - "hash": "8b81146af0a3e5653a315622171ee30f7af15821bda096dcb17032694ac0d21c" - } } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/language-detection/Language_Detection_and_Indentification.ipynb b/examples/python/annotation/text/english/language-detection/Language_Detection_and_Indentification.ipynb index 7fa6828f93b06f..01acf19bcf1c1c 100644 --- a/examples/python/annotation/text/english/language-detection/Language_Detection_and_Indentification.ipynb +++ b/examples/python/annotation/text/english/language-detection/Language_Detection_and_Indentification.ipynb @@ -1,31 +1,28 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "DryaQ76bhsVy" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/language-detection/Language_Detection_and_Indentification.ipynb)\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/language-detection/Language_Detection_and_Indentification.ipynb)\n", "\n", - "## 0. Colab Setup" + "# Language Detection and Identification" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Colab" ] }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "jaJI-cfjPnac", - "outputId": "729d2536-d49a-437b-d783-b1e59259ed49" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -38,39 +35,28 @@ } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ODtmoBwfoX3T" - }, + "metadata": {}, "source": [ "## 1. 
Start Spark Session" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "id": "5q8yjPy8oUBj", - "outputId": "d3dca249-5180-4cac-f4c2-4d6bb297c86b" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version 2.7.2\n", - "Apache Spark version: 2.4.4\n" + "Spark NLP version 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -86,10 +72,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "O4gGGYD6P6NN" - }, + "metadata": {}, "source": [ "## LanguageDetectorDL Pre-trained Models & Pipelines\n", "\n", @@ -109,10 +92,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "PmG3Pw0wQ4RR" - }, + "metadata": {}, "source": [ "# LanguageDetectorDL\n", "## Pre-trained Pipelines" @@ -120,12 +100,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "vvTiBzCwQ20J" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.pretrained import PretrainedPipeline" @@ -133,16 +109,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 85 - }, - "colab_type": "code", - "id": "c4e1jc5ARGNv", - "outputId": "f50c1064-155a-4aa8-8e40-0653f621357f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -161,7 +129,7 @@ " 'language': ['bg']}" ] }, - "execution_count": 5, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -177,10 +145,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "HueVA_myV_3T" - }, + "metadata": {}, "source": [ "# LanguageDetectorDL\n", "## Pre-trained Models" @@ -188,12 +153,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - 
"colab": {}, - "colab_type": "code", - "id": "ttZ219E_WE52" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.base import *\n", @@ -202,16 +163,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "5iicvy3sWPu0", - "outputId": "9883d655-67d3-452d-e572-542c29d92687" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -235,19 +188,15 @@ ".setCoalesceSentences(True)\n", "\n", "languagePipeline = Pipeline(stages=[\n", - " documentAssembler, \n", + " documentAssembler,\n", " language_detector\n", "])" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "5WT1pE_yYukK" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "test_df = spark.createDataFrame([\n", @@ -260,16 +209,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "id": "ITNTdXy-aRpF", - "outputId": "145fb725-2582-47ab-fba8-fe35354e4261" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -291,27 +232,19 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 156 - }, - "colab_type": "code", - "id": "-jgLXnlYdyUG", - "outputId": "4fcf0f8d-443f-46dd-82e9-1944d104fd56" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|metadata |\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[[fr -> 1.5861607E-20, lv -> 0.0, pt -> 1.3417392E-18, cs -> 1.867664E-20, el -> 1.0063604E-37, it -> 5.571778E-19, nl -> 4.506842E-14, bg -> 0.0, et -> 1.1714899E-21, de -> 1.92503E-15, sv -> 7.8325875E-14, da -> 9.432577E-11, en -> 1.0, sk -> 4.056944E-20, es -> 2.1614831E-21, fi -> 9.728018E-28, ro -> 4.9039217E-21, lt -> 5.974043E-19, sl -> 3.4076286E-12, sentence -> 0, hu -> 1.5670255E-14, pl -> 1.0098746E-16]]|\n", - "|[[fr -> 1.0, lv -> 0.0, pt -> 1.3446618E-30, cs -> 0.0, el -> 0.0, it -> 1.713754E-27, nl -> 4.127939E-37, bg -> 0.0, et -> 0.0, de -> 0.0, sv -> 0.0, da -> 0.0, en -> 0.0, sk -> 0.0, es -> 8.686001E-30, fi -> 0.0, ro -> 9.285741E-25, lt -> 0.0, sl -> 7.775083E-34, sentence -> 0, hu -> 1.5921178E-30, pl -> 0.0]] |\n", - 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|metadata |\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{fr -> 1.5861486E-20, lv -> 0.0, pt -> 1.3417289E-18, cs -> 1.867664E-20, el -> 1.0063604E-37, it -> 5.571735E-19, nl -> 4.5068417E-14, bg -> 0.0, et -> 1.1714855E-21, de -> 1.9250226E-15, sv -> 7.832558E-14, da -> 9.4325055E-11, en -> 1.0, sk -> 4.056913E-20, es -> 2.1614585E-21, fi -> 9.727943E-28, ro -> 4.9038655E-21, lt -> 5.9740204E-19, sl -> 3.4076286E-12, sentence -> 0, hu -> 1.5670225E-14, pl -> 1.0098784E-16}]|\n", + "|[{fr -> 1.0, lv -> 0.0, pt -> 1.3446722E-30, cs -> 0.0, el -> 0.0, it -> 1.7137674E-27, nl -> 4.1279706E-37, bg -> 0.0, et -> 0.0, de -> 0.0, sv -> 0.0, da -> 0.0, en -> 0.0, sk -> 0.0, es -> 8.6860005E-30, fi -> 0.0, ro -> 9.285776E-25, lt -> 0.0, 
sl -> 7.775083E-34, sentence -> 0, hu -> 1.5921176E-30, pl -> 0.0}] |\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } @@ -320,17 +253,6 @@ "# probabilities for other languages\n", "results.select(\"lang.metadata\").show(2, False)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "JSgwz7g4dzpu" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -353,8 +275,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/model-downloader/Create custom pipeline - NerDL.ipynb b/examples/python/annotation/text/english/model-downloader/Create custom pipeline - NerDL.ipynb index f744b6581c47fd..d3049467a7e1e5 100644 --- a/examples/python/annotation/text/english/model-downloader/Create custom pipeline - NerDL.ipynb +++ b/examples/python/annotation/text/english/model-downloader/Create custom pipeline - NerDL.ipynb @@ -1,81 +1,31 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "vRMXlUMptinm" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/model-downloader/Create%20custom%20pipeline%20-%20NerDL.ipynb)\n", + 
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/model-downloader/Create%20custom%20pipeline%20-%20NerDL.ipynb)\n", "\n", - "## 0. Colab Setup" + "# Creating a Custom NerDL Pipeline" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 60993, - "status": "ok", - "timestamp": 1589250591196, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "bbzEH9u7tdxR", - "outputId": "5f930486-c1a7-4f87-9101-a74486be5855" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "openjdk version \"1.8.0_252\"\n", - "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", - "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 55kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 47.8MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 122kB 3.4MB/s \n", - "\u001B[?25h" - ] - } - ], + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "BaUA9XiJtWov" - }, - "source": [ - "Show how to use pretrained assertion status" - ] - }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "h4RPEgjutWox" - }, + "metadata": {}, "outputs": [], "source": [ "import sys\n", @@ -100,33 +50,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 77953, - "status": "ok", - "timestamp": 1589250608173, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "6IRitqxOtWo5", - "outputId": "e2c35cc1-e919-4d1a-b357-e04afb304ab7" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 2.5.0\n", - "Apache Spark version: 2.4.4\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -139,10 +70,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "4QjmqB7ItWo_" - }, + "metadata": {}, "source": [ "Create some data for testing purposes" ] @@ -150,11 +78,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "V7IDvZMjtWpA" - }, + "metadata": {}, "outputs": [], "source": [ "from pyspark.sql import Row\n", @@ -164,10 +88,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "IeiOSDYHtWpF" - }, + "metadata": 
{}, "source": [ "Create a custom pipeline" ] @@ -175,26 +96,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 221 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 117752, - "status": "ok", - "timestamp": 1589250647984, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "4QDHUPNytWpH", - "outputId": "1d4691a5-4367-42b4-eb5d-22d0bea2b964" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -246,21 +148,18 @@ " .setIncludeMetadata(True)\n", "\n", "pipeline_fast_dl = Pipeline(stages = [\n", - " documentAssembler, \n", - " tokenizer, \n", - " lemmatizer, \n", - " spell, \n", - " embeddings, \n", - " ner_dl, \n", + " documentAssembler,\n", + " tokenizer,\n", + " lemmatizer,\n", + " spell,\n", + " embeddings,\n", + " ner_dl,\n", " finisher])" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "f__NmtvstWpO" - }, + "metadata": {}, "source": [ "Now let's use these pipelines and see the results" ] @@ -268,36 +167,17 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 139 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 122838, - "status": "ok", - "timestamp": 1589250653083, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "uQ4dO_kjtWpQ", - "outputId": "6a7e05ea-f1f0-435d-c9bd-f32f95e7db20" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - 
"+-------------------------------------------------+-----+---+--------------------------------------------+-----------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|sentence |start|end|finished_ner_dl |finished_lemma |finished_spell |finished_ner_dl_metadata |finished_lemma_metadata |finished_spell_metadata |\n", - 
"+-------------------------------------------------+-----+---+--------------------------------------------+-----------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|Peter is a good person, and he was working at IBM|0 |1 |[B-PER, O, O, O, O, O, O, O, O, O, O, B-ORG]|[Peter, be, a, good, person, ,, and, he, be, work, at, IBM]|[Peter, is, a, good, person, ,, and, he, was, working, at, IBM]|[[word, Peter], [word, is], [word, a], [word, good], [word, person], [word, ,], [word, and], [word, he], [word, was], [word, working], [word, at], [word, IBM]]|[[sentence, 0], [sentence, 0], [sentence, 0], [sentence, 0], [sentence, 0], [sentence, 0], [sentence, 0], [sentence, 0], [sentence, 0], [sentence, 0], [sentence, 0], [sentence, 0]]|[[confidence, 1.0], [sentence, 0], [confidence, 1.0], [sentence, 0], [confidence, 1.0], [sentence, 0], [confidence, 1.0], [sentence, 0], [confidence, 1.0], [sentence, 0], [confidence, 0.0], [sentence, 0], [confidence, 1.0], [sentence, 0], [confidence, 1.0], [sentence, 0], [confidence, 1.0], [sentence, 0], [confidence, 1.0], [sentence, 0], [confidence, 1.0], [sentence, 0], [confidence, 1.0], [sentence, 0]]|\n", - 
"+-------------------------------------------------+-----+---+--------------------------------------------+-----------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+-------------------------------------------------+-----+---+--------------------------------------------+-----------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|sentence |start|end|finished_ner_dl |finished_lemma |finished_spell |finished_ner_dl_metadata |finished_lemma_metadata |finished_spell_metadata |\n", + "+-------------------------------------------------+-----+---+--------------------------------------------+-----------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|Peter is a good person, and he was working at IBM|0 |1 |[B-PER, O, O, O, O, O, O, O, O, O, O, B-ORG]|[Peter, be, a, good, person, ,, and, he, be, work, at, IBM]|[Peter, is, a, good, person, ,, and, he, was, working, at, IBM]|[{word, Peter}, {sentence, 0}, {word, is}, {sentence, 0}, {word, a}, {sentence, 0}, {word, good}, {sentence, 0}, {word, person}, {sentence, 0}, {word, ,}, {sentence, 0}, {word, and}, {sentence, 0}, {word, he}, {sentence, 0}, {word, was}, {sentence, 0}, {word, working}, {sentence, 0}, {word, 
at}, {sentence, 0}, {word, IBM}, {sentence, 0}]|[{sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}]|[{confidence, 1.0}, {sentence, 0}, {confidence, 1.0}, {sentence, 0}, {confidence, 1.0}, {sentence, 0}, {confidence, 1.0}, {sentence, 0}, {confidence, 1.0}, {sentence, 0}, {confidence, 0.0}, {sentence, 0}, {confidence, 1.0}, {sentence, 0}, {confidence, 1.0}, {sentence, 0}, {confidence, 1.0}, {sentence, 0}, {confidence, 1.0}, {sentence, 0}, {confidence, 1.0}, {sentence, 0}, {confidence, 1.0}, {sentence, 0}]|\n", + "+-------------------------------------------------+-----+---+--------------------------------------------+-----------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } @@ -305,17 +185,6 @@ "source": [ "pipeline_fast_dl.fit(test_data).transform(test_data).show(truncate=False)" ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "colab": {}, - "colab_type": "code", - "id": "cBbxU2aytWpX" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -337,8 +206,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/model-downloader/ModelDownloaderExample.ipynb b/examples/python/annotation/text/english/model-downloader/ModelDownloaderExample.ipynb index a00499e6fa9462..d1d8bb281704c2 100644 --- a/examples/python/annotation/text/english/model-downloader/ModelDownloaderExample.ipynb +++ b/examples/python/annotation/text/english/model-downloader/ModelDownloaderExample.ipynb @@ -1,105 +1,48 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "M9ps18GDtt5j" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/model-downloader/Running_Pretrained_pipelines.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/model-downloader/ModelDownloaderExample.ipynb)\n", + "\n", + "# Running Pretrained models\n", "\n", - "## 0. Colab Setup" + "In the following example, we walk-through different use cases of some of our Pretrained models and pipelines which could be used off the shelf.\n", + "\n", + "There is BasicPipeline which will return tokens, normalized tokens, lemmas and part of speech tags. The AdvancedPipeline will return same as the BasicPipeline plus Stems, Spell Checked tokens and NER entities using the CRF model. 
All the pipelines and pre trained models are downloaded from internet at run time hence would require internet access. " ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gm0tZvJdtvgx", - "outputId": "442ebfa3-d968-4d74-b63d-2df16ee7de85" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 14:20:06-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 14:20:06-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 14:20:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 14:20:07 (26.6 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 54 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 68.6 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 57.0 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "id": "cyumVtb_tt5k" - }, - "source": [ - "## Runing Pretrained models\n", - "\n", - "In the following example, we walk-through different use cases of some of our Pretrained models and pipelines which could be used off the shelf.\n", - "\n", - "There is BasicPipeline which will return tokens, normalized tokens, lemmas and part of speech tags. The AdvancedPipeline will return same as the BasicPipeline plus Stems, Spell Checked tokens and NER entities using the CRF model. All the pipelines and pre trained models are downloaded from internet at run time hence would require internet access. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Emh6GE1Ctt5l" - }, + "metadata": {}, "source": [ "#### 1. 
Call necessary imports and create the spark session" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "DYPs5MTqtt5m", - "outputId": "138fe46c-38dd-41a6-e975-ac5cba579676" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "3.8.16 (default, Dec 7 2022, 01:12:13) \n", + "3.8.10 (default, Jun 4 2021, 15:09:15) \n", "[GCC 7.5.0]\n" ] } @@ -120,21 +63,15 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nfoLeCq9tt5r", - "outputId": "d2d6a8e3-4e42-46cb-df7c-a213840e4358" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "Spark NLP version: 4.2.6\n", - "Apache Spark version: 3.2.3\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -147,19 +84,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "rr6G_81ftt5v" - }, + "metadata": {}, "source": [ "#### 2. Create a dummy spark dataframe" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "Ur8mKlQTtt5v" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "\n", @@ -173,27 +106,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TiWAq7-tt5z" - }, + "metadata": {}, "source": [ "#### 3. 
We use predefined BasicPipeline in order to annotate a dataframe with it" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OtpSOtKStt50", - "outputId": "857a4655-6aab-4e91-de62-7a2ec6eecdfd" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "explain_document_ml download started this may take some time.\n", "Approx size to download 9.2 MB\n", @@ -219,26 +144,17 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "dT-FqWFOtt54" - }, + "metadata": {}, "source": [ "#### We can also annotate a single string" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "TQ76lDOTtt55", - "outputId": "29846ace-8e69-4bd8-a0f1-cd73324a1e37" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "{'document': ['This world is made up of good and bad things'],\n", @@ -286,8 +202,9 @@ " 'sentence': ['This world is made up of good and bad things']}" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 8 + "output_type": "execute_result" } ], "source": [ @@ -297,27 +214,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "6TG2d8N3tt5_" - }, + "metadata": {}, "source": [ "#### 4. 
Now we intend to use one of the fast pretrained models such as Preceptron model which is a POS model trained with ANC American Corpus " ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zSGo6qZbtt6A", - "outputId": "bd301220-3155-49d1-98e5-e5612c546687" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "glove_100d download started this may take some time.\n", "Approximate size to download 145.3 MB\n", @@ -363,27 +272,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "LPPaP1sxtt6G" - }, + "metadata": {}, "source": [ "#### 5. Now we proceed to download a Fast CRF Named Entity Recognitionl which is trained with Glove embeddings. Then, we retrieve the `advancedPipeline` and combine these models to use them appropriately meeting their requirements." ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "MXo_zTNatt6H", - "outputId": "9109da40-024c-40f9-c776-7d3fb70a4d18" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "ner_crf download started this may take some time.\n", "Approximate size to download 10.2 MB\n", @@ -411,27 +312,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "5nuR8cxytt6L" - }, + "metadata": {}, "source": [ "#### 6. 
Finally, lets try a pre trained sentiment analysis pipeline" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "CnjUFYqctt6L", - "outputId": "24e51559-8a3c-4a6a-a354-f4a42697bc12" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "analyze_sentiment download started this may take some time.\n", "Approx size to download 4.9 MB\n", @@ -439,7 +332,6 @@ ] }, { - "output_type": "execute_result", "data": { "text/plain": [ "{'checked': ['This', 'is', 'a', 'good', 'movie', '!!!'],\n", @@ -449,22 +341,14 @@ " 'sentence': ['This is a good movie!!!']}" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 11 + "output_type": "execute_result" } ], "source": [ "PretrainedPipeline(\"analyze_sentiment\").annotate(\"This is a good movie!!!\")" ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "H0sOfKV9tt6P" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -486,8 +370,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/onto-recognize-entities/Named entity recognition - OntoNotes.ipynb b/examples/python/annotation/text/english/named-entity-recognition/Named entity recognition - OntoNotes.ipynb similarity index 67% rename from examples/python/annotation/text/english/onto-recognize-entities/Named entity recognition - OntoNotes.ipynb rename to examples/python/annotation/text/english/named-entity-recognition/Named entity recognition - OntoNotes.ipynb index 719ab336886089..a8d1c13d013b00 100644 --- a/examples/python/annotation/text/english/onto-recognize-entities/Named entity recognition - OntoNotes.ipynb +++ 
b/examples/python/annotation/text/english/named-entity-recognition/Named entity recognition - OntoNotes.ipynb @@ -1,42 +1,21 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "zdcOk-JUul_k" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/onto-recognize-entities/Named%20entity%20recognition%20-%20OntoNotes.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/named-entity-recognition/Named%20entity%20recognition%20-%20OntoNotes.ipynb)\n", "\n", - "## 0. Colab Setup" + "# Named Entity Recognition on The OntoNotes Data Set" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 66903, - "status": "ok", - "timestamp": 1589250871656, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "fTnahztvupc-", - "outputId": "ac813ba7-18fc-4946-e228-dc22108c0559" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -45,35 +24,29 @@ "openjdk version \"1.8.0_252\"\n", "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 57kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 43.7MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 122kB 2.7MB/s \n", - "\u001B[?25h" + "\u001b[K |████████████████████████████████| 215.7MB 57kB/s \n", + "\u001b[K |████████████████████████████████| 204kB 43.7MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[K |████████████████████████████████| 122kB 2.7MB/s \n", + "\u001b[?25h" ] } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ZPq2s9ePul_p" - }, + "metadata": {}, "source": [ "## Named-entity recognition with Deep Learning" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ax87Emhrul_s" - }, + "metadata": {}, "source": [ "

Named-Entity recognition is a well-known technique in information extraction; it is also known as entity identification, entity chunking and entity extraction. Knowing the relevant tags for each article helps in automatically categorizing the articles in defined hierarchies and enables smooth content discovery. This pipeline is based on the NerDLApproach annotator with Char CNN - BiLSTM and GloVe Embeddings on the OntoNotes corpus and supports the identification of 18 entities.

The following NER types are supported in this pipeline:

| Type | Description |
|------|-------------|
| PERSON | People, including fictional. |
| NORP | Nationalities or religious or political groups. |
| FAC | Buildings, airports, highways, bridges, etc. |
| ORG | Companies, agencies, institutions, etc. |
| GPE | Countries, cities, states. |
| LOC | Non-GPE locations, mountain ranges, bodies of water. |
| PRODUCT | Objects, vehicles, foods, etc. (Not services.) |
| EVENT | Named hurricanes, battles, wars, sports events, etc. |
| WORK_OF_ART | Titles of books, songs, etc. |
| LAW | Named documents made into laws. |
| LANGUAGE | Any named language. |
| DATE | Absolute or relative dates or periods. |
| TIME | Times smaller than a day. |
| PERCENT | Percentage, including “%”. |
| MONEY | Monetary values, including unit. |
| QUANTITY | Measurements, as of weight or distance. |
| ORDINAL | “first”, “second”, etc. |
| CARDINAL | Numerals that do not fall under another type. |
" ] @@ -81,38 +54,19 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 136725, - "status": "ok", - "timestamp": 1589250941490, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "2CBOsZONul_w", - "outputId": "14beca62-8476-4c1d-90c7-3d3f748f175c" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 2.5.0\n", - "Apache Spark version: 2.4.4\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.0.2\n" ] } ], "source": [ - "import sparknlp \n", + "import sparknlp\n", "\n", "spark = sparknlp.start()\n", "\n", @@ -123,11 +77,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "2qQl-zeVumAC" - }, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.pretrained import PretrainedPipeline\n", @@ -136,10 +86,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "XGPJqRrsumAM" - }, + "metadata": {}, "source": [ "Now, we load a `onto_recognize_entities_sm` pipeline model which contains the following annotators:\n", "Tokenizer, GloVe embeddings, and NER model trained by Deep Learning" @@ -148,34 +95,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 165875, - "status": "ok", - "timestamp": 1589250970651, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "VOZ1IehqumAO", - "outputId": "a286d459-40df-4f36-b403-819129578b1f", - "scrolled": true - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"onto_recognize_entities_sm download started this may take some time.\n", - "Approx size to download 159 MB\n", + "Approx size to download 160.1 MB\n", "[OK!]\n" ] } @@ -186,20 +113,14 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "SsJkiJDzumAW" - }, + "metadata": {}, "source": [ "NOTE: We are using `onto_recognize_entities_sm` which is the smaller version. You can use `onto_recognize_entities_lg` which is a larger pipeline model if you have enough resources." ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "rxrqNQMBumAY" - }, + "metadata": {}, "source": [ "Let's annotate our `text` by pretrained `pipeline`:" ] @@ -207,11 +128,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "CU9eVRdyumAa" - }, + "metadata": {}, "outputs": [], "source": [ "text = '''Barclays misled shareholders and the public about one of the biggest investments in the bank's history, a BBC Panorama investigation has found.\n", @@ -226,10 +143,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "qFXlpBUuumAj" - }, + "metadata": {}, "source": [ "We can see the output of each annotator below. This one is doing so many things at once!" 
] @@ -237,26 +151,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 166705, - "status": "ok", - "timestamp": 1589250971491, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "LLoeWeb8umAl", - "outputId": "9266575f-751f-40ad-cfc4-d942b2b40388" - }, + "metadata": {}, "outputs": [ { "data": { @@ -264,10 +159,8 @@ "['entities', 'document', 'token', 'ner', 'embeddings', 'sentence']" ] }, - "execution_count": 6, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" } ], @@ -278,26 +171,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 139 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 166698, - "status": "ok", - "timestamp": 1589250971492, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "LfbCg8lwumAt", - "outputId": "291a9913-7b01-406d-fdcf-7908ebdfceb1" - }, + "metadata": {}, "outputs": [ { "data": { @@ -310,10 +184,8 @@ " \"The S&P 500's price to earnings multiple is 71% higher than Apple's, and if Apple were simply valued at the same multiple, its share price would be $840, which is 52% higher than its current price.\"]" ] }, - "execution_count": 7, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" } ], @@ -324,26 +196,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 476 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 166691, - "status": "ok", - "timestamp": 1589250971493, - "user": { - "displayName": "Christian Kasim 
Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "BAIXR3H7umA3", - "outputId": "2ae6fcad-e4cd-40e4-ca50-6d706f035934" - }, + "metadata": {}, "outputs": [ { "data": { @@ -354,33 +207,29 @@ " '2008',\n", " 'Manchester City',\n", " 'Sheikh Mansour',\n", - " 'more than £3bn',\n", " 'BBC',\n", " 'Barclays',\n", " 'British',\n", " 'Abu Dhabi',\n", " 'Barclays',\n", + " '\"a',\n", " 'RBS',\n", - " 'Lloyds TSB',\n", - " 'Barclays',\n", + " 'Lloyds TSB, Barclays',\n", " '2008',\n", - " '7bn',\n", " 'Gulf',\n", " 'Qatar',\n", - " 'Abu Dhabi',\n", + " 'Abu Dhabi.',\n", " 'S&P',\n", " \"500's\",\n", " '71%',\n", - " 'Apple',\n", + " \"Apple's,\",\n", " 'Apple',\n", " '$840',\n", " '52%']" ] }, - "execution_count": 8, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" } ], @@ -391,26 +240,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 166684, - "status": "ok", - "timestamp": 1589250971494, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "PC5oWvzVumBC", - "outputId": "3183e81f-95ab-4981-cfe0-fa79f65b6bfd" - }, + "metadata": {}, "outputs": [ { "data": { @@ -429,17 +259,14 @@ " ('investments', 'O'),\n", " ('in', 'O'),\n", " ('the', 'O'),\n", - " ('bank', 'O'),\n", - " (\"'s\", 'O'),\n", - " ('history', 'O'),\n", - " (',', 'O'),\n", + " (\"bank's\", 'O'),\n", + " ('history,', 'O'),\n", " ('a', 'O'),\n", " ('BBC', 'B-ORG'),\n", " ('Panorama', 'I-ORG'),\n", " ('investigation', 'O'),\n", " ('has', 'O'),\n", - " ('found', 'O'),\n", - " ('.', 'O'),\n", + " ('found.', 'O'),\n", " ('The', 'O'),\n", " ('bank', 'O'),\n", " ('announced', 'O'),\n", @@ -455,19 +282,16 @@ " ('agreed', 'O'),\n", " ('to', 'O'),\n", " ('invest', 'O'),\n", - " 
('more', 'B-MONEY'),\n", - " ('than', 'I-MONEY'),\n", - " ('£', 'I-MONEY'),\n", - " ('3bn', 'I-MONEY'),\n", - " ('.', 'O'),\n", + " ('more', 'O'),\n", + " ('than', 'O'),\n", + " ('£3bn.', 'O'),\n", " ('But', 'O'),\n", " ('the', 'O'),\n", " ('BBC', 'B-ORG'),\n", " ('found', 'O'),\n", " ('that', 'O'),\n", " ('the', 'O'),\n", - " ('money', 'O'),\n", - " (',', 'O'),\n", + " ('money,', 'O'),\n", " ('which', 'O'),\n", " ('helped', 'O'),\n", " ('Barclays', 'B-ORG'),\n", @@ -476,16 +300,14 @@ " ('bailout', 'O'),\n", " ('by', 'O'),\n", " ('British', 'B-NORP'),\n", - " ('taxpayers', 'O'),\n", - " (',', 'O'),\n", + " ('taxpayers,', 'O'),\n", " ('actually', 'O'),\n", " ('came', 'O'),\n", " ('from', 'O'),\n", " ('the', 'O'),\n", " ('Abu', 'B-GPE'),\n", " ('Dhabi', 'I-GPE'),\n", - " ('government', 'O'),\n", - " ('.', 'O'),\n", + " ('government.', 'O'),\n", " ('Barclays', 'B-ORG'),\n", " ('said', 'O'),\n", " ('the', 'O'),\n", @@ -494,19 +316,15 @@ " ('its', 'O'),\n", " ('accounts', 'O'),\n", " ('was', 'O'),\n", - " ('\"', 'O'),\n", - " ('a', 'O'),\n", + " ('\"a', 'B-PERSON'),\n", " ('drafting', 'O'),\n", - " ('error', 'O'),\n", - " ('\"', 'O'),\n", - " ('.', 'O'),\n", + " ('error\".', 'O'),\n", " ('Unlike', 'O'),\n", " ('RBS', 'B-ORG'),\n", " ('and', 'O'),\n", " ('Lloyds', 'B-ORG'),\n", - " ('TSB', 'I-ORG'),\n", - " (',', 'O'),\n", - " ('Barclays', 'B-ORG'),\n", + " ('TSB,', 'I-ORG'),\n", + " ('Barclays', 'I-ORG'),\n", " ('narrowly', 'O'),\n", " ('avoided', 'O'),\n", " ('having', 'O'),\n", @@ -523,13 +341,11 @@ " ('was', 'O'),\n", " ('rescued', 'O'),\n", " ('by', 'O'),\n", - " ('£', 'O'),\n", - " ('7bn', 'B-MONEY'),\n", + " ('£7bn', 'O'),\n", " ('worth', 'O'),\n", " ('of', 'O'),\n", " ('new', 'O'),\n", - " ('investment', 'O'),\n", - " (',', 'O'),\n", + " ('investment,', 'O'),\n", " ('most', 'O'),\n", " ('of', 'O'),\n", " ('which', 'O'),\n", @@ -542,8 +358,7 @@ " ('Qatar', 'B-GPE'),\n", " ('and', 'O'),\n", " ('Abu', 'B-GPE'),\n", - " ('Dhabi', 'I-GPE'),\n", - " ('.', 'O'),\n", + " 
('Dhabi.', 'I-GPE'),\n", " ('The', 'O'),\n", " ('S&P', 'B-ORG'),\n", " (\"500's\", 'B-DATE'),\n", @@ -556,9 +371,7 @@ " ('%', 'I-PERCENT'),\n", " ('higher', 'O'),\n", " ('than', 'O'),\n", - " ('Apple', 'B-ORG'),\n", - " (\"'s\", 'O'),\n", - " (',', 'O'),\n", + " (\"Apple's,\", 'B-CARDINAL'),\n", " ('and', 'O'),\n", " ('if', 'O'),\n", " ('Apple', 'B-ORG'),\n", @@ -568,8 +381,7 @@ " ('at', 'O'),\n", " ('the', 'O'),\n", " ('same', 'O'),\n", - " ('multiple', 'O'),\n", - " (',', 'O'),\n", + " ('multiple,', 'O'),\n", " ('its', 'O'),\n", " ('share', 'O'),\n", " ('price', 'O'),\n", @@ -585,31 +397,17 @@ " ('than', 'O'),\n", " ('its', 'O'),\n", " ('current', 'O'),\n", - " ('price', 'O'),\n", - " ('.', 'O')]" + " ('price.', 'O')]" ] }, - "execution_count": 9, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(zip(result['token'], result['ner']))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "2kjeflxwumBM" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -632,8 +430,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/named-entity-recognition/ZeroShot_NER.ipynb b/examples/python/annotation/text/english/named-entity-recognition/ZeroShot_NER.ipynb index 220ea367064c0a..85dc68c4e9cecf 100644 --- a/examples/python/annotation/text/english/named-entity-recognition/ZeroShot_NER.ipynb +++ b/examples/python/annotation/text/english/named-entity-recognition/ZeroShot_NER.ipynb @@ -3,18 +3,11 @@ { "attachments": {}, "cell_type": "markdown", - "id": "db5f4f9a-7776-42b3-8758-85624d4c15ea", - "metadata": {}, - "source": [ - "![JohnSnowLabs](https://johnsnowlabs.com/assets/images/logo.png)" - ] - }, - { - "attachments": {}, - "cell_type": 
"markdown", - "id": "21e9eafb", + "id": "36f39d62", "metadata": {}, "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/named-entity-recognition/ZeroShot_NER.ipynb)" ] }, @@ -42,7 +35,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q pyspark==3.3.0 spark-nlp==4.3.0" + "!pip install -q pyspark==3.3.0 spark-nlp==4.3.1" ] }, { @@ -55,7 +48,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 4.2.8\n", + "Spark NLP version: 4.3.1\n", "Apache Spark version: 3.3.0\n" ] }, @@ -69,7 +62,7 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", @@ -85,11 +78,12 @@ " " ], "text/plain": [ - "" + "" ] }, + "execution_count": null, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ @@ -135,7 +129,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "finner_roberta_zeroshot download started this may take some time.\n", + "roberta_base_qa_squad2 download started this may take some time.\n", + "Approximate size to download 442.6 MB\n", "[OK!]\n" ] } @@ -249,7 +244,7 @@ }, "gpuClass": "standard", "kernelspec": { - "display_name": "nlpdev", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -263,11 +258,6 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" - }, - "vscode": { - "interpreter": { - "hash": "cf73c0c97d90b2660ff29b0c9bed4b851524d3484a00df4555e25832aa5cf188" - } } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/pretrained-pipelines/Explain Document DL.ipynb b/examples/python/annotation/text/english/pretrained-pipelines/Explain Document DL.ipynb new file mode 100644 index 00000000000000..54f6568bb88ad5 --- /dev/null +++ b/examples/python/annotation/text/english/pretrained-pipelines/Explain Document DL.ipynb @@ -0,0 +1,281 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/pretrained-pipelines/Explain%20Document%20DL.ipynb)\n", + "\n", + "# Explain Documents with Deep Learning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "openjdk version \"1.8.0_252\"\n", + "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", 
+ "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", + "\u001b[K |████████████████████████████████| 215.7MB 55kB/s \n", + "\u001b[K |████████████████████████████████| 204kB 45.2MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[K |████████████████████████████████| 122kB 9.4MB/s \n", + "\u001b[?25h" + ] + } + ], + "source": [ + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we import the necessary modules." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Spark NLP version: \", sparknlp.version())\n", + "print(\"Apache Spark version: \", spark.version)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.pretrained import PretrainedPipeline\n", + "from sparknlp.base import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we load a pipeline model which contains the following annotators:\n", + "Tokenizer, Deep Sentence Detector, Lemmatizer, Stemmer, Part of Speech (POS) and Context Spell Checker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "explain_document_dl download started this may take some time.\n", + "Approx size to download 169.4 MB\n", + "[OK!]\n" + ] + } + ], + "source": [ + "pipeline = PretrainedPipeline('explain_document_dl')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We simply annotate our 
text (string) and the pipeline does the rest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text = 'He would love to visit many beautful cities wth you. He lives in an amazing country.'\n", + "result = pipeline.annotate(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see the output of each annotator below. This one is doing so many things at once!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['entities',\n", + " 'stem',\n", + " 'checked',\n", + " 'lemma',\n", + " 'document',\n", + " 'pos',\n", + " 'token',\n", + " 'ner',\n", + " 'embeddings',\n", + " 'sentence']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(result.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['He would love to visit many beautful cities wth you.',\n", + " 'He lives in an amazing country.']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result['sentence']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['He',\n", + " 'would',\n", + " 'love',\n", + " 'to',\n", + " 'visit',\n", + " 'many',\n", + " 'beautiful',\n", + " 'city',\n", + " 'wth',\n", + " 'you',\n", + " '.',\n", + " 'He',\n", + " 'life',\n", + " 'in',\n", + " 'an',\n", + " 'amazing',\n", + " 'country',\n", + " '.']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result['lemma']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('He', 'PRP'),\n", + " ('would', 'MD'),\n", + " 
('love', 'VB'),\n", + " ('to', 'TO'),\n", + " ('visit', 'VB'),\n", + " ('many', 'JJ'),\n", + " ('beautiful', 'JJ'),\n", + " ('cities', 'NNS'),\n", + " ('wth', 'NN'),\n", + " ('you', 'PRP'),\n", + " ('.', '.'),\n", + " ('He', 'PRP'),\n", + " ('lives', 'VBZ'),\n", + " ('in', 'IN'),\n", + " ('an', 'DT'),\n", + " ('amazing', 'JJ'),\n", + " ('country', 'NN'),\n", + " ('.', '.')]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(zip(result['checked'], result['pos']))" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Explain Document DL.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/annotation/text/english/match-datetime-pipeline/Pretrained-MatchDateTime-Pipeline.ipynb b/examples/python/annotation/text/english/pretrained-pipelines/Pretrained-MatchDateTime-Pipeline.ipynb similarity index 51% rename from examples/python/annotation/text/english/match-datetime-pipeline/Pretrained-MatchDateTime-Pipeline.ipynb rename to examples/python/annotation/text/english/pretrained-pipelines/Pretrained-MatchDateTime-Pipeline.ipynb index 3c746d9a783ed3..790531850b7582 100644 --- a/examples/python/annotation/text/english/match-datetime-pipeline/Pretrained-MatchDateTime-Pipeline.ipynb +++ b/examples/python/annotation/text/english/pretrained-pipelines/Pretrained-MatchDateTime-Pipeline.ipynb @@ -1,42 +1,21 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "gML0pvu2qlbu" - }, + "metadata": {}, "source": [ 
"![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/match-datetime-pipeline/Pretrained-MatchDateTime-Pipeline.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/pretrained-pipelines/Pretrained-MatchDateTime-Pipeline.ipynb)\n", "\n", - "## 0. Colab Setup" + "# Use pretrained `match_datetime` Pipeline" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 187 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 93559, - "status": "ok", - "timestamp": 1589249898570, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "oVCjj9-kqtLc", - "outputId": "4e56a4df-8649-41d8-c47c-c3085a2ca6d5" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -45,38 +24,28 @@ "openjdk version \"1.8.0_252\"\n", "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 60kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 48.7MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 122kB 3.3MB/s \n", - "\u001B[?25hopenjdk version \"1.8.0_252\"\n", + "\u001b[K |████████████████████████████████| 215.7MB 60kB/s \n", + "\u001b[K |████████████████████████████████| 204kB 48.7MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[K |████████████████████████████████| 122kB 3.3MB/s \n", + "\u001b[?25hopenjdk version \"1.8.0_252\"\n", "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n" ] } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "uq86_c_aqlbx" - }, - "source": [ - "# Use pretrained `match_datetime` Pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "wzvUCOmYqlb1" - }, + "metadata": {}, "source": [ + "The pipeline consists of:\n", + "\n", "* DocumentAssembler\n", "* SentenceDetector\n", "* Tokenizer\n", @@ -86,11 +55,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "CZ4l-YI4qlb7" - }, + "metadata": {}, "outputs": [], "source": [ "import sys\n", @@ -110,10 +75,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "zwBh1szrqlcJ" - }, + "metadata": {}, "source": [ "### Let's create a Spark Session for our app" ] @@ -121,35 +83,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 85 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 93542, - "status": "ok", - "timestamp": 1589249898572, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "PXZGs275qlcM", - "outputId": "dfd95116-4a09-4ddf-8b5b-e191dcdbb2ec" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 2.5.0\n", - "Apache Spark version: 2.4.4\n", - "Spark NLP version: 2.5.0\n", 
- "Apache Spark version: 2.4.4\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -163,36 +104,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 96514, - "status": "ok", - "timestamp": 1589249901555, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "-W3lzq-qqlcd", - "outputId": "f5561aa4-3b82-4dce-f138-db46178a1667" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "match_datetime download started this may take some time.\n", - "Approx size to download 12.9 KB\n", - "[OK!]\n", - "match_datetime download started this may take some time.\n", - "Approx size to download 12.9 KB\n", + "Approx size to download 9.7 KB\n", "[OK!]\n" ] } @@ -204,11 +123,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "yzP2tZ_2qlcn" - }, + "metadata": {}, "outputs": [], "source": [ "result=pipeline.annotate(\"Let's meet on 20th of February.\")" @@ -217,37 +132,16 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 96493, - "status": "ok", - "timestamp": 1589249901556, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "1--gYXBoqlct", - "outputId": "aacb0da1-23c1-4def-cfe3-b9b99cea8ab0" - }, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['2020/02/20']" + "['2023/02/20']" ] }, - "execution_count": 13, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" } ], @@ -258,11 +152,7 @@ { "cell_type": "code", 
"execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "BQKiE2Puqldg" - }, + "metadata": {}, "outputs": [], "source": [ "dfTest = spark.createDataFrame([\"I would like to come over and see you in 01/02/2019.\"], StringType()).toDF(\"text\")\n" @@ -271,11 +161,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "esgPWuhxqldm" - }, + "metadata": {}, "outputs": [], "source": [ "result=pipeline.transform(dfTest)" @@ -284,26 +170,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 101024, - "status": "ok", - "timestamp": 1589249906109, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "Lohgu6rbqldr", - "outputId": "e55c5f2e-6d3d-4d4f-a08f-136517bd52ce" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -343,8 +210,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/match-pattern-pipeline/Pretrained-MatchPattern-Pipeline.ipynb b/examples/python/annotation/text/english/pretrained-pipelines/Pretrained-MatchPattern-Pipeline.ipynb similarity index 51% rename from examples/python/annotation/text/english/match-pattern-pipeline/Pretrained-MatchPattern-Pipeline.ipynb rename to examples/python/annotation/text/english/pretrained-pipelines/Pretrained-MatchPattern-Pipeline.ipynb index f37984c6e1c289..056be1d309f59c 100644 --- a/examples/python/annotation/text/english/match-pattern-pipeline/Pretrained-MatchPattern-Pipeline.ipynb +++ b/examples/python/annotation/text/english/pretrained-pipelines/Pretrained-MatchPattern-Pipeline.ipynb @@ -1,42 +1,21 @@ { 
"cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "5Wzk8GWnsQQi" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/match-pattern-pipeline/Pretrained-MatchPattern-Pipeline.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/pretrained-pipelines/Pretrained-MatchPattern-Pipeline.ipynb)\n", "\n", - "## 0. Colab Setup" + "# Use pretrained `match_pattern` Pipeline" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 65174, - "status": "ok", - "timestamp": 1589250264692, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "ImZIKSr3sUHt", - "outputId": "06c4a08a-39c1-40f6-c76e-64404363a3e5" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -45,36 +24,25 @@ "openjdk version \"1.8.0_252\"\n", "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 50kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 37.8MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 122kB 2.9MB/s \n", - "\u001B[?25h" + "\u001b[K |████████████████████████████████| 215.7MB 50kB/s \n", + "\u001b[K |████████████████████████████████| 204kB 37.8MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[K |████████████████████████████████| 122kB 2.9MB/s \n", + "\u001b[?25h" ] } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "RvS7xyNgsQQk" - }, - "source": [ - "# Use pretrained `match_pattern` Pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "0drIUpwqsQQl" - }, + "metadata": {}, "source": [ + "It consists of:\n", "\n", "* DocumentAssembler\n", "* SentenceDetector\n", @@ -84,12 +52,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "2bOIPKPYsQQm" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sys\n", @@ -112,44 +76,22 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "jJIZWbxfsQQt" - }, + "metadata": {}, "source": [ "### Let's create a Spark Session for our app" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 144701, - "status": "ok", - "timestamp": 1589250344253, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "Bq5s2HZ9sQQv", - "outputId": "7db672db-7d1f-406d-8093-f0e5395c8aba" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 2.6.0\n", - "Apache Spark version: 2.4.4\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.0.2\n" ] } ], @@ -162,10 +104,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "6PflkNDIsQQ1" - }, + "metadata": {}, "source": [ "This Pipeline can extract `phone numbers` in these formats:\n", "```\n", @@ -184,34 +123,15 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 148727, - "status": "ok", - "timestamp": 1589250348299, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "QhgIJnUtsQQ2", - "outputId": "7f9d74e9-50be-4b53-df33-ff9a349c99f0" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "match_pattern download started this may take some time.\n", - "Approx size to download 19.6 KB\n", + "Approx size to download 28.3 KB\n", "[OK!]\n" ] } @@ -222,12 +142,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "Lham7OvgsQQ6" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "result=pipeline.annotate(\"You should call Mr. 
Jon Doe at +33 1 79 01 22 89\")" @@ -235,27 +151,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 148685, - "status": "ok", - "timestamp": 1589250348301, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "aeZQ49HtsQQ-", - "outputId": "24d4631e-d6db-4a82-b65e-78e471330839" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -263,7 +160,7 @@ "['+33 1 79 01 22 89']" ] }, - "execution_count": 5, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -274,12 +171,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "nuPASMc9sQRE" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "result=pipeline.annotate(\"Ring me up dude! 
+1-334-179-1466\")" @@ -287,27 +180,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 793, - "status": "ok", - "timestamp": 1589250424332, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "xtO7DD0MsQRK", - "outputId": "eba847d0-4cf8-4ff5-f79b-3f0d0dea7e44" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -315,7 +189,7 @@ "['+1-334-179-1466']" ] }, - "execution_count": 7, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -323,17 +197,6 @@ "source": [ "result['regex']" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "5-IYQUsWsQRP" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -356,8 +219,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/explain-document-ml/explain_document_ml.ipynb b/examples/python/annotation/text/english/pretrained-pipelines/explain_document_ml.ipynb similarity index 89% rename from examples/python/annotation/text/english/explain-document-ml/explain_document_ml.ipynb rename to examples/python/annotation/text/english/pretrained-pipelines/explain_document_ml.ipynb index b31a23a38cd171..bbfffb677aa6e5 100644 --- a/examples/python/annotation/text/english/explain-document-ml/explain_document_ml.ipynb +++ b/examples/python/annotation/text/english/pretrained-pipelines/explain_document_ml.ipynb @@ -1,32 +1,25 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "9GvBuvbHphxE" - }, + "metadata": {}, "source": [ 
"![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/explain-document-ml/explain_document_ml.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/pretrained-pipelines/explain_document_ml.ipynb)\n", "\n", - "## 0. Colab Setup" + "# Use pretrained `explain_document_ml` Pipeline" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5zlh6MnQpl26", - "outputId": "e78f64d3-89d4-444e-92ad-d1efed8b8e24" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "--2022-12-23 12:46:29-- http://setup.johnsnowlabs.com/colab.sh\n", "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", @@ -50,32 +43,21 @@ "\n", "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 52 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 63.0 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 50.0 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" + "\u001b[K |████████████████████████████████| 281.5 MB 52 kB/s \n", + "\u001b[K |████████████████████████████████| 453 kB 63.0 MB/s \n", + "\u001b[K |████████████████████████████████| 199 kB 50.0 MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" ] } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "id": "ThpzCfo3phxF" - }, - "source": [ - "# Use pretrained `explain_document_ml` Pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "29fSDt6uphxG" - }, + "metadata": {}, "source": [ "### Stages\n", "\n", @@ -90,10 +72,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "tSOcfkWQphxI" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -102,30 +82,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "FbgOOhTNphxT" - }, + "metadata": {}, "source": [ "### Let's create a Spark Session for our app" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ooCPVDd5phxU", - "outputId": "aa3e0bf5-4f07-48af-b744-dc758acd7b10" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "Spark NLP version: 4.2.6\n", - "Apache Spark version: 3.2.3\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -139,19 +111,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "9ExcAgCAphxb" - }, + "metadata": {}, "source": [ "#### This is our testing document, we'll use it to exemplify all different pipeline stages." 
] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "0g_N8k2Gphxc" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "testDoc = spark.createDataFrame([\n", @@ -164,18 +132,12 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "aZUUGsFiphxi", - "outputId": "67c6ac9e-ec13-46bb-b117-b051af4cce3f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -192,18 +154,12 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "LyOW2GbUphxn", - "outputId": "be4f49cd-e591-414a-aaaa-af017d177d44" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "explain_document_ml download started this may take some time.\n", "Approx size to download 9.2 MB\n", @@ -217,27 +173,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "vJcN-nkZphxv" - }, + "metadata": {}, "source": [ "#### We are not interested in handling big datasets, let's switch to LightPipelines for speed." 
] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e0RqKnZ7phxx", - "outputId": "f656f962-c8d2-4379-bc33-f01ab50d5473" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "root\n", " |-- text: string (nullable = true)\n", @@ -336,27 +284,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "qSvvdw_5phx2" - }, + "metadata": {}, "source": [ "#### Let's analyze these results - first let's see what sentences we detected" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "MBBIBR68phx3", - "outputId": "b1bfaa12-b5b4-4682-bf88-9764053a5509" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|result |\n", @@ -373,27 +313,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "EZ_h7d5aphx-" - }, + "metadata": {}, "source": [ "#### Now let's see how those sentences were tokenized" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qKkjO_Qvphx-", - "outputId": "3574ade0-654e-4d6a-d690-2a4f0a11e530" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ 
"+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|result |\n", @@ -410,27 +342,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "EHxFY0cOphyE" - }, + "metadata": {}, "source": [ "#### Notice some spelling errors? the pipeline takes care of that as well" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e9gZbHGuphyF", - "outputId": "229c9e80-0339-4f2b-fbd5-4d6d83a48061" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|spell |\n", @@ -447,27 +371,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "7MhN2mknphyK" - }, + "metadata": {}, "source": [ "#### Now let's see the lemmas" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "eJX66qMKphyL", - "outputId": "3432d095-cfc5-4e2e-bb55-7491268f7a49" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ 
"+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|result |\n", @@ -484,27 +400,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "l-ftnStIphyW" - }, + "metadata": {}, "source": [ "#### Let's check the stems, any difference with the lemmas shown bebore?" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "k8XiWIwNphyY", - "outputId": "fa9eac10-454e-4793-a48c-cf7a187299dd" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|result |\n", @@ -521,27 +429,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "yYCYaoT2phyc" - }, + "metadata": {}, "source": [ "#### Let's look at Part Of Speech (POS) results" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hGpwnZ4Yphye", - "outputId": "c7c01934-c24e-447c-d926-280316d3d00d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|result |\n", @@ -555,17 +455,6 @@ "source": [ "result.select(\"pos.result\").show(1, False)\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": 
"3dds_6bwphyl" - }, - "outputs": [], - "source": [ - " " - ] } ], "metadata": { @@ -587,8 +476,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/question-answering/Question_Answering_and_Summarization_with_T5.ipynb b/examples/python/annotation/text/english/question-answering/Question_Answering_and_Summarization_with_T5.ipynb index 7d144f36eb52af..c07130e4378302 100644 --- a/examples/python/annotation/text/english/question-answering/Question_Answering_and_Summarization_with_T5.ipynb +++ b/examples/python/annotation/text/english/question-answering/Question_Answering_and_Summarization_with_T5.ipynb @@ -12,7 +12,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/example/python/annotation/text/english/question-answering/Question_Answering_and_Summarization_with_T5.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/question-answering/Question_Answering_and_Summarization_with_T5.ipynb)" ] }, { @@ -90,7 +90,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q pyspark==3.2.1 spark-nlp==4.3.0" + "!pip install -q pyspark==3.2.1 spark-nlp==4.3.1" ] }, { @@ -102,7 +102,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version 4.3.0\n", + "Spark NLP version 4.3.1\n", "Apache Spark version: 3.2.1\n" ] }, @@ -624,13 +624,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13 (default, Mar 28 2022, 11:38:47) \n[GCC 7.5.0]" - }, - "vscode": { - "interpreter": { - "hash": 
"cf73c0c97d90b2660ff29b0c9bed4b851524d3484a00df4555e25832aa5cf188" - } + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/regex-matcher/Matching_Text_with_RegexMatcher.ipynb b/examples/python/annotation/text/english/regex-matcher/Matching_Text_with_RegexMatcher.ipynb index 791c4f45d29caa..f65d387b46ca90 100644 --- a/examples/python/annotation/text/english/regex-matcher/Matching_Text_with_RegexMatcher.ipynb +++ b/examples/python/annotation/text/english/regex-matcher/Matching_Text_with_RegexMatcher.ipynb @@ -7,7 +7,7 @@ "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/example/python/annotation/text/english/regex-matcher/Matching_Text_with_RegexMatcher.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/regex-matcher/Matching_Text_with_RegexMatcher.ipynb)\n", "\n", "\n", "# **Matching Text with RegexMatcher**" @@ -34,7 +34,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q pyspark==3.3.0 spark-nlp==4.3.0" + "!pip install -q pyspark==3.3.0 spark-nlp==4.3.1" ] }, { @@ -46,7 +46,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 4.2.8\n", + "Spark NLP version: 4.3.1\n", "Apache Spark version: 3.3.0\n" ] }, @@ -60,7 +60,7 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", @@ -76,7 +76,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": null, @@ -219,8 +219,8 @@ { "data": { "text/plain": [ - "{Param(parent='RegexMatcher_85ce3493b69d', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,\n", - " Param(parent='RegexMatcher_85ce3493b69d', name='strategy', doc='MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE'): 'MATCH_ALL'}" + "{Param(parent='RegexMatcher_d761d66f2182', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,\n", + " Param(parent='RegexMatcher_d761d66f2182', name='strategy', doc='MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE'): 'MATCH_ALL'}" ] }, "execution_count": null, @@ -325,7 +325,7 @@ }, "gpuClass": "standard", "kernelspec": { - "display_name": "sparknlp", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -339,11 +339,6 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" - }, - "vscode": { - "interpreter": { - "hash": "8b81146af0a3e5653a315622171ee30f7af15821bda096dcb17032694ac0d21c" - } } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/regex-tokenizer/regex_tokenizer_examples.ipynb b/examples/python/annotation/text/english/regex-tokenizer/regex_tokenizer_examples.ipynb index 71d878637c3011..9efc28c987aeff 100644 --- a/examples/python/annotation/text/english/regex-tokenizer/regex_tokenizer_examples.ipynb +++ b/examples/python/annotation/text/english/regex-tokenizer/regex_tokenizer_examples.ipynb @@ -2,92 +2,37 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "FoiHtKZMdIji" - }, + "metadata": {}, "source": [ - "# Document Normalizer annotator notebook" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "a7c65f64-07d6-4355-97a0-0a371d83116c", - "showTitle": false, - "title": "" - }, - "id": "IR5fSl51dIjk" - }, - "source": [ - "# Set up Colab environment" + 
"![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/regex-tokenizer/regex_tokenizer_examples.ipynb)\n", + "\n", + "# Tokenization using RegexTokenizer" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "HMoSFm4YdIjl", - "outputId": "07d0a493-0513-4d90-862a-d561c506b611", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 14:48:19-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 14:48:20-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 14:48:20-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 14:48:20 (51.4 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 52 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 76.7 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 64.9 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "FJDSju19dIjl" - }, + "metadata": {}, "source": [ - "# Start Spark NLP session" + "## Start Spark NLP session" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "ZFg6pYqrdIjl" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Import Spark NLP\n", @@ -100,22 +45,11 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "1bnQU73ydIjm", - "outputId": "93785975-8f3d-4029-c6fd-8f390c21a4e5", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 219 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "" - ], "text/html": [ "\n", "
\n", @@ -124,11 +58,11 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", - "
v3.2.3
\n", + "
v3.3.0
\n", "
Master
\n", "
local[*]
\n", "
AppName
\n", @@ -138,10 +72,14 @@ " \n", "
\n", " " + ], + "text/plain": [ + "" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], "source": [ @@ -149,34 +87,21 @@ ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "5G_onsRFdIjm" - }, + "metadata": {}, "source": [ - "# Regex Tokenizer annotator" + "## Regex Tokenizer annotator" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "b4efb61f-6011-4ba1-a0ad-6c229f69e3d9", - "showTitle": true, - "title": "DocumentNormalizer overview and parameters" - }, - "id": "XfDocvQ7dIjm", - "outputId": "463d4ed3-9a15-4328-e248-b7b5b897f819", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -198,7 +123,6 @@ "from pyspark.sql.types import StringType\n", "from sparknlp.base import *\n", "from sparknlp.annotator import *\n", - "import sparknlp\n", "\n", "content = \"1. 
T1-T2 DATE**[12/24/13] $1.99 () (10/12), ph+ 90%\"\n", "pattern = \"\\\\s+|(?=[-.:;*+,$&%\\\\[\\\\]])|(?<=[-.:;*+,$&%\\\\[\\\\]])\"\n", @@ -242,10 +166,13 @@ "notebookOrigID": 3142402907558969, "widgets": {} }, + "colab": { + "provenance": [] + }, "kernelspec": { - "display_name": "Python [conda env:spknlp270] *", + "display_name": "Python 3", "language": "python", - "name": "conda-env-spknlp270-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -256,11 +183,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/sentence-detection/SentenceDetector_advanced_examples.ipynb b/examples/python/annotation/text/english/sentence-detection/SentenceDetector_advanced_examples.ipynb index bd4f72c6f2bba3..0f39f3f53d0bdd 100644 --- a/examples/python/annotation/text/english/sentence-detection/SentenceDetector_advanced_examples.ipynb +++ b/examples/python/annotation/text/english/sentence-detection/SentenceDetector_advanced_examples.ipynb @@ -1,29 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "cpYpeEfnmWKd" - }, - "source": [ - "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xl3k8bt-mZIc" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/sentence-detection/SentenceDetector_advanced_examples.ipynb)\n", - "\n" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/sentence-detection/SentenceDetector_advanced_examples.ipynb)\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "xluzxinzKK-L" - }, + "metadata": {}, "source": [ "# [Sentence Detector](https://nlp.johnsnowlabs.com/docs/en/annotators#sentencedetector)\n", "\n", @@ -55,11 +44,9 @@ ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "h4fQwZ46x4fu" - }, + "metadata": {}, "source": [ "Only run this block if you are inside Google Colab to set up Spark NLP otherwise\n", "skip it." @@ -67,10 +54,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "MdE588BiY3z1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" @@ -85,16 +70,14 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "SBtn9YsW0eHz" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 3.4.2\n", + "Spark NLP version: 4.3.1\n", "Apache Spark version: 3.0.2\n" ] } @@ -131,14 +114,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hJFV80wXyXiQ", - "outputId": "c1c1ef34-8604-482d-d845-11ed44d48275" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -190,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -242,7 +219,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -295,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], 
"source": [ @@ -317,7 +294,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -385,8 +362,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/dictionary-sentiment/sentiment_rb.ipynb b/examples/python/annotation/text/english/sentiment-detection/sentiment_rb.ipynb similarity index 63% rename from examples/python/annotation/text/english/dictionary-sentiment/sentiment_rb.ipynb rename to examples/python/annotation/text/english/sentiment-detection/sentiment_rb.ipynb index 025f0538606c8c..1cd1d1a5824389 100644 --- a/examples/python/annotation/text/english/dictionary-sentiment/sentiment_rb.ipynb +++ b/examples/python/annotation/text/english/sentiment-detection/sentiment_rb.ipynb @@ -1,42 +1,21 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "lo0OnOM9m4pT" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/dictionary-sentiment/sentiment_rb.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/sentiment-detection/sentiment_rb.ipynb)\n", "\n", - "## 0. 
Colab Setup" + "# Sentiment Analysis Pipeline" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 60335, - "status": "ok", - "timestamp": 1589248853524, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "Wyp_iig_m8an", - "outputId": "93b22049-e862-4903-ed3d-99145e49cd88" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -45,27 +24,25 @@ "openjdk version \"1.8.0_252\"\n", "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 54kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 49.8MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 122kB 2.8MB/s \n", - "\u001B[?25h" + "\u001b[K |████████████████████████████████| 215.7MB 54kB/s \n", + "\u001b[K |████████████████████████████████| 204kB 49.8MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[K |████████████████████████████████| 122kB 2.8MB/s \n", + "\u001b[?25h" ] } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "JcsRGNEgm4pY" - }, + "metadata": {}, "source": [ - "## Sentiment Analysis Pipeline\n", + "\n", "\n", "This pipeline will be used to explain a number of important features of the Spark-NLP library; Sentence Detection, Tokenization, Spell Checking, and Sentiment Detection.\n", "The idea is to start with natural language as could have been entered by a user, and get sentiment associated to it. Let's walk through each of the stages!\n" @@ -73,22 +50,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "xgQI0l_jm4pa" - }, + "metadata": {}, "source": [ "#### 1. Call necessary imports and set the resource path to read local data files" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "QieWhPT2m4pd" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#Imports\n", @@ -105,44 +75,22 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "8ePrBDnUm4pu" - }, + "metadata": {}, "source": [ "#### 2. 
Load SparkSession if not already there" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 78682, - "status": "ok", - "timestamp": 1589248871889, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "AjQlmFfFm4pv", - "outputId": "eb49dbe9-ed34-4879-b784-35ace8501081" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 2.6.0\n", - "Apache Spark version: 2.4.4\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.0.2\n" ] } ], @@ -157,37 +105,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "x3X2BgGKm4p0" - }, + "metadata": {}, "source": [ "#### 3. Load our predefined pipeline containing all the important annotators." ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 98038, - "status": "ok", - "timestamp": 1589248891253, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "wA6SHyHXm4p1", - "outputId": "61d3bce7-2c76-4e13-ff30-1ca7e932693c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -205,22 +131,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "tGvqjVvIm4p8" - }, + "metadata": {}, "source": [ "#### 4. Create some user opinions for some movies, keep an eye on the spelling, we'll get back to that soon." 
] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "gLokJdDdm4p9" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "testDocs = [\n", @@ -230,27 +149,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 85 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 98673, - "status": "ok", - "timestamp": 1589248891898, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "Gkr4zEVbm4qC", - "outputId": "a6513da5-84dd-4d98-f0ac-934a9267816a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -261,7 +161,7 @@ " (['This was movie was amesome, everything was nice.'], ['negative'])]" ] }, - "execution_count": 9, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -273,10 +173,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "N0NtXdFOm4qH" - }, + "metadata": {}, "source": [ " #### [Optional] - inspect intermmediate stages - spell checking\n", " As you can see, it suggests `avoid` instead of `awoid`" @@ -284,27 +181,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 98668, - "status": "ok", - "timestamp": 1589248891900, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "ImwLl1g8m4qJ", - "outputId": "0e93d064-dec6-407d-ee9a-74a3b03f2f70" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -382,7 +260,7 @@ " 'sentence': ['This was movie was amesome, everything was nice.']}]" ] }, - "execution_count": 11, + "execution_count": null, "metadata": {}, 
"output_type": "execute_result" } @@ -390,13 +268,6 @@ "source": [ "result" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -419,8 +290,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/spark-nlp-basics/playground-dataFrames.ipynb b/examples/python/annotation/text/english/spark-nlp-basics/playground-dataFrames.ipynb index ebeec3e4eab301..b139e4191897a4 100644 --- a/examples/python/annotation/text/english/spark-nlp-basics/playground-dataFrames.ipynb +++ b/examples/python/annotation/text/english/spark-nlp-basics/playground-dataFrames.ipynb @@ -2,70 +2,48 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "S7dCfLqzwneQ" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/spark-nlp-basics/playground-dataFrames.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/spark-nlp-basics/playground-dataFrames.ipynb)\n", "\n", - "## 0. 
Colab Setup" + "## Playground - DataFrames" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 65668, - "status": "ok", - "timestamp": 1589251785012, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "NCexM404wrf8", - "outputId": "df28f5f2-e335-4c1d-e2aa-0005d8f016d6" - }, + "metadata": {}, + "outputs": [], + "source": [ + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "openjdk version \"1.8.0_252\"\n", - "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", - "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 57kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 31.7MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 122kB 9.5MB/s \n", - "\u001B[?25h" + "env: PYSPARK_PYTHON=/home/ducha/.conda/envs/sparknlp/bin/python\n", + "env: PYSPARK_DRIVER_PYTHON=/home/ducha/.conda/envs/sparknlp/bin/python\n" ] } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", - "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + "%env PYSPARK_PYTHON=/home/ducha/.conda/envs/sparknlp/bin/python\n", + "%env PYSPARK_DRIVER_PYTHON=/home/ducha/.conda/envs/sparknlp/bin/python" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "alGaB2c0wlv2" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -73,155 +51,58 @@ "from sparknlp.base import *\n", "from sparknlp.annotator import *\n", "\n", - "from pyspark.ml import Pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "gtskmiLowlv-" - }, - "outputs": [], - "source": [ + "from pyspark.ml import Pipeline\n", + "\n", "spark = sparknlp.start()" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "8KsDucaQwlwG" - }, - "outputs": [], - "source": [ - "document = DocumentAssembler().setInputCol('text').setOutputCol('document')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "oa6NxmwNwlwQ" - }, - "outputs": [], - "source": [ - "tokenizer = Tokenizer().setInputCols('document').setOutputCol('token')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 93013, - "status": "ok", - "timestamp": 1589251812433, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": 
"14469489166467359317" - }, - "user_tz": -120 - }, - "id": "SRckqv9ZwlwU", - "outputId": "73aa0ee4-be9b-43f2-d18d-3ffa85a46de7" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pos_anc download started this may take some time.\n", - "Approximate size to download 4.3 MB\n", + "Approximate size to download 3.9 MB\n", "[OK!]\n" ] } ], "source": [ - "pos = PerceptronModel.pretrained().setInputCols('document', 'token').setOutputCol('pos')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "8WYP4RgLwlwc" - }, - "outputs": [], - "source": [ - "pipeline = Pipeline().setStages([document, tokenizer, pos])" + "document = DocumentAssembler().setInputCol('text').setOutputCol('document')\n", + "tokenizer = Tokenizer().setInputCols('document').setOutputCol('token')\n", + "pos = PerceptronModel.pretrained().setInputCols('document', 'token').setOutputCol('pos')\n", + "\n", + "pipeline = Pipeline().setStages([document, tokenizer, pos])\n", + "\n", + "data = spark.read.text('./sample-sentences-en.txt').toDF('text')" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 93010, - "status": "error", - "timestamp": 1589251812458, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "2QywH5GGwlwh", - "outputId": "dbadfa64-def9-41f0-d66e-8de1bb8b6700" - }, + "metadata": {}, "outputs": [ { - "ename": "AnalysisException", - "evalue": "ignored", - "output_type": "error", - "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mPy4JJavaError\u001B[0m Traceback (most recent call last)", - "\u001B[0;32m/usr/local/lib/python3.6/dist-packages/pyspark/sql/utils.py\u001B[0m in 
\u001B[0;36mdeco\u001B[0;34m(*a, **kw)\u001B[0m\n\u001B[1;32m 62\u001B[0m \u001B[0;32mtry\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 63\u001B[0;31m \u001B[0;32mreturn\u001B[0m \u001B[0mf\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m*\u001B[0m\u001B[0ma\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m**\u001B[0m\u001B[0mkw\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 64\u001B[0m \u001B[0;32mexcept\u001B[0m \u001B[0mpy4j\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mprotocol\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mPy4JJavaError\u001B[0m \u001B[0;32mas\u001B[0m \u001B[0me\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;32m/usr/local/lib/python3.6/dist-packages/py4j/protocol.py\u001B[0m in \u001B[0;36mget_return_value\u001B[0;34m(answer, gateway_client, target_id, name)\u001B[0m\n\u001B[1;32m 327\u001B[0m \u001B[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 328\u001B[0;31m format(target_id, \".\", name), value)\n\u001B[0m\u001B[1;32m 329\u001B[0m \u001B[0;32melse\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;31mPy4JJavaError\u001B[0m: An error occurred while calling o75.text.\n: org.apache.spark.sql.AnalysisException: Path does not exist: file:/content/sample-sentences-en.txt;\n\tat org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:558)\n\tat org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)\n\tat scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n\tat 
scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n\tat scala.collection.immutable.List.foreach(List.scala:392)\n\tat scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)\n\tat scala.collection.immutable.List.flatMap(List.scala:355)\n\tat org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:545)\n\tat org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)\n\tat org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)\n\tat org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)\n\tat org.apache.spark.sql.DataFrameReader.text(DataFrameReader.scala:714)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001B[0;31mAnalysisException\u001B[0m Traceback (most recent call last)", - "\u001B[0;32m\u001B[0m in \u001B[0;36m\u001B[0;34m()\u001B[0m\n\u001B[0;32m----> 1\u001B[0;31m \u001B[0mdata\u001B[0m \u001B[0;34m=\u001B[0m 
\u001B[0mspark\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mread\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtext\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'./sample-sentences-en.txt'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtoDF\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'text'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m", - "\u001B[0;32m/usr/local/lib/python3.6/dist-packages/pyspark/sql/readwriter.py\u001B[0m in \u001B[0;36mtext\u001B[0;34m(self, paths, wholetext, lineSep)\u001B[0m\n\u001B[1;32m 341\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0misinstance\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mpaths\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mbasestring\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 342\u001B[0m \u001B[0mpaths\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;34m[\u001B[0m\u001B[0mpaths\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 343\u001B[0;31m \u001B[0;32mreturn\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_df\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_jreader\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtext\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_spark\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_sc\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_jvm\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mPythonUtils\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtoSeq\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mpaths\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 344\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 345\u001B[0m \u001B[0;34m@\u001B[0m\u001B[0msince\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;36m2.0\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - 
"\u001B[0;32m/usr/local/lib/python3.6/dist-packages/py4j/java_gateway.py\u001B[0m in \u001B[0;36m__call__\u001B[0;34m(self, *args)\u001B[0m\n\u001B[1;32m 1255\u001B[0m \u001B[0manswer\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mgateway_client\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0msend_command\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mcommand\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1256\u001B[0m return_value = get_return_value(\n\u001B[0;32m-> 1257\u001B[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001B[0m\u001B[1;32m 1258\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1259\u001B[0m \u001B[0;32mfor\u001B[0m \u001B[0mtemp_arg\u001B[0m \u001B[0;32min\u001B[0m \u001B[0mtemp_args\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;32m/usr/local/lib/python3.6/dist-packages/pyspark/sql/utils.py\u001B[0m in \u001B[0;36mdeco\u001B[0;34m(*a, **kw)\u001B[0m\n\u001B[1;32m 67\u001B[0m e.java_exception.getStackTrace()))\n\u001B[1;32m 68\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0ms\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstartswith\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'org.apache.spark.sql.AnalysisException: '\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 69\u001B[0;31m \u001B[0;32mraise\u001B[0m \u001B[0mAnalysisException\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0ms\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0msplit\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m': '\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;36m1\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;36m1\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mstackTrace\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 70\u001B[0m \u001B[0;32mif\u001B[0m 
\u001B[0ms\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstartswith\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'org.apache.spark.sql.catalyst.analysis'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 71\u001B[0m \u001B[0;32mraise\u001B[0m \u001B[0mAnalysisException\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0ms\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0msplit\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m': '\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;36m1\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;36m1\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mstackTrace\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;31mAnalysisException\u001B[0m: 'Path does not exist: file:/content/sample-sentences-en.txt;'" + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| text|\n", + "+--------------------+\n", + "|Peter is a very g...|\n", + "|My life in Russia...|\n", + "|John and Peter ar...|\n", + "|Lucas Nogal Dunbe...|\n", + "|Europe is very cu...|\n", + "+--------------------+\n", + "\n" ] } ], - "source": [ - "data = spark.read.text('./sample-sentences-en.txt').toDF('text')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "w-u1tRNUwlwl" - }, - "outputs": [], "source": [ "data.show(5)" ] @@ -229,11 +110,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "4uNRZD1Twlwr" - }, + "metadata": {}, "outputs": [], "source": [ "model = pipeline.fit(data)" @@ -242,11 +119,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "pKrDgH4nwlww" - }, + "metadata": {}, "outputs": [], "source": [ "result = model.transform(data)" @@ -255,12 +128,25 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - 
"colab_type": "code", - "id": "wkFPiHP5wlw0" - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+--------------------+--------------------+\n", + "| text| document| token| pos|\n", + "+--------------------+--------------------+--------------------+--------------------+\n", + "|Peter is a very g...|[{document, 0, 27...|[{token, 0, 4, Pe...|[{pos, 0, 4, NNP,...|\n", + "|My life in Russia...|[{document, 0, 37...|[{token, 0, 1, My...|[{pos, 0, 1, PRP$...|\n", + "|John and Peter ar...|[{document, 0, 76...|[{token, 0, 3, Jo...|[{pos, 0, 3, NNP,...|\n", + "|Lucas Nogal Dunbe...|[{document, 0, 67...|[{token, 0, 4, Lu...|[{pos, 0, 4, NNP,...|\n", + "|Europe is very cu...|[{document, 0, 68...|[{token, 0, 5, Eu...|[{pos, 0, 5, NNP,...|\n", + "+--------------------+--------------------+--------------------+--------------------+\n", + "\n" + ] + } + ], "source": [ "result.show(5)" ] @@ -268,11 +154,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "qBgoqXGswlw3" - }, + "metadata": {}, "outputs": [], "source": [ "stored = result\\\n", @@ -284,12 +166,28 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "TCH7lgcGwlw7" - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- text: string (nullable = true)\n", + " |-- pos_begin: array (nullable = true)\n", + " | |-- element: integer (containsNull = true)\n", + " |-- pos_end: array (nullable = true)\n", + " | |-- element: integer (containsNull = true)\n", + " |-- pos_result: array (nullable = true)\n", + " | |-- element: string (containsNull = true)\n", + " |-- pos_meta: array (nullable = true)\n", + " | |-- element: map (containsNull = true)\n", + " | | |-- key: string\n", + " | | |-- value: string 
(valueContainsNull = true)\n", + "\n" + ] + } + ], "source": [ "stored.printSchema()" ] @@ -297,22 +195,32 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "S32Z9gROwlw_" - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+--------------------+--------------------+--------------------+\n", + "| text| pos_begin| pos_end| pos_result| pos_meta|\n", + "+--------------------+--------------------+--------------------+--------------------+--------------------+\n", + "|Peter is a very g...|[0, 6, 9, 11, 16,...|[4, 7, 9, 14, 19,...|[NNP, VBZ, DT, RB...|[{word -> Peter, ...|\n", + "|My life in Russia...|[0, 3, 8, 11, 18,...|[1, 6, 9, 16, 19,...|[PRP$, NN, IN, NN...|[{word -> My, sen...|\n", + "|John and Peter ar...|[0, 5, 9, 15, 19,...|[3, 7, 13, 17, 26...|[NNP, CC, NNP, VB...|[{word -> John, s...|\n", + "|Lucas Nogal Dunbe...|[0, 6, 12, 23, 26...|[4, 10, 21, 24, 2...|[NNP, NNP, NNP, V...|[{word -> Lucas, ...|\n", + "|Europe is very cu...|[0, 7, 10, 15, 23...|[5, 8, 13, 21, 26...|[NNP, VBZ, RB, RB...|[{word -> Europe,...|\n", + "+--------------------+--------------------+--------------------+--------------------+--------------------+\n", + "\n" + ] + } + ], "source": [ "stored.show(5)" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Xz8Ma_2zwlxE" - }, + "metadata": {}, "source": [ "---------\n", "## Spark SQL Functions" @@ -321,11 +229,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "JcON68CzwlxF" - }, + "metadata": {}, "outputs": [], "source": [ "from pyspark.sql.functions import *" @@ -334,12 +238,20 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "M1BAS8IawlxJ" - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "+----+---------+-------+----------+--------+\n", + "|text|pos_begin|pos_end|pos_result|pos_meta|\n", + "+----+---------+-------+----------+--------+\n", + "+----+---------+-------+----------+--------+\n", + "\n" + ] + } + ], "source": [ "stored.filter(array_contains('pos_result', 'VBD')).show(5)" ] @@ -347,12 +259,25 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "jpmQRXj-wlxR" - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+\n", + "| pos_result|token_count|\n", + "+--------------------+-----------+\n", + "|[NNP, VBZ, DT, RB...| 7|\n", + "|[PRP$, NN, IN, NN...| 8|\n", + "|[NNP, CC, NNP, VB...| 15|\n", + "|[NNP, NNP, NNP, V...| 15|\n", + "|[NNP, VBZ, RB, RB...| 15|\n", + "+--------------------+-----------+\n", + "\n" + ] + } + ], "source": [ "stored.withColumn('token_count', size('pos_result')).select('pos_result', 'token_count').show(5)" ] @@ -360,12 +285,25 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "pOOi9H8QwlxX" - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------------+\n", + "| text|array_max(pos_end)|\n", + "+--------------------+------------------+\n", + "|Peter is a very g...| 27|\n", + "|My life in Russia...| 37|\n", + "|John and Peter ar...| 76|\n", + "|Lucas Nogal Dunbe...| 67|\n", + "|Europe is very cu...| 68|\n", + "+--------------------+------------------+\n", + "\n" + ] + } + ], "source": [ "stored.select('text', array_max('pos_end')).show(5)" ] @@ -373,12 +311,25 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "6pnSCfWUwlxa" - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+\n", + "| pos_result| unique_pos|\n", + "+--------------------+--------------------+\n", + "|[NNP, VBZ, DT, RB...|[NNP, VBZ, DT, RB...|\n", + "|[PRP$, NN, IN, NN...|[PRP$, NN, IN, NN...|\n", + "|[NNP, CC, NNP, VB...|[NNP, CC, VBP, NN...|\n", + "|[NNP, NNP, NNP, V...|[NNP, VBZ, DT, RB...|\n", + "|[NNP, VBZ, RB, RB...|[NNP, VBZ, RB, JJ...|\n", + "+--------------------+--------------------+\n", + "\n" + ] + } + ], "source": [ "stored.withColumn('unique_pos', array_distinct('pos_result')).select('pos_result', 'unique_pos').show(5)" ] @@ -386,22 +337,32 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "eS-9UFo6wlxd" - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+\n", + "|array_sort(array_distinct(pos_result), lambdafunction((IF(((left IS NULL) AND (right IS NULL)), 0, (IF((left IS NULL), 1, (IF((right IS NULL), -1, (IF((left < right), -1, (IF((left > right), 1, 0)))))))))), left, right))|count|\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+\n", + "| [., CC, EX, JJ, N...| 1|\n", + "| [., IN, JJ, NN, N...| 1|\n", + "| [., CC, DT, IN, J...| 1|\n", + "| [., DT, IN, JJ, N...| 1|\n", + "| [., DT, JJ, NN, N...| 1|\n", + 
"+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+\n", + "\n" + ] + } + ], "source": [ "stored.groupBy(array_sort(array_distinct('pos_result'))).count().show(10)" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Bs7eJ_Auwlxh" - }, + "metadata": {}, "source": [ "----------------\n", "### SQL Functions with `col`" @@ -410,11 +371,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "YKMoYU6Bwlxi" - }, + "metadata": {}, "outputs": [], "source": [ "from pyspark.sql.functions import col" @@ -423,22 +380,32 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "3jCKEk_pwlxl" - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------------+\n", + "|pos_meta[0][word]|\n", + "+-----------------+\n", + "| Peter|\n", + "| My|\n", + "| John|\n", + "| Lucas|\n", + "| Europe|\n", + "+-----------------+\n", + "\n" + ] + } + ], "source": [ "stored.select(col('pos_meta').getItem(0).getItem('word')).show(5)" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "_Yh3K8Ldwlxq" - }, + "metadata": {}, "source": [ "-------------\n", "### Spark NLP Annotation UDFs" @@ -447,12 +414,22 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "I6MCiYiLwlxq" - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|pos |\n", + "+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{pos, 0, 4, NNP, {word -> Peter, sentence -> 0}, []}, {pos, 6, 7, VBZ, {word -> is, sentence -> 0}, []}, {pos, 9, 9, DT, {word -> a, sentence -> 0}, []}, {pos, 11, 14, RB, {word -> very, sentence -> 0}, []}, {pos, 16, 19, JJ, {word -> good, sentence -> 0}, []}, {pos, 21, 26, NN, {word -> person, sentence -> 0}, []}, {pos, 27, 27, ., {word -> ., sentence -> 0}, []}]|\n", + "+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "only showing top 1 row\n", + "\n" + ] + } + ], "source": [ "result.select('pos').show(1, truncate=False)" ] @@ -460,71 +437,44 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "E11dYbVrwlxv" - }, - "outputs": [], - "source": [ - "def nn_tokens(annotations):\n", - " nn_annotations = list(\n", - " filter(lambda annotation: annotation.result == 'NN', annotations)\n", - " )\n", - " return list(\n", - " 
map(lambda nn_annotation: nn_annotation.metadata['word'], nn_annotations)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "aeoIE-dSwlxz" - }, - "outputs": [], - "source": [ - "from sparknlp.functions import *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "n6GJPlH3wlx8" - }, + "metadata": {}, "outputs": [], "source": [ + "from sparknlp.functions import *\n", "from pyspark.sql.types import ArrayType, StringType" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "4BgHti3lwlyA" - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------------------------------------------+\n", + "|nn_tokens |\n", + "+--------------------------------------------------------+\n", + "|[{pos, 21, 26, NN, {sentence -> 0, word -> person}, []}]|\n", + "|[{pos, 3, 6, NN, {sentence -> 0, word -> life}, []}] |\n", + "|[] |\n", + "|[{pos, 57, 59, NN, {sentence -> 0, word -> car}, []}] |\n", + "|[] |\n", + "+--------------------------------------------------------+\n", + "\n" + ] + } + ], "source": [ - "result.select(map_annotations(nn_tokens, ArrayType(StringType()))('pos').alias('nn_tokens')).show(truncate=False)" + "def nn_tokens(annotations):\n", + " nn_annotations = list(\n", + " filter(lambda annotation: annotation.result == 'NN', annotations)\n", + " )\n", + " return nn_annotations\n", + "\n", + "result.select(map_annotations(nn_tokens, Annotation.arrayType())('pos').alias('nn_tokens')).show(truncate=False)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "NJSrPZ0CwlyH" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -547,8 +497,7 @@ "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/spark-nlp-basics/spark_nlp_basics_functions.ipynb b/examples/python/annotation/text/english/spark-nlp-basics/spark_nlp_basics_functions.ipynb index 3bb08fc71890c9..8c747f162f1d86 100644 --- a/examples/python/annotation/text/english/spark-nlp-basics/spark_nlp_basics_functions.ipynb +++ b/examples/python/annotation/text/english/spark-nlp-basics/spark_nlp_basics_functions.ipynb @@ -1,59 +1,15 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/spark-nlp-basics/spark-nlp-basics-functions.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/spark-nlp-basics/spark_nlp_basics_functions.ipynb)\n", "\n", - "## 0. Colab Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2022-12-23 14:47:28-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 14:47:28-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... 
connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 14:47:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "\r", - "- 0%[ ] 0 --.-KB/s \r", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 14:47:28 (70.5 MB/s) - written to stdout [1191/1191]\n", - "\n", - "\u001B[K |████████████████████████████████| 281.5 MB 64 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 70.0 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 67.5 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n" - ] - } - ], - "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", - "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + "# Spark NLP Basic Functions" ] }, { @@ -62,7 +18,8 @@ "metadata": {}, "outputs": [], "source": [ - "from sparknlp.annotator import *" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { @@ -74,27 +31,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 4.2.6\n", - "Apache Spark version: 3.2.3\n" + "Spark NLP version: 4.3.0\n", + "Apache Spark version: 3.3.0\n" ] } ], "source": [ - "import sparknlp \n", + "import sparknlp\n", + "from sparknlp.annotator import *\n", + "from sparknlp.pretrained import *\n", "\n", "spark = sparknlp.start()\n", "\n", "print(\"Spark NLP version: \", sparknlp.version())\n", - "print(\"Apache Spark version: \", spark.version)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sparknlp.pretrained import *" + "print(\"Apache Spark version: \", spark.version)" ] }, { @@ -238,13 +188,6 @@ "\n", "explode_annotations_col(result, 'lemmas.result', 'exploded').select('exploded').show()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -253,7 +196,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "base", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -267,11 +210,6 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" - }, - "vscode": { - "interpreter": { - "hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf" - } } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/spell-check-ml-pipeline/Pretrained-SpellCheckML-Pipeline.ipynb
b/examples/python/annotation/text/english/spell-check-ml-pipeline/Pretrained-SpellCheckML-Pipeline.ipynb index 0ac894483a3cc8..84d5c7909992d1 100644 --- a/examples/python/annotation/text/english/spell-check-ml-pipeline/Pretrained-SpellCheckML-Pipeline.ipynb +++ b/examples/python/annotation/text/english/spell-check-ml-pipeline/Pretrained-SpellCheckML-Pipeline.ipynb @@ -2,14 +2,11 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "fVz3NatXxzIf" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/spell-check-ml-pipeline/Pretrained-SpellCheckML-Pipeline.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/spell-check-ml-pipeline/Pretrained-SpellCheckML-Pipeline.ipynb)\n", "\n", "## 0.
Colab Setup" ] @@ -17,26 +14,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 60194, - "status": "ok", - "timestamp": 1589251753927, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "nd7ytKhcx_-r", - "outputId": "9ffeccbf-d907-486c-d82a-7dc1fad96c89" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -45,35 +23,29 @@ "openjdk version \"1.8.0_252\"\n", "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 60kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 42.6MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 122kB 2.8MB/s \n", - "\u001B[?25h" + "\u001b[K |████████████████████████████████| 215.7MB 60kB/s \n", + "\u001b[K |████████████████████████████████| 204kB 42.6MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[K |████████████████████████████████| 122kB 2.8MB/s \n", + "\u001b[?25h" ] } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "OPMadhksxzIi" - }, + "metadata": {}, "source": [ "# Use pretrained `spell_check_ml` Pipeline" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "kkC_fjhrxzIj" - }, + "metadata": {}, "source": [ "\n", "* DocumentAssembler\n", @@ -85,11 +57,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "trAejfMCxzIl" - }, + "metadata": {}, "outputs": [], "source": [ "import sys\n", @@ -110,10 +78,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "iczPX28QxzIr" - }, + "metadata": {}, "source": [ "### Let's create a Spark Session for our app" ] @@ -121,26 +86,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 78653, - "status": "ok", - "timestamp": 1589251772401, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "VEoi9ufNxzIt", - "outputId": "ed755219-10ed-435f-a885-85eafd47096c" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -161,26 +107,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 90422, - "status": "ok", - "timestamp": 1589251784178, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - 
"user_tz": -120 - }, - "id": "HRb1Lk1QxzI4", - "outputId": "4b1abb73-9b97-465c-adea-e96f12616aba" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -199,11 +126,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "HMm4pp8ZxzJB" - }, + "metadata": {}, "outputs": [], "source": [ "result=pipeline.annotate(\"Yestarday I lost my blue unikorn and I wass really sad! This is an exampe of how wrog my english is.\")" @@ -212,26 +135,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 408 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 91022, - "status": "ok", - "timestamp": 1589251784794, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "PLKwH8gSxzJI", - "outputId": "c02bbce9-25b4-4e81-c3cc-95b4b7c70a84" - }, + "metadata": {}, "outputs": [ { "data": { @@ -261,7 +165,7 @@ " ('.', '.')]" ] }, - "execution_count": 6, + "execution_count": null, "metadata": { "tags": [] }, @@ -274,24 +178,10 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "-bkZvv85xzJS" - }, + "metadata": {}, "source": [ "We fixed the spelling of `yesterday`, `unicorn`, `was`, `example`, and `wrong` with `check_spelling` Pipeline." 
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "Xx1vtq4lxzJT" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -314,8 +204,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb b/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb index 4e927105830374..713e247325d627 100644 --- a/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb +++ b/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb @@ -7,7 +7,7 @@ "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/example/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb)\n", "\n", "\n", "# **Word Stemming with Stemmer**" @@ -28,13 +28,21 @@ "## **0. Colab Setup**" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Only run this block if you are inside Google Colab to set up Spark NLP; otherwise\n", + "skip it."
+ ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "!pip install -q pyspark==3.3.0 spark-nlp==4.3.0" + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { @@ -46,7 +54,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 4.2.8\n", + "Spark NLP version: 4.3.1\n", "Apache Spark version: 3.3.0\n" ] }, @@ -60,7 +68,7 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", @@ -76,7 +84,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": null, @@ -374,7 +382,7 @@ }, "gpuClass": "standard", "kernelspec": { - "display_name": "sparknlp", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -388,11 +396,6 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" - }, - "vscode": { - "interpreter": { - "hash": "8b81146af0a3e5653a315622171ee30f7af15821bda096dcb17032694ac0d21c" - } } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/stop-words/StopWordsCleaner.ipynb b/examples/python/annotation/text/english/stop-words/StopWordsCleaner.ipynb index d8f4110eb939d7..b1aa879c1a389d 100644 --- a/examples/python/annotation/text/english/stop-words/StopWordsCleaner.ipynb +++ b/examples/python/annotation/text/english/stop-words/StopWordsCleaner.ipynb @@ -1,31 +1,21 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "DryaQ76bhsVy" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/language-detection/Language_Detection_and_Indentification.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stop-words/StopWordsCleaner.ipynb)\n", "\n", - "## 0. 
Colab Setup" + "# Cleaning Stop Words" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "id": "dcHpCkBsg2ma", - "outputId": "5665479d-c7b2-4b2a-88bd-a90a80f4593b" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -34,48 +24,37 @@ "openjdk version \"1.8.0_252\"\n", "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 54kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 47.1MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 133kB 4.4MB/s \n", - "\u001B[?25h" + "\u001b[K |████████████████████████████████| 215.7MB 54kB/s \n", + "\u001b[K |████████████████████████████████| 204kB 47.1MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[K |████████████████████████████████| 133kB 4.4MB/s \n", + "\u001b[?25h" ] } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ODtmoBwfoX3T" - }, + "metadata": {}, "source": [ "## 1. 
Start Spark Session" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "id": "brRueZ25g2mf", - "outputId": "89e7cf8b-60e3-43e9-c162-c62c0aed9620" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version 2.5.5\n", - "Apache Spark version: 2.4.4\n" + "Spark NLP version 4.3.1\n", + "Apache Spark version: 3.0.2\n" ] } ], @@ -93,10 +72,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "O4gGGYD6P6NN" - }, + "metadata": {}, "source": [ "## StopWordsCleaner Pre-trained Models\n", "\n", @@ -126,12 +102,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "opGVYbNlg2mj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from pyspark.sql import SparkSession\n", @@ -144,16 +116,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "1-eGocORg2ml", - "outputId": "1db46153-797a-46dd-bc0b-cc2261bec8ff" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -195,16 +159,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "colab_type": "code", - "id": "aS6YMOr0g2mo", - "outputId": "f486299e-5246-4ced-cae4-1534369176fb" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -228,16 +184,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "colab_type": "code", - "id": "_qIM3Xb_g2mq", - "outputId": "0e15463f-b118-4c1e-d53e-52f5294f65f1" - }, + "execution_count": null, + "metadata": {}, "outputs": 
[ { "data": { @@ -815,27 +763,14 @@ " 'zero']" ] }, - "execution_count": 5, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_words.getStopWords()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "KNZHIQLZg2mt" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -858,8 +793,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/text-matcher-pipeline/extractor.ipynb b/examples/python/annotation/text/english/text-matcher-pipeline/extractor.ipynb index 7bdfc80d2d031f..e0b3dbd3d479d3 100644 --- a/examples/python/annotation/text/english/text-matcher-pipeline/extractor.ipynb +++ b/examples/python/annotation/text/english/text-matcher-pipeline/extractor.ipynb @@ -1,75 +1,33 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "tYF_5Dmcx1vE" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/text-matcher-pipeline/extractor.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-matcher-pipeline/extractor.ipynb\n", + ")\n", "\n", - "## 0. 
Colab Setup" + "# Simple Text Matching" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Ki5VC3Cvx7Aj", - "outputId": "81d75603-e597-4577-c93c-c971076e8f0d" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 14:46:26-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 14:46:26-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 14:46:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 0%[ ] 0 --.-KB/s Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 14:46:26 (37.1 MB/s) - written to stdout [1191/1191]\n", - "\n", - "\u001B[K |████████████████████████████████| 281.5 MB 45 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 49.7 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 35.5 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "5wDMa90xx1vF" - }, + "metadata": {}, "source": [ - "## Simple Text Matching\n", - "\n", "In the following example, we walk-through our straight forward Text Matcher Annotator.\n", "\n", "This annotator will take a list of sentences from a text file and look them up in the given target dataset.\n", @@ -79,43 +37,16 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "6NjQrLZux1vH" - }, + "metadata": {}, "source": [ "#### 1. 
Call necessary imports and set the resource path to read local data files" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Lx0KPGX1x1vI", - "outputId": "aafdf19b-65cd-4031-dc40-f19bb8b0b129" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 14:47:22-- https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/text-matcher-pipeline/entities.txt\n", - "Resolving github.com (github.com)... 140.82.113.4\n", - "Connecting to github.com (github.com)|140.82.113.4|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: unspecified [text/html]\n", - "Saving to: ‘entities.txt’\n", - "\n", - "entities.txt [ <=> ] 149.13K --.-KB/s in 0.04s \n", - "\n", - "Last-modified header missing -- time-stamps turned off.\n", - "2022-12-23 14:47:22 (3.83 MB/s) - ‘entities.txt’ saved [152712]\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import os\n", "import sys\n", @@ -127,28 +58,20 @@ "import sparknlp\n", "from sparknlp.annotator import *\n", "from sparknlp.common import *\n", - "from sparknlp.base import *\n", - "\n", - "! 
wget -N https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/text-matcher-pipeline/entities.txt " + "from sparknlp.base import *" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "rmMkmM_1x1vP", - "outputId": "0a9f0c42-4438-4f15-f890-a84c9b1242cb" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "Spark NLP version: 4.2.6\n", - "Apache Spark version: 3.2.3\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.0.2\n" ] } ], @@ -161,19 +84,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "P-vYbBJlx1vU" - }, + "metadata": {}, "source": [ "#### 3. Create appropriate annotators. We are using Sentence Detection and Tokenizing the sentence. The Finisher will clean the annotations and exclude the metadata." ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "9gcfIPUbx1vV" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "documentAssembler = DocumentAssembler()\\\n", @@ -210,34 +129,32 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "O3tM-gQgx1vb" - }, + "metadata": {}, "source": [ "#### 4. Load the input data to be annotated" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "380SHnKjx1vb", - "outputId": "b8147bec-633d-4931-9431-00ac204db772" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "rm: cannot remove '/tmp/sentiment.parquet.zip': No such file or directory\n", - "--2022-12-23 14:53:58-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment.parquet.zip\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 
52.216.134.229, 52.216.37.120, 52.216.226.235, ...\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.134.229|:443... connected.\n", - "HTTP request sent, awaiting response... 304 Not Modified\n", - "File ‘sentiment.parquet.zip’ not modified on server. Omitting download.\n", + "--2023-02-20 11:50:49-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment.parquet.zip\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.60.24, 52.216.220.80, 54.231.233.48, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.60.24|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 76127532 (73M) [application/zip]\n", + "Saving to: ‘sentiment.parquet.zip’\n", + "\n", + "sentiment.parquet.z 100%[===================>] 72,60M 22,8MB/s in 3,7s \n", + "\n", + "2023-02-20 11:50:54 (19,6 MB/s) - ‘sentiment.parquet.zip’ saved [76127532/76127532]\n", "\n", "Archive: sentiment.parquet.zip\n", " creating: sentiment.parquet/\n", @@ -261,42 +178,36 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "O4gu_eMYx1vg", - "outputId": "a33fbdc5-caef-4a13-b345-13eb0538b068" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+---------+--------------------+\n", "|itemid|sentiment| text|\n", "+------+---------+--------------------+\n", - "| 1| 0| ...|\n", - "| 2| 0| ...|\n", - "| 3| 1| omg...|\n", - "| 4| 0| .. Omga...|\n", - "| 5| 0| i think ...|\n", - "| 6| 0| or i jus...|\n", - "| 7| 1| Juuuuuuuuu...|\n", - "| 8| 0| Sunny Agai...|\n", - "| 9| 1| handed in m...|\n", - "| 10| 1| hmmmm.... 
i...|\n", - "| 11| 0| I must thin...|\n", - "| 12| 1| thanks to a...|\n", - "| 13| 0| this weeken...|\n", - "| 14| 0| jb isnt show...|\n", - "| 15| 0| ok thats it ...|\n", - "| 16| 0| <-------- ...|\n", - "| 17| 0| awhhe man.......|\n", - "| 18| 1| Feeling stran...|\n", - "| 19| 0| HUGE roll of ...|\n", - "| 20| 0| I just cut my...|\n", + "|799033| 0|@FrankomQ8 What's...|\n", + "|799034| 1|@FranKoUK guitar ...|\n", + "|799035| 0|@frankparenteau u...|\n", + "|799036| 1|@frankparenteau w...|\n", + "|799037| 1|@FrankPatris dude...|\n", + "|799038| 0|@FrankRamblings a...|\n", + "|799039| 1|@frankroberts ni...|\n", + "|799040| 0|@frankroberts ur ...|\n", + "|799041| 1|@FrankS Breaking ...|\n", + "|799042| 1|@frankschultelad ...|\n", + "|799043| 0|@frankshorter Wol...|\n", + "|799044| 0|@franksting - its...|\n", + "|799045| 1|@franksting Ha! D...|\n", + "|799046| 1|@franksting yeah,...|\n", + "|799047| 1|@franksting yes, ...|\n", + "|799048| 1|@FrankSylar arn't...|\n", + "|799049| 1| @frankules WO ? |\n", + "|799050| 0|@frankwkelly I'm ...|\n", + "|799051| 1|@FrankXSalinas Th...|\n", + "|799052| 1|@frankybhoy93 tha...|\n", "+------+---------+--------------------+\n", "only showing top 20 rows\n", "\n" @@ -306,34 +217,26 @@ "source": [ "data = spark. \\\n", " read. \\\n", - " parquet(\"/content/sentiment.parquet\"). \\\n", + " parquet(\"sentiment.parquet\"). \\\n", " limit(1000).cache()\n", "data.show(20)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "VZeXrk3Yx1vj" - }, + "metadata": {}, "source": [ "#### 5. Running the fit for sentence detection and tokenization." 
] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "BHqv0yWGx1vk", - "outputId": "93873fcb-409c-4d41-c972-4049ec12d267" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Start fitting\n", "Fitting is ended\n" @@ -348,58 +251,57 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "BSUwZM4ux1vp" - }, + "metadata": {}, "source": [ "#### 6. Runing the transform on data to do text matching. It will append a new coloumns with matched entities." ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "CgkYE0V_x1vq", - "outputId": "eacd08cf-2a74-4fea-df05-be832cbba501" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+---------+--------------------+----------------+\n", "|itemid|sentiment| text|finished_entites|\n", "+------+---------+--------------------+----------------+\n", - "| 1| 0| ...| []|\n", - "| 2| 0| ...| []|\n", - "| 3| 1| omg...| []|\n", - "| 4| 0| .. Omga...| []|\n", - "| 5| 0| i think ...| []|\n", - "| 6| 0| or i jus...| []|\n", - "| 7| 1| Juuuuuuuuu...| []|\n", - "| 8| 0| Sunny Agai...| []|\n", - "| 9| 1| handed in m...| []|\n", - "| 10| 1| hmmmm.... 
i...| []|\n", - "| 11| 0| I must thin...| []|\n", - "| 12| 1| thanks to a...| []|\n", - "| 13| 0| this weeken...| []|\n", - "| 14| 0| jb isnt show...| []|\n", - "| 15| 0| ok thats it ...| []|\n", - "| 16| 0| <-------- ...| []|\n", - "| 17| 0| awhhe man.......| []|\n", - "| 18| 1| Feeling stran...| []|\n", - "| 19| 0| HUGE roll of ...| []|\n", - "| 20| 0| I just cut my...| []|\n", + "|799033| 0|@FrankomQ8 What's...| []|\n", + "|799034| 1|@FranKoUK guitar ...|[guitar lessons]|\n", + "|799035| 0|@frankparenteau u...| []|\n", + "|799036| 1|@frankparenteau w...| []|\n", + "|799037| 1|@FrankPatris dude...| []|\n", + "|799038| 0|@FrankRamblings a...| []|\n", + "|799039| 1|@frankroberts ni...| []|\n", + "|799040| 0|@frankroberts ur ...| []|\n", + "|799041| 1|@FrankS Breaking ...| []|\n", + "|799042| 1|@frankschultelad ...| []|\n", + "|799043| 0|@frankshorter Wol...| []|\n", + "|799044| 0|@franksting - its...| []|\n", + "|799045| 1|@franksting Ha! D...| []|\n", + "|799046| 1|@franksting yeah,...| []|\n", + "|799047| 1|@franksting yes, ...| []|\n", + "|799048| 1|@FrankSylar arn't...| []|\n", + "|799049| 1| @frankules WO ? 
| []|\n", + "|799050| 0|@frankwkelly I'm ...| []|\n", + "|799051| 1|@FrankXSalinas Th...| []|\n", + "|799052| 1|@frankybhoy93 tha...| []|\n", "+------+---------+--------------------+----------------+\n", "only showing top 20 rows\n", "\n", - "+------+---------+----+----------------+\n", - "|itemid|sentiment|text|finished_entites|\n", - "+------+---------+----+----------------+\n", - "+------+---------+----+----------------+\n", + "+------+---------+--------------------+----------------+\n", + "|itemid|sentiment| text|finished_entites|\n", + "+------+---------+--------------------+----------------+\n", + "|799034| 1|@FranKoUK guitar ...|[guitar lessons]|\n", + "|799065| 0|@Frannyd oh lame....| [i think]|\n", + "|799173| 1|i am seriously sl...| [i think]|\n", + "|799869| 1|@FrazzleYeah yea...| [i think]|\n", + "|799898| 0|@freakyfudge that...| [i think]|\n", + "|799957| 1|@FreddyMallet Hi!...| [i think]|\n", + "|800003| 0|@freecloud but i ...| [i think]|\n", + "+------+---------+--------------------+----------------+\n", "\n" ] } @@ -416,19 +318,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "hs52YfLwx1vv" - }, + "metadata": {}, "source": [ "#### 7. 
The model could be saved locally and reloaded to run again" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "5_g760_Fx1vw" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "\n", @@ -437,23 +335,24 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hAPuXvl9x1v1", - "outputId": "cee8b6c7-06f0-45d6-d311-9b4dc558f372" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "+------+---------+----+----------------+\n", - "|itemid|sentiment|text|finished_entites|\n", - "+------+---------+----+----------------+\n", - "+------+---------+----+----------------+\n", + "+------+---------+--------------------+----------------+\n", + "|itemid|sentiment| text|finished_entites|\n", + "+------+---------+--------------------+----------------+\n", + "|799034| 1|@FranKoUK guitar ...|[guitar lessons]|\n", + "|799065| 0|@Frannyd oh lame....| [i think]|\n", + "|799173| 1|i am seriously sl...| [i think]|\n", + "|799869| 1|@FrazzleYeah yea...| [i think]|\n", + "|799898| 0|@freakyfudge that...| [i think]|\n", + "|799957| 1|@FreddyMallet Hi!...| [i think]|\n", + "|800003| 0|@freecloud but i ...| [i think]|\n", + "+------+---------+--------------------+----------------+\n", "\n" ] } @@ -467,15 +366,6 @@ ".filter(\"size(finished_entites) != 0\") \\\n", ".show()" ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "05lj_hTdx1v7" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -498,8 +388,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb 
b/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb index 2aefd8c688fc41..654bfa8e87e7a1 100644 --- a/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb +++ b/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb @@ -1,250 +1,62 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "5vxzYTW-cTv8", - "outputId": "9a870958-d998-4d8d-f641-30aeced7e784", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 14:55:47-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 14:55:48-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 14:55:48-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 14:55:48 (45.3 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 46 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 57.4 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 63.2 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ], - "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", - "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "9FfkFoYHcTv-", - "outputId": "30bb3d43-163b-4a22-af32-a72ad517f706", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "openjdk version \"11.0.17\" 2022-10-18\n", - "OpenJDK Runtime Environment (build 11.0.17+8-post-Ubuntu-1ubuntu218.04)\n", - "OpenJDK 64-Bit Server VM (build 11.0.17+8-post-Ubuntu-1ubuntu218.04, mixed mode, sharing)\n", - " total used free shared buff/cache available\n", - "Mem: 12 0 6 0 5 11\n", - "Swap: 0 0 0\n" - ] - } - ], - "source": [ - "!java -version\n", - "!free -g" - ] - }, - { - "cell_type": "code", + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, "source": [ - "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/annotation/english/text-similarity/file1.csv\n", - "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/annotation/english/text-similarity/file2.csv\n" - ], - "metadata": { - "id": "y70C2BPeesRK", - "outputId": "fe18cfbc-6e2f-4cc5-c1ff-aa24f3bc6e9a", - "colab": { - "base_uri": 
"https://localhost:8080/" - } - }, - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 14:56:43-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/annotation/english/text-similarity/file1.csv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 297 [text/plain]\n", - "Saving to: ‘file1.csv’\n", - "\n", - "\rfile1.csv 0%[ ] 0 --.-KB/s \rfile1.csv 100%[===================>] 297 --.-KB/s in 0s \n", - "\n", - "2022-12-23 14:56:43 (9.62 MB/s) - ‘file1.csv’ saved [297/297]\n", - "\n", - "--2022-12-23 14:56:43-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/annotation/english/text-similarity/file2.csv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 262 [text/plain]\n", - "Saving to: ‘file2.csv’\n", - "\n", - "file2.csv 100%[===================>] 262 --.-KB/s in 0s \n", - "\n", - "2022-12-23 14:56:43 (9.84 MB/s) - ‘file2.csv’ saved [262/262]\n", - "\n" - ] - } + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb)\n", + "\n", + "# Calculating Text Similarity" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "Lja137H2cTv-" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "#!pip install --ignore-installed -q pyspark==2.4.5\n", - "#!gsutil cp gs://hadoop-lib/gcs/gcs-connector-hadoop2-latest.jar /opt/conda/lib/python3.7/site-packages/pyspark/jars/\n", - " \n", - "#!pip install --ignore-installed spark-nlp==2.5.1" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "meJjUs07cTv_", - "outputId": "d4292729-d1c6-46c6-ab87-78fa8e5d02ef", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "root 587 241 0 14:56 ? 00:00:00 /bin/bash -c ps -ef | grep spark\n", - "root 589 587 0 14:56 ? 
00:00:00 grep spark\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], "source": [ - "!ps -ef | grep spark" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "Niauzii1cTv_" - }, - "outputs": [], - "source": [ - "import sparknlp\n", - "import json\n", - "import os\n", "from pyspark.ml import Pipeline\n", - "from pyspark.sql import SparkSession\n", - "from sparknlp.annotator import *\n", - "from sparknlp.base import *\n", - "from pyspark.sql.functions import from_unixtime\n", - "from pyspark.sql.functions import unix_timestamp\n", + "from pyspark.ml.feature import *\n", "from pyspark.sql.functions import *\n", - "from pyspark.sql.functions import explode, col\n", - "from pyspark.sql.functions import from_unixtime, to_date, asc, year, udf, explode, split, col, desc, length, rank, dense_rank, avg, sum\n", - "from pyspark.sql.window import Window\n", - "from pyspark.ml.linalg import Vectors\n", - "from pyspark.ml.feature import VectorAssembler, StandardScaler\n", - "from pyspark.ml.stat import Correlation\n", - "from pyspark.ml.clustering import BisectingKMeans\n", - "from pyspark.ml.evaluation import ClusteringEvaluator\n", - "from pyspark.ml import Pipeline\n", - "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n", - "from pyspark.sql.functions import col, to_timestamp,date_format\n", - "from pyspark import StorageLevel\n", - "import pyspark.sql.functions as F\n", - "from sparknlp.pretrained import PretrainedPipeline\n", - "from collections import Counter\n", - "from sparknlp.base import Finisher, DocumentAssembler\n", - "from sparknlp.annotator import (Tokenizer, Normalizer,LemmatizerModel, StopWordsCleaner)\n", - "from pyspark.ml import Pipeline\n", - "import matplotlib.pyplot as plt\n", - "from pyspark.ml import Pipeline\n", - "\n", "from sparknlp.annotator import *\n", - "from sparknlp.common import *\n", "from sparknlp.base import *\n", - "from pyspark.ml.feature import Normalizer, 
SQLTransformer\n", - "from pyspark.ml.feature import BucketedRandomProjectionLSH\n", - "from pyspark.sql.functions import monotonically_increasing_id\n", + "from sparknlp.common import *\n", + "from sparknlp.functions import *\n", "\n", - "import pandas as pd\n", + "import sparknlp\n", "\n", + "spark = sparknlp.start()\n", "\n", - "spark = sparknlp.start()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "j47WpVxCcTv_", - "outputId": "2900b78b-b4a7-469c-b94c-d80d22987217", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "root 604 241 91 14:56 ? 00:00:46 /usr/lib/jvm/java-11-openjdk-amd64/bin/java -cp /usr/local/lib/python3.8/dist-packages/pyspark/conf:/usr/local/lib/python3.8/dist-packages/pyspark/jars/* -Xmx16G org.apache.spark.deploy.SparkSubmit --conf spark.master=local[*] --conf spark.driver.memory=16G --conf spark.kryoserializer.buffer.max=2000M --conf spark.driver.maxResultSize=0 --conf spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.6 --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.app.name=Spark NLP pyspark-shell\n", - "root 1314 241 0 14:57 ? 00:00:00 /bin/bash -c ps -ef | grep spark\n", - "root 1316 1314 0 14:57 ? 
00:00:00 grep spark\n" - ] - } - ], - "source": [ - "spark.version\n", - "!ps -ef | grep spark" + "print(\"Spark NLP version: \", sparknlp.version())\n", + "print(\"Apache Spark version: \", spark.version)" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "wlZc3EE3cTwA" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "primaryCorpus = spark.read.option(\"header\",\"true\").csv(\"file1.csv\")\n", @@ -253,18 +65,12 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "BiazE3F3cTwA", - "outputId": "93e6a4d2-15cb-4a2e-832a-115b6d2c7b39", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "bert_base_cased download started this may take some time.\n", "Approximate size to download 389.1 MB\n", @@ -273,27 +79,28 @@ } ], "source": [ + "from pyspark.ml.feature import Normalizer\n", + "\n", "documentAssembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", "\n", "sentence = SentenceDetector()\\\n", - " .setInputCols(\"document\")\\\n", - " .setOutputCol(\"sentence\")\\\n", - " .setExplodeSentences(False)\n", + " .setInputCols(\"document\")\\\n", + " .setOutputCol(\"sentence\")\\\n", + " .setExplodeSentences(False)\n", "\n", "tokenizer = Tokenizer()\\\n", " .setInputCols(['sentence'])\\\n", " .setOutputCol('token')\n", "\n", - "bertEmbeddings = BertEmbeddings\\\n", - " .pretrained('bert_base_cased', 'en') \\\n", - " .setInputCols([\"sentence\",'token'])\\\n", - " .setOutputCol(\"bert\")\\\n", - " .setCaseSensitive(False)\n", + "bertEmbeddings = BertEmbeddings.pretrained('bert_base_cased', 'en') \\\n", + " .setInputCols([\"sentence\",'token'])\\\n", + " .setOutputCol(\"bert\")\\\n", + " .setCaseSensitive(False)\n", "\n", "embeddingsSentence = SentenceEmbeddings() \\\n", - " .setInputCols([\"sentence\", \"bert\"]) \\\n", - 
" .setOutputCol(\"sentence_embeddings\") \\\n", - " .setPoolingStrategy(\"AVERAGE\")\n", + " .setInputCols([\"sentence\", \"bert\"]) \\\n", + " .setOutputCol(\"sentence_embeddings\") \\\n", + " .setPoolingStrategy(\"AVERAGE\")\n", "\n", "embeddingsFinisher = EmbeddingsFinisher() \\\n", " .setInputCols([\"sentence_embeddings\",\"bert\"]) \\\n", @@ -303,43 +110,39 @@ "\n", "\n", "explodeVectors = SQLTransformer() \\\n", - ".setStatement(\"SELECT EXPLODE(sentence_embeddings_vectors) AS features, * FROM __THIS__\")\n", + " .setStatement(\"SELECT EXPLODE(sentence_embeddings_vectors) AS features, * FROM __THIS__\")\n", "\n", "vectorNormalizer = Normalizer() \\\n", " .setInputCol(\"features\") \\\n", " .setOutputCol(\"normFeatures\") \\\n", " .setP(1.0)\n", "\n", - "similartyChecker = BucketedRandomProjectionLSH(inputCol=\"features\", outputCol=\"hashes\", bucketLength=6.0,numHashTables=6)\n", - " " + "similarityChecker = BucketedRandomProjectionLSH(inputCol=\"features\", outputCol=\"hashes\", bucketLength=6.0,numHashTables=6)" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "zgOBgMt_cTwA" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "pipeline = Pipeline() \\\n", - " .setStages([documentAssembler,\n", - " sentence,\n", - " tokenizer,\n", - " bertEmbeddings,\n", - " embeddingsSentence,\n", - " embeddingsFinisher,\n", - " explodeVectors,\n", - " vectorNormalizer,\n", - " similartyChecker])" + "pipeline = Pipeline().setStages([\n", + " documentAssembler,\n", + " sentence,\n", + " tokenizer,\n", + " bertEmbeddings,\n", + " embeddingsSentence,\n", + " embeddingsFinisher,\n", + " explodeVectors,\n", + " vectorNormalizer,\n", + " similarityChecker\n", + "])" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "I-b3JkN7cTwB" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipelineModel = pipeline.fit(primaryCorpus)\n", @@ -349,18 +152,12 @@ }, { "cell_type": 
"code", - "execution_count": 13, - "metadata": { - "id": "GBvI8DT6cTwB", - "outputId": "13e2900c-7c81-45b5-935b-88c02f1ae233", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+---+\n", "| text| features| normFeatures| lookupKey| id|\n", @@ -379,18 +176,12 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "oCtCzkSIcTwB", - "outputId": "7a1850c7-ec92-4d97-aa2d-a23130d8fd7e", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+---+\n", "| text| features| normFeatures| id|\n", @@ -409,32 +200,18 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "VwpGN_dwcTwB", - "outputId": "d51c037b-cc40-4dc6-90a3-420bc8de87b7", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.8/dist-packages/pyspark/sql/context.py:125: FutureWarning: Deprecated in 3.0.0. 
Use SparkSession.builder.getOrCreate() instead.\n", - " warnings.warn(\n" - ] - }, - { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+------------------+\n", "| idA| idB| distance|\n", "+--------------------+--------------------+------------------+\n", - "|iphone charger ph...|iphone case Apple...| 5.666233511624179|\n", "|Wall Decals Lamp ...|Curtains & Valanc...|3.7816639073044893|\n", + "|iphone charger ph...|iphone case Apple...| 5.666233511624179|\n", "+--------------------+--------------------+------------------+\n", "\n" ] @@ -451,27 +228,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "z8dpMJrpcTwB" - }, + "metadata": {}, "source": [ "## Approach 2" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "lS08MF9ucTwC", - "outputId": "71ba245a-d088-4291-ac1b-1e4f71390f54", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+--------------------+\n", "| primaryText| primaryFeatures| lookupKey| secondaryText| secondaryFeatures|\n", @@ -498,45 +267,13 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "IuAh7w2fcTwC", - "outputId": "3cd51280-724d-47c4-ee19-b418fdf7db64", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 191 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " primaryText \\\n", - "0 Wall Decals Lamp Shades Armchairs Bed Sheets N... \n", - "1 iphone charger phone Gift case iPhone holder s... \n", - "\n", - " primaryFeatures \\\n", - "0 [0.042425647377967834, -0.226881206035614, -0.... \n", - "1 [0.37093448638916016, 0.07500777393579483, -0.... 
\n", - "\n", - " lookupKey \\\n", - "0 bbc5a89d7cf3354ea4887c3690404ad8 \n", - "1 37c2b6ab956f9ebd6dccebd7623bf8c1 \n", - "\n", - " secondaryText \\\n", - "0 Curtains & Valances Wall Decals & Stickers Bed... \n", - "1 iphone case Apple ipod \n", - "\n", - " secondaryFeatures cosine \n", - "0 [0.3003387153148651, -0.022465573623776436, -0... 0.942328 \n", - "1 [0.4401525557041168, -0.09592525660991669, 0.0... 0.885493 " - ], "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", + "
\n", "\n", + " primaryFeatures \\\n", + "0 [0.042425647377967834, -0.226881206035614, -0.... \n", + "1 [0.37093448638916016, 0.07500777393579483, -0.... \n", "\n", - " \n", - "
\n", - "
\n", - " " + " secondaryFeatures cosine \n", + "0 [0.3003387153148651, -0.022465573623776436, -0... 0.942328 \n", + "1 [0.4401525557041168, -0.09592525660991669, 0.0... 0.885493 " ] }, + "execution_count": null, "metadata": {}, - "execution_count": 17 + "output_type": "execute_result" } ], "source": [ @@ -674,27 +357,12 @@ "finalDF['cosine'] = finalDF.apply(lambda row: 1-cosine(row['primaryFeatures'], row['secondaryFeatures']), axis=1)\n", "finalDF" ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "2uzNr89ocTwC" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "qyOkhPvEcTwC" - }, - "outputs": [], - "source": [] } ], "metadata": { + "colab": { + "provenance": [] + }, "environment": { "name": "tf2-2-2-cpu.2-2.m48", "type": "gcloud", @@ -714,11 +382,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/english/token-assembler/Assembling_Tokens_to_Documents.ipynb b/examples/python/annotation/text/english/token-assembler/Assembling_Tokens_to_Documents.ipynb index e0aa7268259b7e..33af57b1afef72 100644 --- a/examples/python/annotation/text/english/token-assembler/Assembling_Tokens_to_Documents.ipynb +++ b/examples/python/annotation/text/english/token-assembler/Assembling_Tokens_to_Documents.ipynb @@ -7,7 +7,7 @@ "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/example/python/annotation/text/english/token-assembler/Assembling_Tokens_to_Documents.ipynb)\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/token-assembler/Assembling_Tokens_to_Documents.ipynb)\n", "\n", "\n", "# **Assembling Tokens to Documents**\n", @@ -27,7 +27,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q pyspark==3.3.0 spark-nlp==4.3.0" + "!pip install -q pyspark==3.3.0 spark-nlp==4.3.1" ] }, { @@ -39,7 +39,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 4.2.8\n", + "Warning::Spark Session already created, some configs may not take.\n", + "Spark NLP version: 4.3.1\n", "Apache Spark version: 3.3.0\n" ] }, @@ -69,7 +70,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": null, @@ -485,7 +486,7 @@ }, "gpuClass": "standard", "kernelspec": { - "display_name": "nlpdev", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -499,11 +500,6 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" - }, - "vscode": { - "interpreter": { - "hash": "cf73c0c97d90b2660ff29b0c9bed4b851524d3484a00df4555e25832aa5cf188" - } } }, "nbformat": 4, diff --git a/examples/python/annotation/text/french/MultiDateMatcherMultiLanguage_fr.ipynb b/examples/python/annotation/text/french/MultiDateMatcherMultiLanguage_fr.ipynb index ca5de61aa3bbb3..db0795764e6fec 100644 --- a/examples/python/annotation/text/french/MultiDateMatcherMultiLanguage_fr.ipynb +++ b/examples/python/annotation/text/french/MultiDateMatcherMultiLanguage_fr.ipynb @@ -1,161 +1,74 @@ { "cells": [ { - "cell_type": "code", - "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", - "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ], - "metadata": { - "id": "TysiusM56OOA", - "outputId": "95a17a2c-ffa6-42bf-85c4-364e2e12a461", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "id": "TysiusM56OOA", - "execution_count": 1, - "outputs": [ - 
{ - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 12:15:57-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 12:15:57-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 12:15:58-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 12:15:58 (62.4 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 46 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 55.7 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 69.7 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n" - ] - } - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d1a9947b", - "metadata": { - "id": "d1a9947b" - }, - "outputs": [], - "source": [ - "from pyspark import *\n", - "import sparknlp" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "1d695f9d", - "metadata": { - "id": "1d695f9d" - }, - "outputs": [], + "attachments": {}, + "cell_type": "markdown", + "id": "6324b870", + "metadata": {}, "source": [ - "spark = sparknlp.start()" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/french/MultiDateMatcherMultiLanguage_fr.ipynb)\n", + "\n", + "# MultiDateMatcher in French" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "6edb5c48", - "metadata": { - "id": "6edb5c48" - }, + "execution_count": null, + "id": "TysiusM56OOA", + "metadata": {}, "outputs": [], "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "b072abfa", - "metadata": { - "id": "b072abfa", - "outputId": "2bbe1b6a-0f20-4382-9440-60d0b8858d8f", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } - }, + "execution_count": null, + "id": "d1a9947b", + "metadata": {}, "outputs": [ { - "output_type": "execute_result", - "data": { - "text/plain": [ - "'4.2.6'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } - }, - "metadata": {}, - "execution_count": 5 + "name": "stdout", + "output_type": "stream", + "text": [ + "4.3.1\n", + "3.3.0\n" + ] } ], "source": [ - "sparknlp.version()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - 
"id": "84dc2c25", - "metadata": { - "id": "84dc2c25" - }, - "outputs": [], - "source": [ - "from pyspark.sql.types import StringType" + "from pyspark import *\n", + "from pyspark.sql.types import StringType\n", + "\n", + "import sparknlp\n", + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "spark = sparknlp.start()\n", + "print(sparknlp.version())\n", + "print(spark.version)" ] }, { "cell_type": "markdown", "id": "bfed9d58", - "metadata": { - "id": "bfed9d58" - }, + "metadata": {}, "source": [ "## French formatted dates matching examples" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "0eb63b0a", - "metadata": { - "id": "0eb63b0a", - "outputId": "ef719ebb-929b-4a44-cb94-6f5dc0dbca86", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -175,19 +88,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "3a13f2b6", - "metadata": { - "id": "3a13f2b6", - "outputId": "b3bde19c-d2ad-493e-882d-55844ae80144", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------+\n", "|date |\n", @@ -216,28 +123,20 @@ { "cell_type": "markdown", "id": "cd208c73", - "metadata": { - "id": "cd208c73" - }, + "metadata": {}, "source": [ "## French unformatted dates matching examples" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "bb2f7595", - "metadata": { - "id": "bb2f7595", - "outputId": "3ff6c695-9afe-4ea5-8bff-dd763b14ffa3", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": 
"stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -257,24 +156,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "14a99318", - "metadata": { - "id": "14a99318", - "outputId": "e31de038-a328-4e37-fc04-ebcd34d6c27f", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------+\n", "|date |\n", "+--------------------------------------------------------------------------------------------------+\n", - "|[{date, 28, 37, 12/21/2022, {sentence -> 0}, []}, {date, 80, 88, 12/30/2022, {sentence -> 0}, []}]|\n", + "|[{date, 28, 37, 02/18/2023, {sentence -> 0}, []}, {date, 80, 88, 02/27/2023, {sentence -> 0}, []}]|\n", "+--------------------------------------------------------------------------------------------------+\n", "\n" ] @@ -298,9 +191,7 @@ { "cell_type": "markdown", "id": "d60d29c5", - "metadata": { - "id": "d60d29c5" - }, + "metadata": {}, "source": [ "# A short guide to language support extension\n", "\n", @@ -315,19 +206,12 @@ "\n", "Thank you for contributing! 
:)" ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "d739a26c", - "metadata": { - "id": "d739a26c" - }, - "outputs": [], - "source": [] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -342,11 +226,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/french/date_matcher_multi_language_fr.ipynb b/examples/python/annotation/text/french/date_matcher_multi_language_fr.ipynb index a844c57d3ec4f1..7219df098e050f 100644 --- a/examples/python/annotation/text/french/date_matcher_multi_language_fr.ipynb +++ b/examples/python/annotation/text/french/date_matcher_multi_language_fr.ipynb @@ -1,77 +1,32 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "EIKUgFUO6Ks7" - }, + "metadata": {}, "source": [ - "# DateMatcher multi-language\n", + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/french/date_matcher_multi_language_fr.ipynb)\n", "\n", - "#### This annotator allows you to specify a source language that will be used to identify temporal keywords and extract dates." + "# DateMatcher multi-language (French)\n", + "This annotator allows you to specify a source language that will be used to identify temporal keywords and extract dates." 
] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ], - "metadata": { - "id": "wEzGFRu06LRA", - "outputId": "16f76462-9b3f-4810-8c62-0526a05dace2", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 12:15:39-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 12:15:39-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 12:15:39-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 12:15:39 (35.7 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 57 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 51.2 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 83.1 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "943a272c-0686-4e02-a8d9-b2849721c829", - "showTitle": false, - "title": "" - }, - "id": "snWEWQPW6Ks9" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Import Spark NLP\n", @@ -89,28 +44,11 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "b200e2aa-6280-4f51-9eb4-e30f660e2ba4", - "showTitle": false, - "title": "" - }, - "id": "xDQ3AELm6Ks-", - "outputId": "547e834b-8ccb-45c9-8653-178beb988bf9", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 219 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "" - ], "text/html": [ "\n", "
\n", @@ -119,11 +57,11 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", - "
v3.2.3
\n", + "
v3.3.0
\n", "
Master
\n", "
local[*]
\n", "
AppName
\n", @@ -133,10 +71,14 @@ " \n", "
\n", " " + ], + "text/plain": [ + "" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 3 + "output_type": "execute_result" } ], "source": [ @@ -145,34 +87,18 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c0b759a0-346f-4d9f-9f01-383124c0aa05", - "showTitle": false, - "title": "" - }, - "id": "cYA0Xhws6Ks_", - "outputId": "4d0bcc8c-6cc1-4236-ecee-f183668c306d", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ - "'4.2.6'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } + "'4.3.1'" + ] }, + "execution_count": null, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], "source": [ @@ -181,34 +107,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "DM91YCJJ6Ks_" - }, + "metadata": {}, "source": [ "# French examples" ] }, { "cell_type": "markdown", - "metadata": { - "id": "8_mlITBN6Ks_" - }, + "metadata": {}, "source": [ "### Let's import some articoles sentences from the news where relative dates are present." 
] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "a91c2626-5ef8-4e01-9563-120daf4f63f3", - "showTitle": false, - "title": "" - }, - "id": "gedTbW8-6Ks_" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "fr_articles = [\n", @@ -219,33 +133,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "8LR9O-Ck6KtA" - }, + "metadata": {}, "source": [ "### Let's fill a DataFrame with the text column" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "cfe3f9e0-4a96-44bb-b056-0b4a5407c6dc", - "showTitle": false, - "title": "" - }, - "id": "9Aaa1EMg6KtA", - "outputId": "94220058-9895-4391-930f-d4e83cbe2e69", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "root\n", " |-- text: string (nullable = true)\n", @@ -271,25 +171,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-CouoUbh6KtB" - }, + "metadata": {}, "source": [ "### Now, let's create a simple pipeline to apply the DateMatcher, specifying the source language" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "f4baf2a1-3e75-479e-9e9b-2b071624ee3d", - "showTitle": false, - "title": "" - }, - "id": "p0g2aabO6KtB" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler() \\\n", @@ -305,10 +195,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "5zcbvoMJ6KtB" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "### Let's transform the Data" @@ -316,24 +204,12 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - 
"application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "10380fbb-43c1-42c3-b6d0-f02e55d75a24", - "showTitle": false, - "title": "" - }, - "id": "bxLOMmBn6KtC", - "outputId": "027b1e63-e724-4831-a160-4adae85fb8f6", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+-------------------------------------------------+\n", "|date |\n", @@ -362,6 +238,9 @@ "notebookOrigID": 2439167545177012, "widgets": {} }, + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -376,11 +255,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/german/MultiDateMatcherMultiLanguage_de.ipynb b/examples/python/annotation/text/german/MultiDateMatcherMultiLanguage_de.ipynb index 6956a10fca8c00..3975073b6f4c56 100644 --- a/examples/python/annotation/text/german/MultiDateMatcherMultiLanguage_de.ipynb +++ b/examples/python/annotation/text/german/MultiDateMatcherMultiLanguage_de.ipynb @@ -1,161 +1,74 @@ { "cells": [ { - "cell_type": "code", - "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", - "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ], - "metadata": { - "id": "gTr7G0V76VmI", - "outputId": "7edf4b90-6608-461b-e72c-db6f9f137389", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "id": "gTr7G0V76VmI", - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 12:16:14-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 
51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 12:16:14-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 12:16:14-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "\r- 0%[ ] 0 --.-KB/s \r- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 12:16:14 (40.3 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 54 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 58.8 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 63.8 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n" - ] - } - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d1a9947b", - "metadata": { - "id": "d1a9947b" - }, - "outputs": [], - "source": [ - "from pyspark import *\n", - "import sparknlp" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "1d695f9d", - "metadata": { - "id": "1d695f9d" - }, - "outputs": [], + "attachments": {}, + "cell_type": "markdown", + "id": "91762c58", + "metadata": {}, "source": [ - "spark = sparknlp.start()" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/german/MultiDateMatcherMultiLanguage_de.ipynb)\n", + "\n", + "# MultiDateMatcher in German" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "6edb5c48", - "metadata": { - "id": "6edb5c48" - }, + "execution_count": null, + "id": "gTr7G0V76VmI", + "metadata": {}, "outputs": [], "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "b072abfa", - "metadata": { - "id": "b072abfa", - "outputId": "52330838-26ff-4e14-9f4e-c4de20229eaf", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } - }, + "execution_count": null, + "id": "ec96c63b", + "metadata": {}, "outputs": [ { - "output_type": "execute_result", - "data": { - "text/plain": [ - "'4.2.6'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } - }, - "metadata": {}, - "execution_count": 10 + "name": "stdout", + "output_type": "stream", + "text": [ + "4.3.1\n", + "3.3.0\n" + ] } ], "source": [ - "sparknlp.version()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, 
- "id": "84dc2c25", - "metadata": { - "id": "84dc2c25" - }, - "outputs": [], - "source": [ - "from pyspark.sql.types import StringType" + "from pyspark import *\n", + "from pyspark.sql.types import StringType\n", + "\n", + "import sparknlp\n", + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "spark = sparknlp.start()\n", + "print(sparknlp.version())\n", + "print(spark.version)" ] }, { "cell_type": "markdown", "id": "10a075ae", - "metadata": { - "id": "10a075ae" - }, + "metadata": {}, "source": [ "## German formatted dates matching examples" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "3c6f02e6", - "metadata": { - "id": "3c6f02e6", - "outputId": "77eaa30d-3f9f-4468-84d8-b983c8c8dc4a", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -175,19 +88,13 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "02a6ce5c", - "metadata": { - "id": "02a6ce5c", - "outputId": "97c203db-847e-49ca-d28d-d4d831326fda", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------+\n", "|date |\n", @@ -216,28 +123,20 @@ { "cell_type": "markdown", "id": "5d3c8b51", - "metadata": { - "id": "5d3c8b51" - }, + "metadata": {}, "source": [ "## German unformatted dates matching examples" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "454e4569", - "metadata": { - "id": "454e4569", - "outputId": "9d8e6032-1126-4977-d412-32e47ddc217a", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": 
"stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -257,24 +156,18 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "0f7ad78b", - "metadata": { - "id": "0f7ad78b", - "outputId": "4683281e-90d8-4eed-bf25-40fc24247603", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------+\n", "|date |\n", "+--------------------------------------------------------------------------------------------------+\n", - "|[{date, 14, 23, 12/21/2022, {sentence -> 0}, []}, {date, 85, 93, 12/30/2022, {sentence -> 0}, []}]|\n", + "|[{date, 14, 23, 02/18/2023, {sentence -> 0}, []}, {date, 85, 93, 02/27/2023, {sentence -> 0}, []}]|\n", "+--------------------------------------------------------------------------------------------------+\n", "\n" ] @@ -298,9 +191,7 @@ { "cell_type": "markdown", "id": "9bb16cf4", - "metadata": { - "id": "9bb16cf4" - }, + "metadata": {}, "source": [ "# A short guide to language support extension\n", "\n", @@ -315,19 +206,12 @@ "\n", "Thank you for contributing! 
:)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d604d5d", - "metadata": { - "id": "8d604d5d" - }, - "outputs": [], - "source": [] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -342,11 +226,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/german/date_matcher_multi_language_de.ipynb b/examples/python/annotation/text/german/date_matcher_multi_language_de.ipynb index dbe4bd597aa18d..45d1df5c9adbbd 100644 --- a/examples/python/annotation/text/german/date_matcher_multi_language_de.ipynb +++ b/examples/python/annotation/text/german/date_matcher_multi_language_de.ipynb @@ -1,77 +1,32 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "xqh_a_ng6bdm" - }, + "metadata": {}, "source": [ - "# DateMatcher multi-language\n", + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/german/date_matcher_multi_language_de.ipynb)\n", "\n", - "#### This annotator allows you to specify a source language that will be used to identify temporal keywords and extract dates." + "# DateMatcher multi-language (German)\n", + "This annotator allows you to specify a source language that will be used to identify temporal keywords and extract dates." 
] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ], - "metadata": { - "id": "TnwUoG4a6bvA", - "outputId": "2e54a2a7-b447-4acd-929f-ab4eb9813c1e", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 12:16:55-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 12:16:56-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 12:16:57-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 12:16:57 (56.9 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 50 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 45.0 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 47.9 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "943a272c-0686-4e02-a8d9-b2849721c829", - "showTitle": false, - "title": "" - }, - "id": "lm9NuDi16bdp" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Import Spark NLP\n", @@ -89,28 +44,11 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "b200e2aa-6280-4f51-9eb4-e30f660e2ba4", - "showTitle": false, - "title": "" - }, - "id": "qebaKHlY6bdp", - "outputId": "eff22721-b886-4635-ce8b-983bacb5243f", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 219 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "" - ], "text/html": [ "\n", "
\n", @@ -119,11 +57,11 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", - "
v3.2.3
\n", + "
v3.3.0
\n", "
Master
\n", "
local[*]
\n", "
AppName
\n", @@ -133,10 +71,14 @@ " \n", "
\n", " " + ], + "text/plain": [ + "" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 3 + "output_type": "execute_result" } ], "source": [ @@ -145,34 +87,18 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c0b759a0-346f-4d9f-9f01-383124c0aa05", - "showTitle": false, - "title": "" - }, - "id": "L0P0Q19h6bdq", - "outputId": "d2cca706-a6e5-4758-9831-86f4e76ed955", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ - "'4.2.6'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } + "'4.3.1'" + ] }, + "execution_count": null, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], "source": [ @@ -181,34 +107,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "XI5qtquJ6bdq" - }, + "metadata": {}, "source": [ "# German examples" ] }, { "cell_type": "markdown", - "metadata": { - "id": "b8YhBTes6bdr" - }, + "metadata": {}, "source": [ "### Let's import some articoles sentences from the news where relative dates are present." 
] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "a91c2626-5ef8-4e01-9563-120daf4f63f3", - "showTitle": false, - "title": "" - }, - "id": "g-f8stEN6bdr" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "de_articles = [\n", @@ -219,33 +133,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mAZEaSKk6bdr" - }, + "metadata": {}, "source": [ "### Let's fill a DataFrame with the text column" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "cfe3f9e0-4a96-44bb-b056-0b4a5407c6dc", - "showTitle": false, - "title": "" - }, - "id": "pBqjZrL86bdr", - "outputId": "68aa6228-76bb-42ca-a930-88849e90b39e", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "root\n", " |-- text: string (nullable = true)\n", @@ -271,25 +171,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "3meR5x4c6bds" - }, + "metadata": {}, "source": [ "### Now, let's create a simple pipeline to apply the DateMatcher, specifying the source language" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "f4baf2a1-3e75-479e-9e9b-2b071624ee3d", - "showTitle": false, - "title": "" - }, - "id": "Hzo5woaF6bds" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler() \\\n", @@ -305,10 +195,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "yNqvoyZt6bds" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "### Let's transform the Data" @@ -316,24 +204,12 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - 
"application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "10380fbb-43c1-42c3-b6d0-f02e55d75a24", - "showTitle": false, - "title": "" - }, - "id": "DsxAdzNy6bds", - "outputId": "39105d32-c166-4a31-8d37-3bb00af2cc17", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+-------------------------------------------------+\n", "|date |\n", @@ -349,15 +225,6 @@ "assembled = document_assembler.transform(df)\n", "date_matcher.transform(assembled).select('date').show(10, False)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51_6lDzf6bds" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -371,6 +238,9 @@ "notebookOrigID": 2439167545177012, "widgets": {} }, + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -385,11 +255,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/german/pretrained_german_models.ipynb b/examples/python/annotation/text/german/pretrained_german_models.ipynb index b2a6f149961fb7..86cd7810fbc52c 100644 --- a/examples/python/annotation/text/german/pretrained_german_models.ipynb +++ b/examples/python/annotation/text/german/pretrained_german_models.ipynb @@ -1,72 +1,30 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "XA9scnGrLCn2" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/german/pretrained-german-models.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/german/pretrained_german_models.ipynb)\n", "\n", - "## 0. Colab Setup" + "# Running Pretrained Pipelines For German Language Texts" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Dm-qYk1nH9Qx", - "outputId": "35ae2816-3b9a-46bf-88b9-8b5704d40a56" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 12:20:19-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 12:20:20-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 12:20:20-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 12:20:20 (41.1 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 54 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 56.3 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 64.7 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "id": "gTvXsYwYGrXA" - }, + "metadata": {}, "source": [ "### German models specs\n", "\n", @@ -79,10 +37,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "QGc8b0-yGrXC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -95,21 +51,15 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "SY5sbRNUGrXI", - "outputId": "f3c56aed-8c58-47dd-f7c5-c6c7604c829e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "Spark NLP version: 4.2.6\n", - "Apache Spark version: 3.2.3\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -122,10 +72,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "fNfAQEa2GrXP" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "dfTest = spark.createDataFrame([\n", @@ 
-136,9 +84,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "DgmHq9mYGrXV" - }, + "metadata": {}, "source": [ "### Pretrained Pipelines in German\n", "#### explain_document_md (glove_6B_300)" @@ -146,18 +92,12 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "iYU-OmoJGrXW", - "outputId": "644181e5-9975-4061-8c75-3fc315cc4b59" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "explain_document_md download started this may take some time.\n", "Approx size to download 452.4 MB\n", @@ -171,18 +111,12 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "2wfeCpX7GrXa", - "outputId": "bdf98869-59df-4ac9-dd04-cefe64ac4588" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+----------+----------+----------+----------+----------+----------+----------+----------+----------+\n", "| text| document| sentence| token| lemma| pos|embeddings| ner| entities|\n", @@ -200,19 +134,12 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0_M6Ks9lGrXe", - "outputId": "366be6d4-5e6b-4a7c-9224-e2aeaa919483", - "scrolled": true - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+----------------------------------------------------------------------+\n", "| result|\n", @@ -246,27 +173,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "xISxmUMhGrX2" - }, + "metadata": {}, "source": [ "#### entity_recognizer_md (glove_6B_300)" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/" - }, - "id": "--rX-7QNGrX3", - "outputId": "4a7d2d8e-357e-42f5-bd76-2fa6f702b4f8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "entity_recognizer_md download started this may take some time.\n", "Approx size to download 443.7 MB\n", @@ -280,18 +199,12 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wvWLZAsAGrX8", - "outputId": "7cdd2ef0-f5c1-439c-9ae2-794f2688bc7b" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+----------+----------+----------+----------+----------+----------+----------+\n", "| text| document| sentence| token|embeddings| ner| entities|\n", @@ -309,18 +222,12 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "M3vlrh_vGrYC", - "outputId": "5069b648-df0b-47bd-aa0c-b0e0ec085bd1" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+----------------------------------------------------------------------+\n", "| result|\n", @@ -354,27 +261,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "0QdmUQdSGrYI" - }, + "metadata": {}, "source": [ "#### entity_recognizer_lg (glove_840B_300)" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kjBzcacjGrYJ", - "outputId": "ee0b9106-d1cc-4a57-a0b7-f7db9a6d07a7" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "entity_recognizer_lg download started this may take some time.\n", "Approx size to download 2.3 GB\n", @@ -388,18 +287,12 @@ }, { 
"cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "n10-XfpQGrYN", - "outputId": "391607a8-656e-433a-ee06-8fed46eb3826" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+----------+----------+----------+----------+----------+----------+----------+\n", "| text| document| sentence| token|embeddings| ner| entities|\n", @@ -417,18 +310,12 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zZqLfnw9GrYV", - "outputId": "13e7c4bf-6175-4b08-b533-730c77665c70" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+----------------------------------------------------------------------+\n", "| result|\n", @@ -462,27 +349,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "XKPV6SQFGrYa" - }, + "metadata": {}, "source": [ "### Pretrained Models in German" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "IVfLuZ78GrYb", - "outputId": "b078f853-a19e-4d9f-d4af-30d135297fa5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "lemma download started this may take some time.\n", "Approximate size to download 4 MB\n", @@ -542,10 +421,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "Kmc2VBqhGrYf" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "prediction = prediction_pipeline.fit(dfTest).transform(dfTest)" @@ -553,18 +430,12 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "lH4RDQIEGrYi", - "outputId": 
"cc590d5f-94a6-4a86-dcb1-ef417820561b" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+----------------------------------------------------------------------+\n", "| result|\n", @@ -596,15 +467,6 @@ "prediction.select(\"pos.result\").show(2, truncate=70)\n", "prediction.select(\"ner.result\").show(2, truncate=70)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MCovIRpoGrYm" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -627,8 +489,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/italian/MultiDateMatcherMultiLanguage_it.ipynb b/examples/python/annotation/text/italian/MultiDateMatcherMultiLanguage_it.ipynb index d9e7a0060d3a31..3791d2f576c0bb 100644 --- a/examples/python/annotation/text/italian/MultiDateMatcherMultiLanguage_it.ipynb +++ b/examples/python/annotation/text/italian/MultiDateMatcherMultiLanguage_it.ipynb @@ -1,161 +1,73 @@ { "cells": [ { - "cell_type": "code", - "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", - "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ], - "metadata": { - "id": "LtDsLK407X92", - "outputId": "13bbdd9c-63cc-430f-fe93-aa8e57d941a1", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "id": "LtDsLK407X92", - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 12:22:23-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 
302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 12:22:23-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 12:22:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 12:22:24 (43.8 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 51 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 45.1 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 69.3 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n" - ] - } - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d1a9947b", - "metadata": { - "id": "d1a9947b" - }, - "outputs": [], - "source": [ - "from pyspark import *\n", - "import sparknlp" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1d695f9d", - "metadata": { - "id": "1d695f9d" - }, - "outputs": [], + "cell_type": "markdown", + "id": "cf3d94d0", + "metadata": {}, "source": [ - "spark = sparknlp.start()" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/italian/MultiDateMatcherMultiLanguage_it.ipynb)\n", + "\n", + "# MultiDateMatcher in Italian" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "6edb5c48", - "metadata": { - "id": "6edb5c48" - }, + "execution_count": null, + "id": "LtDsLK407X92", + "metadata": {}, "outputs": [], "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "b072abfa", - "metadata": { - "id": "b072abfa", - "outputId": "7d825d81-b240-41de-a451-2f8a7245d05b", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } - }, + "execution_count": null, + "id": "48558d29", + "metadata": {}, "outputs": [ { - "output_type": "execute_result", - "data": { - "text/plain": [ - "'4.2.6'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } - }, - "metadata": {}, - "execution_count": 5 + "name": "stdout", + "output_type": "stream", + "text": [ + "4.3.1\n", + "3.3.0\n" + ] } ], "source": [ - "sparknlp.version()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "84dc2c25", - 
"metadata": { - "id": "84dc2c25" - }, - "outputs": [], - "source": [ - "from pyspark.sql.types import StringType" + "from pyspark import *\n", + "from pyspark.sql.types import StringType\n", + "\n", + "import sparknlp\n", + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "spark = sparknlp.start()\n", + "print(sparknlp.version())\n", + "print(spark.version)" ] }, { "cell_type": "markdown", "id": "dab1ddd2", - "metadata": { - "id": "dab1ddd2" - }, + "metadata": {}, "source": [ "## Italian formatted dates matching examples" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "6d6b87ad", - "metadata": { - "id": "6d6b87ad", - "outputId": "230f84af-b89f-4a41-a9d3-95ea352c5f76", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -175,19 +87,13 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "6c68565a", - "metadata": { - "id": "6c68565a", - "outputId": "0d05d6d1-782a-4dcc-aa16-9d7589feab9f", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------+\n", "|date |\n", @@ -216,28 +122,20 @@ { "cell_type": "markdown", "id": "f0c2c655", - "metadata": { - "id": "f0c2c655" - }, + "metadata": {}, "source": [ "## Italian unformatted dates matching examples" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "20f7f76a", - "metadata": { - "id": "20f7f76a", - "outputId": "e428063a-e653-4ffe-a40b-0790c4dce137", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": 
"stream", "text": [ "+--------------------+\n", "| text|\n", @@ -257,24 +155,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "6060220a", - "metadata": { - "id": "6060220a", - "outputId": "de16b3d3-d4b3-46eb-8fcc-e38f325a9ca9", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------+\n", "|date |\n", "+--------------------------------------------------------------------------------------------------+\n", - "|[{date, 20, 29, 12/21/2022, {sentence -> 0}, []}, {date, 69, 77, 12/30/2022, {sentence -> 0}, []}]|\n", + "|[{date, 20, 29, 02/18/2023, {sentence -> 0}, []}, {date, 69, 77, 02/27/2023, {sentence -> 0}, []}]|\n", "+--------------------------------------------------------------------------------------------------+\n", "\n" ] @@ -298,9 +190,7 @@ { "cell_type": "markdown", "id": "51a37f93", - "metadata": { - "id": "51a37f93" - }, + "metadata": {}, "source": [ "# A short guide to language support extension\n", "\n", @@ -315,19 +205,12 @@ "\n", "Thank you for contributing! 
:)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f5f0959", - "metadata": { - "id": "9f5f0959" - }, - "outputs": [], - "source": [] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -342,11 +225,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/italian/date_matcher_multi_language_it.ipynb b/examples/python/annotation/text/italian/date_matcher_multi_language_it.ipynb index a845f50b6d7be7..6f67aa07ad6503 100644 --- a/examples/python/annotation/text/italian/date_matcher_multi_language_it.ipynb +++ b/examples/python/annotation/text/italian/date_matcher_multi_language_it.ipynb @@ -1,77 +1,32 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "Yo-UWYoz8DLl" - }, + "metadata": {}, "source": [ - "# DateMatcher multi-language\n", + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/italian/date_matcher_multi_language_it.ipynb)\n", "\n", - "#### This annotator allows you to specify a source language that will be used to identify temporal keywords and extract dates." + "# DateMatcher multi-language (Italian)\n", + "This annotator allows you to specify a source language that will be used to identify temporal keywords and extract dates." 
] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ], - "metadata": { - "id": "tlt9hbXG8De3", - "outputId": "33cc8225-608e-43e8-a4be-da07d95fee3e", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 12:23:48-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 12:23:48-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 12:23:49-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "2022-12-23 12:23:49 (71.8 MB/s) - written to stdout [1191/1191]\n", - "\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 48 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 57.5 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 59.9 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "943a272c-0686-4e02-a8d9-b2849721c829", - "showTitle": false, - "title": "" - }, - "id": "0i2rXCHF8DLn" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Import Spark NLP\n", @@ -89,28 +44,11 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "b200e2aa-6280-4f51-9eb4-e30f660e2ba4", - "showTitle": false, - "title": "" - }, - "id": "nBNHpwA68DLo", - "outputId": "21b4193a-af49-482f-ea59-c9845a206f3c", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 219 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "" - ], "text/html": [ "\n", "
\n", @@ -119,11 +57,11 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", - "
v3.2.3
\n", + "
v3.3.0
\n", "
Master
\n", "
local[*]
\n", "
AppName
\n", @@ -133,10 +71,14 @@ " \n", "
\n", " " + ], + "text/plain": [ + "" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 3 + "output_type": "execute_result" } ], "source": [ @@ -145,34 +87,18 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c0b759a0-346f-4d9f-9f01-383124c0aa05", - "showTitle": false, - "title": "" - }, - "id": "rKc77PnJ8DLp", - "outputId": "0e0447dd-b679-4507-fe63-6c04cf0263e4", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ - "'4.2.6'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } + "'4.3.1'" + ] }, + "execution_count": null, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], "source": [ @@ -181,34 +107,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "K-9w5xeN8DLp" - }, + "metadata": {}, "source": [ "# Italian examples" ] }, { "cell_type": "markdown", - "metadata": { - "id": "RZCNgyNz8DLp" - }, + "metadata": {}, "source": [ "### Let's import some articoles sentences from the news where relative dates are present." 
] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "a91c2626-5ef8-4e01-9563-120daf4f63f3", - "showTitle": false, - "title": "" - }, - "id": "h-dS7rld8DLq" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "it_articles = [\n", @@ -220,33 +134,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "vHQaGOwf8DLq" - }, + "metadata": {}, "source": [ "### Let's fill a DataFrame with the text column" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "cfe3f9e0-4a96-44bb-b056-0b4a5407c6dc", - "showTitle": false, - "title": "" - }, - "id": "ZHePNCMp8DLq", - "outputId": "6b842c58-a715-43fc-d177-d6940cea64d9", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "root\n", " |-- text: string (nullable = true)\n", @@ -273,25 +173,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "D6o9j43A8DLq" - }, + "metadata": {}, "source": [ "### Now, let's create a simple pipeline to apply the DateMatcher, specifying the source language" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "f4baf2a1-3e75-479e-9e9b-2b071624ee3d", - "showTitle": false, - "title": "" - }, - "id": "9CA3uIza8DLr" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler() \\\n", @@ -307,40 +197,26 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "S5vQ97qx8DLr" - }, + "metadata": {}, "source": [ "### Let's transform the DataFrame content to extract the dates" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - 
"nuid": "10380fbb-43c1-42c3-b6d0-f02e55d75a24", - "showTitle": false, - "title": "" - }, - "id": "7OL5QJoI8DLr", - "outputId": "01608b02-66cc-44cb-de12-bdea626000f8", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+---------------------------------------------------+\n", "|date |\n", "+---------------------------------------------------+\n", - "|[{date, 175, 183, 12/23/2022, {sentence -> 0}, []}]|\n", + "|[{date, 175, 183, 02/20/2023, {sentence -> 0}, []}]|\n", "|[{date, 91, 102, 07/13/2021, {sentence -> 0}, []}] |\n", - "|[{date, 61, 69, 12/22/2022, {sentence -> 0}, []}] |\n", + "|[{date, 61, 69, 02/19/2023, {sentence -> 0}, []}] |\n", "+---------------------------------------------------+\n", "\n" ] @@ -350,15 +226,6 @@ "assembled = document_assembler.transform(df)\n", "date_matcher.transform(assembled).select('date').show(10, False)" ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "eCz5i2D48lhu" - }, - "execution_count": null, - "outputs": [] } ], "metadata": { @@ -372,6 +239,9 @@ "notebookOrigID": 2439167545177012, "widgets": {} }, + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -386,11 +256,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/multilingual/SentenceDetectorDL.ipynb b/examples/python/annotation/text/multilingual/SentenceDetectorDL.ipynb index 2822286cdec893..e6e41e1c48298a 100644 --- a/examples/python/annotation/text/multilingual/SentenceDetectorDL.ipynb +++ b/examples/python/annotation/text/multilingual/SentenceDetectorDL.ipynb @@ -12,7 +12,7 @@ "cell_type": "markdown", "metadata": {}, 
"source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/example/python/annotation/text/multilingual/SentenceDetectorDL.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/multilingual/SentenceDetectorDL.ipynb)" ] }, { @@ -94,7 +94,8 @@ "metadata": {}, "outputs": [], "source": [ - "! pip install -q pyspark==3.3.0 spark-nlp==4.3.0" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { @@ -106,7 +107,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version 4.3.0\n", + "Spark NLP version 4.3.1\n", "Apache Spark version: 3.3.0\n" ] }, @@ -120,7 +121,7 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", @@ -136,7 +137,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": null, @@ -215,7 +216,7 @@ ], "source": [ "text = \"\"\"John loves Mary.mary loves Peter\n", - " Peter loves Helen .Helen loves John; \n", + " Peter loves Helen .Helen loves John;\n", " Total: four. people involved.\"\"\"\n", "\n", "for anno in sd_model.fullAnnotate(text)[0][\"sentences\"]:\n", @@ -258,16 +259,16 @@ ], "source": [ "text = '''\n", - "There are many NLP tasks like text summarization, question-answering, sentence prediction to name a few. One method to get\\n these tasks done is using a pre-trained model. Instead of training \n", - "a model from scratch for NLP tasks using millions of annotated texts each time, a general language representation is created by training a model on a huge amount of data. This is called a pre-trained model. This pre-trained model is \n", + "There are many NLP tasks like text summarization, question-answering, sentence prediction to name a few. One method to get\\n these tasks done is using a pre-trained model. Instead of training\n", + "a model from scratch for NLP tasks using millions of annotated texts each time, a general language representation is created by training a model on a huge amount of data. This is called a pre-trained model. This pre-trained model is\n", "then fine-tuned for each NLP tasks according to need.\n", "Let’s just peek into the pre-BERT world…\n", - "For creating models, we need words to be represented in a form \\n understood by the training network, ie, numbers. Thus many algorithms were used to convert words into vectors or more precisely, word embeddings. \n", + "For creating models, we need words to be represented in a form \\n understood by the training network, ie, numbers. Thus many algorithms were used to convert words into vectors or more precisely, word embeddings.\n", "One of the earliest algorithms used for this purpose is word2vec. However, the drawback of word2vec models was that they were context-free. 
One problem caused by this is that they cannot accommodate polysemy. For example, the word ‘letter’ has a different meaning according to the context. It can mean ‘single element of alphabet’ or ‘document addressed to another person’. But in word2vec both the letter returns same embeddings.\n", "'''\n", "\n", "for anno in sd_model.fullAnnotate(text)[0][\"sentences\"]:\n", - " \n", + "\n", " print(\"{}\\t{}\\t{}\\t{}\".format(\n", " anno.metadata[\"sentence\"], anno.begin, anno.end, anno.result.replace('\\n',''))) # removing \\n to beutify printing\n" ] @@ -297,36 +298,23 @@ "4\tCourtier, of Los Angeles, disappeared after a private shuttle dropped her off on Oct. 6. at the Grotto park area inside the 232-square-mile national park.\n", "5\tShe was scheduled to be picked up later that afternoon but didn't show up, park officials said.\n", "6\tThe search included K.9. units and federal, state and local rescue teams;\n", - "7\tVolunteers also joined the effort.\n", - "\n", - "with Spacy Sentence Detection\n", - "===================================\n", - "0 \t A California woman who vanished in Utah’s Zion National Park earlier,this month was found and reunited with her family officials said Sunday.\n", - "1 \t Holly Suzanne Courtier, 38, was located within the park after a visitor saw her and alerted rangers, the National.\n", - "2 \t Park Service said in a statement.\n", - "3 \t Additional details about how she survived or where she was found were not immediately available.\n", - "4 \t In the statement, Courtier’s relatives said they were “overjoyed” that she’d been found.\n", - "5 \t Courtier, of Los Angeles, disappeared after a private shuttle dropped her off on Oct. 
6.\n", - "6 \t at the Grotto park area inside the 232-square-mile national park.\n", - "7 \t She was scheduled to be picked up later that afternoon but didn't show up, park officials said.\n", - "8 \t The search included K.9.\n", - "9 \t units and federal, state and local rescue teams; Volunteers also joined the effort.\n" + "7\tVolunteers also joined the effort.\n" ] } ], "source": [ "random_broken_text = '''\n", "A California woman who vanished in Utah’s Zion National Park earlier,\n", - "this month was found and reunited with her family \n", - "officials said Sunday. Holly Suzanne Courtier, \n", - "38, was located within the park after a visitor saw \n", + "this month was found and reunited with her family\n", + "officials said Sunday. Holly Suzanne Courtier,\n", + "38, was located within the park after a visitor saw\n", "her and alerted rangers, the National. Park Service said in a statement.\n", - "Additional details about how she \n", - "survived or where she was found were not immediately available. In the statement, \n", + "Additional details about how she\n", + "survived or where she was found were not immediately available. In the statement,\n", "Courtier’s relatives said they were “overjoyed” that she’d been found.\n", - "Courtier, of Los Angeles, disappeared after a private shuttle dropped her off on Oct. 6. at the Grotto park area \n", - "inside the 232-square-mile national park. She was scheduled to be picked up later that \n", - "afternoon but didn't show up, park officials said. The search included K.9. units and federal, \n", + "Courtier, of Los Angeles, disappeared after a private shuttle dropped her off on Oct. 6. at the Grotto park area\n", + "inside the 232-square-mile national park. She was scheduled to be picked up later that\n", + "afternoon but didn't show up, park officials said. The search included K.9. 
units and federal,\n", "state and local rescue teams; Volunteers also joined the effort.\n", "'''\n", "\n", @@ -382,19 +370,11 @@ "text": [ "with Spark NLP SentenceDetectorDL\n", "===================================\n", - "0\tΌπως ίσως θα γνωρίζει, όταν εγκαθιστάς μια νέα εφαρμογή, θα έχεις διαπιστώσει λίγο μετά, ότι το PC αρχίζει να επιβραδύνεται.\n", - "1\tΣτη συνέχεια, όταν επισκέπτεσαι την οθόνη ή από την διαχείριση εργασιών, θα διαπιστώσεις ότι η εν λόγω εφαρμογή έχει προστεθεί στη λίστα των προγραμμάτων που εκκινούν αυτόματα, όταν ξεκινάς το PC.\n", - "2\tΠροφανώς, κάτι τέτοιο δεν αποτελεί μια ιδανική κατάσταση, ιδίως για τους λιγότερο γνώστες, οι οποίοι ίσως δεν θα συνειδητοποιήσουν ότι κάτι τέτοιο συνέβη.\n", - "3\tΌσο περισσότερες εφαρμογές στη λίστα αυτή, τόσο πιο αργή γίνεται η εκκίνηση, ιδίως αν πρόκειται για απαιτητικές εφαρμογές.\n", - "4\tΤα ευχάριστα νέα είναι ότι η τελευταία και πιο πρόσφατη preview build της έκδοσης των Windows 10 που θα καταφθάσει στο πρώτο μισό του 2021, οι εφαρμογές θα ενημερώνουν το χρήστη ότι έχουν προστεθεί στη λίστα των εφαρμογών που εκκινούν μόλις ανοίγεις το PC.\n", - "\n", - "with Spacy Sentence Detection\n", - "===================================\n", - "0 \t Όπως ίσως θα γνωρίζει, όταν εγκαθιστάς μια νέα εφαρμογή, θα έχεις διαπιστώσει λίγο μετά, ότι το PC αρχίζει να επιβραδύνεται.\n", - "1 \t Στη συνέχεια, όταν επισκέπτεσαι την οθόνη ή από την διαχείριση εργασιών, θα διαπιστώσεις ότι η εν λόγω εφαρμογή έχει προστεθεί στη λίστα των προγραμμάτων που εκκινούν αυτόματα, όταν ξεκινάς το PC.Προφανώς, κάτι τέτοιο δεν αποτελεί μια ιδανική κατάσταση, ιδίως για τους λιγότερο γνώστες, οι οποίοι ίσως δεν θα συνειδητοποιήσουν ότι κάτι τέτοιο συνέβη.\n", - "2 \t Όσο περισσότερες εφαρμογές στη λίστα αυτή, τόσο πιο αργή γίνεται η εκκίνηση, ιδίως αν πρόκειται για απαιτητικές εφαρμογές.\n", - "3 \t Τα ευχάριστα νέα είναι ότι η τελευταία και πιο πρόσφατη preview build της έκδοσης των Windows 10 που θα καταφθάσει\n", - "4 \t στο πρώτο μισό 
του 2021, οι εφαρμογές θα ενημερώνουν το χρήστη ότι έχουν προστεθεί στη λίστα των εφαρμογών που εκκινούν μόλις ανοίγεις το PC.\n" + "0\tΌπως ίσως θα γνωρίζει, όταν εγκαθιστάς μια νέα εφαρμογή, θα έχεις διαπιστώσειλίγο μετά, ότι το PC αρχίζει να επιβραδύνεται.\n", + "1\tΣτη συνέχεια, όταν επισκέπτεσαι την οθόνη ή από την διαχείριση εργασιών, θα διαπιστώσεις ότι η εν λόγω εφαρμογή έχει προστεθεί στηλίστα των προγραμμάτων που εκκινούν αυτόματα, όταν ξεκινάς το PC.\n", + "2\tΠροφανώς, κάτι τέτοιο δεν αποτελεί μια ιδανική κατάσταση, ιδίως για τους λιγότερο γνώστες, οιοποίοι ίσως δεν θα συνειδητοποιήσουν ότι κάτι τέτοιο συνέβη.\n", + "3\tΌσο περισσότερες εφαρμογές στη λίστα αυτή, τόσο πιο αργή γίνεται ηεκκίνηση, ιδίως αν πρόκειται για απαιτητικές εφαρμογές.\n", + "4\tΤα ευχάριστα νέα είναι ότι η τελευταία και πιο πρόσφατη preview build της έκδοσης των Windows 10 που θα καταφθάσει στο πρώτο μισό του 2021, οι εφαρμογές θαενημερώνουν το χρήστη ότι έχουν προστεθεί στη λίστα των εφαρμογών που εκκινούν μόλις ανοίγεις το PC.\n" ] } ], @@ -428,43 +408,27 @@ "text": [ "with Spark NLP SentenceDetectorDL\n", "===================================\n", - "0\tB чeтвъpтъĸ Gооglе oбяви няĸoлĸo aĸтyaлизaции нa cвoятa тъpcaчĸa, зaявявaйĸи чe e въвeлa изĸycтвeн интeлeĸт (Аl) и мaшиннo oбyчeниe зa пoдoбpявaнe нa пoтpeбитeлcĸoтo изживявaнe.\n", - "1\tΠoтpeбитeлитe вeчe мoгaт дa cи тaнaниĸaт, cвиpят или пeят мeлoдия нa пeceн нa Gооglе чpeз мoбилнoтo пpилoжeниe, ĸaтo дoĸocнaт иĸoнaтa нa миĸpoфoнa и зaдaдaт въпpoca: Koя e тaзи пeceн?\n", + "0\tB чeтвъpтъĸ Gооglе oбяви няĸoлĸo aĸтyaлизaции нa cвoятa тъpcaчĸa, зaявявaйĸи чe eвъвeлa изĸycтвeн интeлeĸт (Аl) и мaшиннo oбyчeниe зa пoдoбpявaнe нa пoтpeбитeлcĸoтo изживявaнe.\n", + "1\tΠoтpeбитeлитe вeчe мoгaт дa cи тaнaниĸaт, cвиpят или пeят мeлoдия нa пeceн нa Gооglе чpeз мoбилнoтo пpилoжeниe,ĸaтo дoĸocнaт иĸoнaтa нa миĸpoфoнa и зaдaдaт въпpoca: Koя e тaзи пeceн?\n", "2\tTaнaниĸaнeтo в пpoдължeниe нa 10-15 ceĸyнди щe дaдe шaнc нa aлгopитъмa c мaшиннo 
oбyчeниe нa Gооglе дa нaмepи и извeдe peзyлтaт ĸoя e пpипявaнaтa пeceн.\n", - "3\tΠoнacтoящeм фyнĸциятa e дocтъпнa нa aнглийcĸи eзиĸ зa Іоѕ и нa oĸoлo 20 eзиĸa зa Аndrоіd, ĸaтo в бъдeщe и зa двeтe oпepaциoнни cиcтeми щe бъдe пpeдлoжeн eднaĸъв нaбop oт пoддъpжaни eзици, ĸaзвaт oт Gооglе.\n", + "3\tΠoнacтoящeм фyнĸциятa e дocтъпнa нa aнглийcĸи eзиĸ зa Іоѕ и нa oĸoлo 20 eзиĸa зa Аndrоіd,ĸaтo в бъдeщe и зa двeтe oпepaциoнни cиcтeми щe бъдe пpeдлoжeн eднaĸъв нaбop oт пoддъpжaни eзици, ĸaзвaт oт Gооglе.\n", "4\tAl aĸтyaлизaциитe нa тъpceщия гигaнт cъщo oбxвaщaт пpaвoпиca и oбщитe зaявĸи зa тъpceнe.\n", - "5\tCpeд пoдoбpeниятa e вĸлючeн нoв пpaвoпиceн aлгopитъм, ĸoйтo изпoлзвa нeвpoннa мpeжa c дълбoĸo oбyчeниe, зa ĸoятo Gооglе твъpди, чe идвa cъc знaчитeлнo пoдoбpeнa cпocoбнocт зa дeшифpиpaнe нa пpaвoпиcни гpeшĸи.\n", - "\n", - "with Spacy Sentence Detection\n", - "===================================\n", - "0 \t B чeтвъpтъĸ Gооglе oбяви няĸoлĸo aĸтyaлизaции нa cвoятa тъpcaчĸa, зaявявaйĸи чe e въвeлa изĸycтвeн интeлeĸт\n", - "1 \t (Аl) и мaшиннo oбyчeниe зa пoдoбpявaнe нa пoтpeбитeлcĸoтo изживявaнe.\n", - "2 \t Πoтpeбитeлитe вeчe мoгaт дa cи тaнaниĸaт, cвиpят или пeят мeлoдия нa пeceн нa Gооglе чpeз мoбилнoтo пpилoжeниe, ĸaтo дoĸocнaт иĸoнaтa нa миĸpoфoнa и зaдaдaт въпpoca:\n", - "3 \t Koя e тaзи пeceн?Taнaниĸaнeтo в пpoдължeниe нa 10-15 ceĸyнди щe дaдe шaнc нa aлгopитъмa c мaшиннo oбyчeниe\n", - "4 \t нa\n", - "5 \t Gооglе дa нaмepи и извeдe peзyлтaт ĸoя e пpипявaнaтa пeceн.\n", - "6 \t Πoнacтoящeм фyнĸциятa e дocтъпнa нa aнглийcĸи eзиĸ\n", - "7 \t зa\n", - "8 \t Іоѕ и нa oĸoлo 20 eзиĸa зa Аndrоіd, ĸaтo в бъдeщe и зa двeтe oпepaциoнни cиcтeми щe бъдe пpeдлoжeн eднaĸъв нaбop oт пoддъpжaни eзици, ĸaзвaт oт Gооglе.\n", - "9 \t Al aĸтyaлизaциитe нa тъpceщия гигaнт cъщo oбxвaщaт пpaвoпиca и oбщитe зaявĸи зa тъpceнe.\n", - "10 \t Cpeд пoдoбpeниятa e вĸлючeн нoв пpaвoпиceн aлгopитъм, ĸoйтo изпoлзвa нeвpoннa мpeжa c дълбoĸo oбyчeниe, зa ĸoятo Gооglе твъpди, чe идвa cъc знaчитeлнo 
пoдoбpeнa cпocoбнocт\n", - "11 \t зa дeшифpиpaнe\n", - "12 \t нa пpaвoпиcни гpeшĸи.\n" + "5\tCpeд пoдoбpeниятa e вĸлючeн нoв пpaвoпиceн aлгopитъм, ĸoйтo изпoлзвa нeвpoннa мpeжac дълбoĸo oбyчeниe, зa ĸoятo Gооglе твъpди, чe идвa cъc знaчитeлнo пoдoбpeнa cпocoбнocт зaдeшифpиpaнe нa пpaвoпиcни гpeшĸи.\n" ] } ], "source": [ "cyrillic_text = '''\n", - "B чeтвъpтъĸ Gооglе oбяви няĸoлĸo aĸтyaлизaции нa cвoятa тъpcaчĸa, зaявявaйĸи чe e \n", + "B чeтвъpтъĸ Gооglе oбяви няĸoлĸo aĸтyaлизaции нa cвoятa тъpcaчĸa, зaявявaйĸи чe e\n", "въвeлa изĸycтвeн интeлeĸт (Аl) и мaшиннo oбyчeниe зa пoдoбpявaнe нa пoтpeбитeлcĸoтo изживявaнe.\n", - "Πoтpeбитeлитe вeчe мoгaт дa cи тaнaниĸaт, cвиpят или пeят мeлoдия нa пeceн нa Gооglе чpeз мoбилнoтo пpилoжeниe, \n", + "Πoтpeбитeлитe вeчe мoгaт дa cи тaнaниĸaт, cвиpят или пeят мeлoдия нa пeceн нa Gооglе чpeз мoбилнoтo пpилoжeниe,\n", "ĸaтo дoĸocнaт иĸoнaтa нa миĸpoфoнa и зaдaдaт въпpoca: Koя e тaзи пeceн?\n", "Taнaниĸaнeтo в пpoдължeниe нa 10-15 ceĸyнди щe дaдe шaнc нa aлгopитъмa c мaшиннo oбyчeниe нa Gооglе дa нaмepи и извeдe peзyлтaт ĸoя e пpипявaнaтa пeceн.\n", - "Πoнacтoящeм фyнĸциятa e дocтъпнa нa aнглийcĸи eзиĸ зa Іоѕ и нa oĸoлo 20 eзиĸa зa Аndrоіd, \n", + "Πoнacтoящeм фyнĸциятa e дocтъпнa нa aнглийcĸи eзиĸ зa Іоѕ и нa oĸoлo 20 eзиĸa зa Аndrоіd,\n", "ĸaтo в бъдeщe и зa двeтe oпepaциoнни cиcтeми щe бъдe пpeдлoжeн eднaĸъв нaбop oт пoддъpжaни eзици, ĸaзвaт oт Gооglе.\n", "Al aĸтyaлизaциитe нa тъpceщия гигaнт cъщo oбxвaщaт пpaвoпиca и oбщитe зaявĸи зa тъpceнe.\n", - "Cpeд пoдoбpeниятa e вĸлючeн нoв пpaвoпиceн aлгopитъм, ĸoйтo изпoлзвa нeвpoннa мpeжa \n", - "c дълбoĸo oбyчeниe, зa ĸoятo Gооglе твъpди, чe идвa cъc знaчитeлнo пoдoбpeнa cпocoбнocт зa \n", + "Cpeд пoдoбpeниятa e вĸлючeн нoв пpaвoпиceн aлгopитъм, ĸoйтo изпoлзвa нeвpoннa мpeжa\n", + "c дълбoĸo oбyчeниe, зa ĸoятo Gооglе твъpди, чe идвa cъc знaчитeлнo пoдoбpeнa cпocoбнocт зa\n", "дeшифpиpaнe нa пpaвoпиcни гpeшĸи.\n", "'''\n", "\n", @@ -472,7 +436,7 @@ "print 
('===================================')\n", "\n", "for anno in sd_model_multi.fullAnnotate(cyrillic_text)[0][\"sentences\"]:\n", - " \n", + "\n", " print(\"{}\\t{}\".format(\n", " anno.metadata[\"sentence\"], anno.result.replace('\\n',''))) # removing \\n to beutify printing" ] @@ -485,7 +449,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "nlpdev", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -499,11 +463,6 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" - }, - "vscode": { - "interpreter": { - "hash": "cf73c0c97d90b2660ff29b0c9bed4b851524d3484a00df4555e25832aa5cf188" - } } }, "nbformat": 4, diff --git a/examples/python/annotation/text/multilingual/Translation_Marian.ipynb b/examples/python/annotation/text/multilingual/Translation_Marian.ipynb index b67309de6ff684..58283d707df192 100644 --- a/examples/python/annotation/text/multilingual/Translation_Marian.ipynb +++ b/examples/python/annotation/text/multilingual/Translation_Marian.ipynb @@ -9,37 +9,26 @@ "\n", "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/JohnSnowLabs/spark-nlp/blob/master/example/python/annotation/text/multilingual/Translation_Marian.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/multilingual/Translation_Marian.ipynb)\n", "\n", - "\n" + "# Translate text with the Marian Transformer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# **Translate text**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Spark NLP documentation and instructions:\n", - "https://nlp.johnsnowlabs.com/docs/en/quickstart\n", - "\n", - "### You can find details about Spark NLP annotators here:\n", - 
"https://nlp.johnsnowlabs.com/docs/en/annotators\n", - "\n", - "### You can find details about Spark NLP models here:\n", - "https://nlp.johnsnowlabs.com/models\n" + "## 1. Colab Setup" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "## 1. Colab Setup" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { @@ -49,7 +38,7 @@ "outputs": [], "source": [ "# Install PySpark and Spark NLP\n", - "! pip install -q pyspark==3.1.2 spark-nlp\n", + "! pip install -q pyspark==3.3.1 spark-nlp\n", "\n", "# Install Spark NLP Display lib\n", "! pip install --upgrade -q spark-nlp-display" @@ -121,7 +110,7 @@ "metadata": {}, "outputs": [], "source": [ - "text = \"\"\"La Gioconda è un dipinto ad olio del XVI secolo creato da Leonardo. Si tiene al Louvre di Parigi.\"\"\"\n" + "text = \"\"\"La Gioconda è un dipinto ad olio del XVI secolo creato da Leonardo. 
Si tiene al Louvre di Parigi.\"\"\"" ] }, { @@ -165,7 +154,7 @@ ".setOutputCol(\"translation\")\n", "\n", "nlp_pipeline = Pipeline(stages=[\n", - " documentAssembler, \n", + " documentAssembler,\n", " sentencerDL, marian\n", "])" ] @@ -186,7 +175,7 @@ "empty_df = spark.createDataFrame([['']]).toDF('text')\n", "pipeline_model = nlp_pipeline.fit(empty_df)\n", "lmodel = LightPipeline(pipeline_model)\n", - "res = lmodel.fullAnnotate(text)\n" + "res = lmodel.fullAnnotate(text)" ] }, { @@ -232,7 +221,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "nlpdev", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -246,11 +235,6 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" - }, - "vscode": { - "interpreter": { - "hash": "cf73c0c97d90b2660ff29b0c9bed4b851524d3484a00df4555e25832aa5cf188" - } } }, "nbformat": 4, diff --git a/examples/python/annotation/text/multilingual/WordSegmenterMultilingual.ipynb b/examples/python/annotation/text/multilingual/WordSegmenterMultilingual.ipynb index f6d8ad131101da..73c1e9bc963568 100644 --- a/examples/python/annotation/text/multilingual/WordSegmenterMultilingual.ipynb +++ b/examples/python/annotation/text/multilingual/WordSegmenterMultilingual.ipynb @@ -1,67 +1,31 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "fUB_oKDe7tv-", - "outputId": "74d0dd7c-e235-4a34-f4de-ef0f70ae60ee" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/multilingual/WordSegmenterMultilingual.ipynb)" + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/multilingual/WordSegmenterMultilingual.ipynb)\n", + "\n", + "# Multilingual Word Segmentation" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "I9zHmXu37_AU", - "outputId": "64655014-7c3c-42bb-bdbf-353d711581ec", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 12:24:06-- https://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 12:24:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 12:24:07 (36.0 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 42 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 66.8 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 71.2 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "V7j7Io_n8Anv" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", @@ -74,10 +38,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "YdDln5dO8CNX" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "spark = sparknlp.start()" @@ -85,18 +47,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "q8YrqB8XS5Z8" - }, + "metadata": {}, "source": [ "## Multilingual Inference" ] }, { "cell_type": "markdown", - "metadata": { - "id": "EK-KvdocS9Bx" - }, + "metadata": {}, "source": [ "When dealing with multilingual text, we have two options in WordSegmenter:\n", "1. Use `setEnableRegexTokenizer` parameter. 
This is useful for current pretrained models.\n", @@ -105,9 +63,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "h_N6T1WtpH3d" - }, + "metadata": {}, "source": [ "Setting `setEnableRegexTokenizer=True` parameter will make WordSegmenter to tokenize latin words based on spaces and apply word segmenter inference **only in non-latin words**. As show in the example below.\n", "\n", @@ -116,9 +72,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "rO4nw34a04Vq" - }, + "metadata": {}, "source": [ "This example has a text with Thai and English words. So, we use a WordSegmenter model of Thai language. You can check additional WordSegmenter models in our [official model's page](https://nlp.johnsnowlabs.com/models?q=Word+Segmenter).\n", "\n", @@ -128,10 +82,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "40VRukPNBd92" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "multilingual_text = \"สำหรับฐานลำโพง apple homepod อุปกรณ์เครื่องเสียงยึดขาตั้งไม้แข็งตั้งพื้น speaker stands null\"\n", @@ -140,18 +92,12 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qM36eUWBA7ND", - "outputId": "08e96c37-b7a0-4991-dd90-d03d23aa9b0e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "wordseg_best download started this may take some time.\n", "Approximate size to download 79.2 KB\n", @@ -173,18 +119,12 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_CpzTFTABugA", - "outputId": "ef1062f5-cb51-4a0f-f884-e9e1c44ad8a6" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+\n", "| text| document| 
token|\n", @@ -201,18 +141,12 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "97AO5D57Sz7j", - "outputId": "f92401d2-1e24-4517-a5fd-9165c5df23c5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|token |\n", @@ -228,28 +162,23 @@ ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "Bv4MabwbtCfR" - }, + "metadata": {}, "source": [ - "##Training a Multilingual Model" + "## Training a Multilingual Model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "gjSA2lqi1hSQ" - }, + "metadata": {}, "source": [ "We can also train our own multilingual model, which will require to build a training file with the required format, as in this example to label each character for English and Thai alike." 
] }, { "cell_type": "markdown", - "metadata": { - "id": "EYYxvyKa10Oe" - }, + "metadata": {}, "source": [ "The tags legend for the training dataset is the following:\n", "- LL: Left Boundary of a word\n", @@ -260,10 +189,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "DRbfBpBfr2-_" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "thai_word1 = \"สำ|LL ห|MM รั|MM บ|RR ฐ|LL า|MM น|RR ลำ|LL โ|MM พ|MM ง|RR \"\n", @@ -282,18 +209,12 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "sjbtYRUCs9_t", - "outputId": "8264b70c-575c-40e1-ae8c-579f17aefebc" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "สำ|LL ห|MM รั|MM บ|RR ฐ|LL า|MM น|RR ลำ|LL โ|MM พ|MM ง|RR a|LL p|MM p|MM l|MM e|RR h|LL o|MM m|MM e|MM p|MM o|MM d|RR อุ|LL ป|MM ก|MM ร|MM ณ์|RR เ|LL ค|MM รื่|MM อ|MM ง|RR เ|LL สี|MM ย|MM ง|RR ยึ|LL ด|RR ข|LLา|RR ตั้|LL ง|RR พื้|LL น|RR s|LL p|MM e|MM a|MM k|MM e|MM r|RR s|LL t|MM a|MM n|MM d|MM s|RR n|LL u|MM l|MM l|RR\n", "สำ|LL ห|MM รั|MM บ|RR ฐ|LL า|MM น|RR ลำ|LL โ|MM พ|MM ง|RR a|LL p|MM p|MM l|MM e|RR h|LL o|MM m|MM e|MM p|MM o|MM d|RR อุ|LL ป|MM ก|MM ร|MM ณ์|RR เ|LL ค|MM รื่|MM อ|MM ง|RR เ|LL สี|MM ย|MM ง|RR ยึ|LL ด|RR ข|LLา|RR ตั้|LL ง|RR พื้|LL น|RR s|LL p|MM e|MM a|MM k|MM e|MM r|RR s|LL t|MM a|MM n|MM d|MM s|RR n|LL u|MM l|MM l|RR\n", @@ -309,18 +230,12 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "s9QNHM_uxIVL", - "outputId": "138c71bc-eff8-4085-d95c-d63a4d540858" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+\n", "| text| document| tags|\n", @@ -344,18 +259,12 @@ }, { 
"cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "CEPIUB17tq4u", - "outputId": "d718ddf4-d863-46e8-d91e-168594dc0d10" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+\n", "| text| document| token|\n", @@ -386,18 +295,12 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qoKPY_qRt8fv", - "outputId": "811dbd1e-f950-479b-8414-a4d8f70bcf8e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|token |\n", @@ -432,8 +335,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/portuguese/MultiDateMatcherMultiLanguage_pt.ipynb 
b/examples/python/annotation/text/portuguese/MultiDateMatcherMultiLanguage_pt.ipynb index 8c0ed76d608ff9..e461f53e7c287c 100644 --- a/examples/python/annotation/text/portuguese/MultiDateMatcherMultiLanguage_pt.ipynb +++ b/examples/python/annotation/text/portuguese/MultiDateMatcherMultiLanguage_pt.ipynb @@ -1,161 +1,74 @@ { "cells": [ { - "cell_type": "code", - "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", - "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ], - "metadata": { - "id": "dFa6j1Lq8bKJ", - "outputId": "69041a40-6d43-423e-9fd3-9707cade8be6", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "id": "dFa6j1Lq8bKJ", - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 12:25:31-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 12:25:31-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 12:25:32-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 12:25:32 (31.1 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 53 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 50.5 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 55.6 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d1a9947b", - "metadata": { - "id": "d1a9947b" - }, - "outputs": [], - "source": [ - "from pyspark import *\n", - "import sparknlp" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "1d695f9d", - "metadata": { - "id": "1d695f9d" - }, - "outputs": [], + "attachments": {}, + "cell_type": "markdown", + "id": "cfd62825", + "metadata": {}, "source": [ - "spark= sparknlp.start()" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/portuguese/MultiDateMatcherMultiLanguage_pt.ipynb)\n", + "\n", + "# MultiDateMatcher in Portuguese" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "6edb5c48", - "metadata": { - "id": "6edb5c48" - }, + "execution_count": null, + "id": "dFa6j1Lq8bKJ", + "metadata": {}, "outputs": [], "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "b072abfa", - "metadata": { - "id": 
"b072abfa", - "outputId": "a1efcdfc-b76a-4fbf-e4c9-06a3877e41be", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 36 - } - }, + "execution_count": null, + "id": "84dc2c25", + "metadata": {}, "outputs": [ { - "output_type": "execute_result", - "data": { - "text/plain": [ - "'4.2.6'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } - }, - "metadata": {}, - "execution_count": 15 + "name": "stdout", + "output_type": "stream", + "text": [ + "4.3.1\n", + "3.3.0\n" + ] } ], "source": [ - "sparknlp.version()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "84dc2c25", - "metadata": { - "id": "84dc2c25" - }, - "outputs": [], - "source": [ - "from pyspark.sql.types import StringType" + "from pyspark import *\n", + "from pyspark.sql.types import StringType\n", + "\n", + "import sparknlp\n", + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "spark = sparknlp.start()\n", + "print(sparknlp.version())\n", + "print(spark.version)" ] }, { "cell_type": "markdown", "id": "e547708d", - "metadata": { - "id": "e547708d" - }, + "metadata": {}, "source": [ "## Portuguese formatted dates matching examples" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "88ca9039", - "metadata": { - "id": "88ca9039", - "outputId": "2e53d440-3b60-4b4b-dac1-a22bbd5c9657", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -175,19 +88,13 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "0efc1f05", - "metadata": { - "id": "0efc1f05", - "outputId": "2f3ba70c-c129-455b-d5dc-d7403fdac4dd", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ 
"+--------------------------------------------------------------------------------------------------+\n", "|date |\n", @@ -216,28 +123,20 @@ { "cell_type": "markdown", "id": "1f7e34f8", - "metadata": { - "id": "1f7e34f8" - }, + "metadata": {}, "source": [ "## Portuguese unformatted dates matching examples" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "5258a479", - "metadata": { - "id": "5258a479", - "outputId": "8c8f59c3-6b7c-4e05-e5b4-eaf125546e0d", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -257,24 +156,18 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "ab6f1eb6", - "metadata": { - "id": "ab6f1eb6", - "outputId": "069ae819-9434-4b0b-d2e2-561c507fdee5", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------+\n", "|date |\n", "+--------------------------------------------------------------------------------------------------+\n", - "|[{date, 19, 28, 12/18/2022, {sentence -> 0}, []}, {date, 66, 74, 12/30/2022, {sentence -> 0}, []}]|\n", + "|[{date, 19, 28, 02/15/2023, {sentence -> 0}, []}, {date, 66, 74, 02/27/2023, {sentence -> 0}, []}]|\n", "+--------------------------------------------------------------------------------------------------+\n", "\n" ] @@ -298,9 +191,7 @@ { "cell_type": "markdown", "id": "e4e2bb5a", - "metadata": { - "id": "e4e2bb5a" - }, + "metadata": {}, "source": [ "# A short guide to language support extension\n", "\n", @@ -315,19 +206,12 @@ "\n", "Thank you for contributing! 
:)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ce426720", - "metadata": { - "id": "ce426720" - }, - "outputs": [], - "source": [] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -342,11 +226,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/portuguese/date_matcher_multi_language_pt.ipynb b/examples/python/annotation/text/portuguese/date_matcher_multi_language_pt.ipynb index 35e0988df290e1..f4b4aa4603e7de 100644 --- a/examples/python/annotation/text/portuguese/date_matcher_multi_language_pt.ipynb +++ b/examples/python/annotation/text/portuguese/date_matcher_multi_language_pt.ipynb @@ -1,77 +1,32 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "bIiSs2Om8Jrk" - }, + "metadata": {}, "source": [ - "# DateMatcher multi-language\n", + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/portuguese/date_matcher_multi_language_pt.ipynb)\n", "\n", - "#### This annotator allows you to specify a source language that will be used to identify temporal keywords and extract dates." + "# DateMatcher multi-language (Portuguese)\n", + "This annotator allows you to specify a source language that will be used to identify temporal keywords and extract dates." 
] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ], - "metadata": { - "id": "YSzwy5W28LXm", - "outputId": "8b87f368-53f1-43c6-f659-c68ee9b9f018", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 12:24:19-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 12:24:19-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 12:24:20-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 0%[ ] 0 --.-KB/s Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 12:24:20 (72.9 MB/s) - written to stdout [1191/1191]\n", - "\n", - "\u001B[K |████████████████████████████████| 281.5 MB 54 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 69.3 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 56.3 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "943a272c-0686-4e02-a8d9-b2849721c829", - "showTitle": false, - "title": "" - }, - "id": "QlPXdwXc8Jrm" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Import Spark NLP\n", @@ -89,28 +44,11 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "b200e2aa-6280-4f51-9eb4-e30f660e2ba4", - "showTitle": false, - "title": "" - }, - "id": "RsznIqnN8Jrn", - "outputId": "cf3213e5-0904-4b51-f0f1-b5f43fd27a60", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 219 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "" - ], "text/html": [ "\n", "
\n", @@ -119,11 +57,11 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", - "
v3.2.3
\n", + "
v3.3.0
\n", "
Master
\n", "
local[*]
\n", "
AppName
\n", @@ -133,10 +71,14 @@ " \n", "
\n", " " + ], + "text/plain": [ + "" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 3 + "output_type": "execute_result" } ], "source": [ @@ -145,34 +87,18 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c0b759a0-346f-4d9f-9f01-383124c0aa05", - "showTitle": false, - "title": "" - }, - "id": "eLENgNbl8Jrn", - "outputId": "273b8109-dce7-4f9c-ef64-89b50ba7c14b", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ - "'4.2.6'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } + "'4.3.1'" + ] }, + "execution_count": null, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], "source": [ @@ -181,34 +107,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "yrgxWJFt8Jro" - }, + "metadata": {}, "source": [ "# Portuguese examples" ] }, { "cell_type": "markdown", - "metadata": { - "id": "17lieZtA8Jro" - }, + "metadata": {}, "source": [ "### Let's import some articoles sentences from the news where relative dates are present." 
] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "a91c2626-5ef8-4e01-9563-120daf4f63f3", - "showTitle": false, - "title": "" - }, - "id": "NLIQTudd8Jro" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pt_articles = [\n", @@ -219,33 +133,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "d834GHso8Jro" - }, + "metadata": {}, "source": [ "### Let's fill a DataFrame with the text column" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "cfe3f9e0-4a96-44bb-b056-0b4a5407c6dc", - "showTitle": false, - "title": "" - }, - "id": "z3eoDIea8Jrp", - "outputId": "e5ce7205-1c68-4514-9cbc-d685f35dca63", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "root\n", " |-- text: string (nullable = true)\n", @@ -271,25 +171,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "8F_wbjj_8Jrp" - }, + "metadata": {}, "source": [ "### Now, let's create a simple pipeline to apply the DateMatcher, specifying the source language" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "f4baf2a1-3e75-479e-9e9b-2b071624ee3d", - "showTitle": false, - "title": "" - }, - "id": "GH0tJQ7C8Jrp" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler() \\\n", @@ -305,10 +195,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "9fn58bDo8Jrp" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "### Let's transform the Data" @@ -316,30 +204,18 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - 
"application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "10380fbb-43c1-42c3-b6d0-f02e55d75a24", - "showTitle": false, - "title": "" - }, - "id": "YmK_4o6x8Jrp", - "outputId": "1eae12b2-cd75-49e2-8ed2-e3935e0b6b8e", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+-------------------------------------------------+\n", "|date |\n", "+-------------------------------------------------+\n", "|[{date, 20, 37, 07/11/2021, {sentence -> 0}, []}]|\n", - "|[{date, 48, 58, 12/23/2017, {sentence -> 0}, []}]|\n", + "|[{date, 48, 58, 02/20/2018, {sentence -> 0}, []}]|\n", "+-------------------------------------------------+\n", "\n" ] @@ -349,15 +225,6 @@ "assembled = document_assembler.transform(df)\n", "date_matcher.transform(assembled).select('date').show(10, False)" ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "aX_u0Bzb8zNp" - }, - "execution_count": null, - "outputs": [] } ], "metadata": { @@ -371,6 +238,9 @@ "notebookOrigID": 2439167545177012, "widgets": {} }, + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -385,11 +255,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/spanish/MultiDateMatcherMultiLanguage_es.ipynb b/examples/python/annotation/text/spanish/MultiDateMatcherMultiLanguage_es.ipynb index 87b31156b84428..fc713115b7d887 100644 --- a/examples/python/annotation/text/spanish/MultiDateMatcherMultiLanguage_es.ipynb +++ b/examples/python/annotation/text/spanish/MultiDateMatcherMultiLanguage_es.ipynb @@ -1,163 +1,74 @@ { "cells": [ { - "cell_type": "code", - "source": [ - "# This is only to setup 
PySpark and Spark NLP on Colab\n", - "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ], - "metadata": { - "id": "41c06Wm09q_m", - "outputId": "d3f003ee-640a-4c37-bdb4-cb3d19293fce", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 0 - } - }, - "id": "41c06Wm09q_m", - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 12:30:58-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 12:30:58-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 12:30:59-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 12:30:59 (36.8 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 53 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 57.7 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 53.8 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d1a9947b", - "metadata": { - "id": "d1a9947b" - }, - "outputs": [], - "source": [ - "from pyspark import *\n", - "import sparknlp" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1d695f9d", - "metadata": { - "id": "1d695f9d" - }, - "outputs": [], + "attachments": {}, + "cell_type": "markdown", + "id": "233932d4", + "metadata": {}, "source": [ - "spark = sparknlp.start()" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/spanish/MultiDateMatcherMultiLanguage_es.ipynb)\n", + "\n", + "# MultiDateMatcher in Spanish" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "6edb5c48", - "metadata": { - "id": "6edb5c48" - }, + "execution_count": null, + "id": "41c06Wm09q_m", + "metadata": {}, "outputs": [], "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "b072abfa", - "metadata": { - "id": "b072abfa", - 
"outputId": "477b7aed-0bcf-4d1b-a011-a378e4d82be2", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 36 - } - }, + "execution_count": null, + "id": "d1a9947b", + "metadata": {}, "outputs": [ { - "output_type": "execute_result", - "data": { - "text/plain": [ - "'4.2.6'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } - }, - "metadata": {}, - "execution_count": 5 + "name": "stdout", + "output_type": "stream", + "text": [ + "4.3.1\n", + "3.3.0\n" + ] } ], "source": [ - "sparknlp.version()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "84dc2c25", - "metadata": { - "id": "84dc2c25" - }, - "outputs": [], - "source": [ - "from pyspark.sql.types import StringType" + "from pyspark import *\n", + "from pyspark.sql.types import StringType\n", + "\n", + "import sparknlp\n", + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "spark = sparknlp.start()\n", + "print(sparknlp.version())\n", + "print(spark.version)" ] }, { "cell_type": "markdown", "id": "354ae84d", - "metadata": { - "id": "354ae84d" - }, + "metadata": {}, "source": [ "## Spanish formatted dates matching examples" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "7394467e", - "metadata": { - "id": "7394467e", - "outputId": "59f3265e-45fd-442a-84d2-05749c679924", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 0 - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -177,25 +88,18 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "4c5b7476", - "metadata": { - "id": "4c5b7476", - "outputId": "3fcbba72-c97c-4810-fc9d-91a0ad7ebfc0", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 0 - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", 
"text": [ "+--------------------------------------------------------------------------------------------------+\n", "|date |\n", "+--------------------------------------------------------------------------------------------------+\n", - "|[{date, 14, 23, 12/21/2022, {sentence -> 0}, []}, {date, 56, 64, 12/30/2022, {sentence -> 0}, []}]|\n", + "|[{date, 14, 23, 02/18/2023, {sentence -> 0}, []}, {date, 56, 64, 02/27/2023, {sentence -> 0}, []}]|\n", "+--------------------------------------------------------------------------------------------------+\n", "\n" ] @@ -219,29 +123,20 @@ { "cell_type": "markdown", "id": "2908aab1", - "metadata": { - "id": "2908aab1" - }, + "metadata": {}, "source": [ "## Spanish unformatted dates matching examples" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "7f714b84", - "metadata": { - "id": "7f714b84", - "outputId": "00da14be-a732-410e-ed92-63d5ad1a6843", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 0 - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -261,25 +156,18 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "2512db8a", - "metadata": { - "id": "2512db8a", - "outputId": "917a77e6-5acf-4a11-ee19-a023c4d27333", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 0 - } - }, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------+\n", "|date |\n", "+--------------------------------------------------------------------------------------------------+\n", - "|[{date, 10, 18, 12/22/2022, {sentence -> 0}, []}, {date, 47, 55, 01/06/2023, {sentence -> 0}, []}]|\n", + "|[{date, 10, 18, 02/19/2023, {sentence -> 0}, []}, {date, 47, 55, 03/06/2023, {sentence -> 
0}, []}]|\n", "+--------------------------------------------------------------------------------------------------+\n", "\n" ] @@ -303,9 +191,7 @@ { "cell_type": "markdown", "id": "71554a66", - "metadata": { - "id": "71554a66" - }, + "metadata": {}, "source": [ "# A short guide to language support extension\n", "\n", @@ -320,19 +206,12 @@ "\n", "Thank you for contributing! :)" ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "a4bcef16", - "metadata": { - "id": "a4bcef16" - }, - "outputs": [], - "source": [] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -347,11 +226,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/annotation/text/spanish/date_matcher_multi_language_es.ipynb b/examples/python/annotation/text/spanish/date_matcher_multi_language_es.ipynb index f3efbbabd8e354..225b003ec78ce4 100644 --- a/examples/python/annotation/text/spanish/date_matcher_multi_language_es.ipynb +++ b/examples/python/annotation/text/spanish/date_matcher_multi_language_es.ipynb @@ -1,77 +1,32 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "K_khQKne9mES" - }, + "metadata": {}, "source": [ - "# DateMatcher multi-language\n", + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/spanish/date_matcher_multi_language_es.ipynb)\n", "\n", - "#### This annotator allows you to specify a source language that will be used to identify temporal keywords and extract dates." 
+ "# DateMatcher multi-language (Spanish)\n", + "This annotator allows you to specify a source language that will be used to identify temporal keywords and extract dates." ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ], - "metadata": { - "id": "kna60wk39n9T", - "outputId": "229ac371-4d54-4e3b-cc99-ba5050a8e2c9", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 12:30:40-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 12:30:40-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 12:30:41-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 12:30:42 (69.5 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 48 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 53.1 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 54.2 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "943a272c-0686-4e02-a8d9-b2849721c829", - "showTitle": false, - "title": "" - }, - "id": "m08q0Jna9mEU" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Import Spark NLP\n", @@ -89,28 +44,11 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "b200e2aa-6280-4f51-9eb4-e30f660e2ba4", - "showTitle": false, - "title": "" - }, - "id": "-1mg-zCq9mEV", - "outputId": "8db02807-3285-4d83-cf36-711cc84b6b5b", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 222 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "" - ], "text/html": [ "\n", "
\n", @@ -119,11 +57,11 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", - "
v3.2.3
\n", + "
v3.3.0
\n", "
Master
\n", "
local[*]
\n", "
AppName
\n", @@ -133,10 +71,14 @@ " \n", "
\n", " " + ], + "text/plain": [ + "" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 3 + "output_type": "execute_result" } ], "source": [ @@ -145,34 +87,18 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c0b759a0-346f-4d9f-9f01-383124c0aa05", - "showTitle": false, - "title": "" - }, - "id": "KUj_aMEM9mEW", - "outputId": "67cd481c-130c-47f0-a467-c9dfbe5e0226", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 36 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ - "'4.2.6'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } + "'4.3.1'" + ] }, + "execution_count": null, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], "source": [ @@ -181,34 +107,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "O8Hq95pD9mEW" - }, + "metadata": {}, "source": [ "# Spanish examples" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Q4YkIoS29mEW" - }, + "metadata": {}, "source": [ "### Let's import some articoles sentences from the news where relative dates are present." 
] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "a91c2626-5ef8-4e01-9563-120daf4f63f3", - "showTitle": false, - "title": "" - }, - "id": "eA5kta1I9mEW" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "es_articles = [\n", @@ -219,33 +133,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "k38fqXYb9mEX" - }, + "metadata": {}, "source": [ "### Let's fill a DataFrame with the text column" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "cfe3f9e0-4a96-44bb-b056-0b4a5407c6dc", - "showTitle": false, - "title": "" - }, - "id": "pblFJojX9mEX", - "outputId": "7207b85d-06b7-4e19-c81a-384dcc91571e", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "root\n", " |-- text: string (nullable = true)\n", @@ -271,25 +171,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "9l9ABguG9mEX" - }, + "metadata": {}, "source": [ "### Now, let's create a simple pipeline to apply the DateMatcher, specifying the source language" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "f4baf2a1-3e75-479e-9e9b-2b071624ee3d", - "showTitle": false, - "title": "" - }, - "id": "jYsqbRuu9mEX" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler() \\\n", @@ -305,10 +195,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "6AxIBpR39mEY" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "### Let's transform the Data" @@ -316,30 +204,18 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - 
"application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "10380fbb-43c1-42c3-b6d0-f02e55d75a24", - "showTitle": false, - "title": "" - }, - "id": "a0Du4hMA9mEY", - "outputId": "b0749a83-fa83-4c34-d3a3-9b322dcbe6d7", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+-------------------------------------------------+\n", "|date |\n", "+-------------------------------------------------+\n", "|[{date, 19, 36, 07/11/2021, {sentence -> 0}, []}]|\n", - "|[{date, 45, 55, 12/23/2017, {sentence -> 0}, []}]|\n", + "|[{date, 45, 55, 02/20/2018, {sentence -> 0}, []}]|\n", "+-------------------------------------------------+\n", "\n" ] @@ -349,15 +225,6 @@ "assembled = document_assembler.transform(df)\n", "date_matcher.transform(assembled).select('date').show(10, False)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1S_ITHPr9mEY" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -371,6 +238,9 @@ "notebookOrigID": 2439167545177012, "widgets": {} }, + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -385,11 +255,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/logging/Comet_SparkNLP_Integration.ipynb b/examples/python/logging/Comet_SparkNLP_Integration.ipynb index ae3a176de18ecf..6d70be663c9f4c 100644 --- a/examples/python/logging/Comet_SparkNLP_Integration.ipynb +++ b/examples/python/logging/Comet_SparkNLP_Integration.ipynb @@ -12,7 +12,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/example/python/logging/Comet_SparkNLP_Integration.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/logging/Comet_SparkNLP_Integration.ipynb)" ] }, { @@ -37,7 +37,7 @@ "metadata": {}, "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash\n", "\n", "# Install Spark NLP Display for visualization\n", @@ -767,17 +767,12 @@ "toc_visible": true }, "kernelspec": { - "display_name": "base", + "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python" - }, - "vscode": { - "interpreter": { - "hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf" - } } }, "nbformat": 4, diff --git a/examples/python/prediction/english/Load_Model_From_S3.ipynb b/examples/python/prediction/english/Load_Model_From_S3.ipynb deleted file mode 100644 index 0358f631e087e2..00000000000000 --- a/examples/python/prediction/english/Load_Model_From_S3.ipynb +++ /dev/null @@ -1,493 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/Load_Model_From_S3.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loading Pretrained Models from S3" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - 
"id": "-eUrx5szYw9u", - "outputId": "500e41f0-bcf3-49ff-df59-f1a7a398566c" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2022-09-08 14:43:43-- https://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-09-08 14:43:44-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 0%[ ] 0 --.-KB/s Installing PySpark 3.2.1 and Spark NLP 4.1.0\n", - "setup Colab for PySpark 3.2.1 and Spark NLP 4.1.0\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-09-08 14:43:45 (37.8 MB/s) - written to stdout [1191/1191]\n", - "\n", - "\u001B[K |████████████████████████████████| 281.4 MB 32 kB/s \n", - "\u001B[K |████████████████████████████████| 616 kB 33.3 MB/s \n", - "\u001B[K |████████████████████████████████| 198 kB 54.1 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n" - ] - } - ], - "source": [ - "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Defining S3 URI in cache_pretrained " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this notebook, we are going to see the steps required to use an external S3 URI as `cache_pretrained` folder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In Spark NLP you can configure the location to download the pre-trained models. Before Spark NLP 4.2.0, we can define a local file system, or a distributed file system (DBFS). Starting at 4.2.0, you can also set an S3 URI. To do this, we need to configure the spark session with the required settings for Spark NLP and Spark ML." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Spark NLP Settings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Spark NLP requires the following configuration:\n", - "1. `cache_folder`: Here you must define your S3 URI (using s3 or s3a prefix) that will store Spark NLP pre-trained models. This is defined in the config `spark.jsl.settings.pretrained.cache_folder`\n", - "2. S3 Region: We need the region to upload a file on your S3 bucket. This is defined in the config `spark.jsl.settings.aws.region`\n", - "3. Spark NLP JAR: Since some custom configurations are needed to use S3 URI in `cache_pretrained`. It is also required to include spark-nlp JAR either as a dependency for our application or during spark session creation. Since we are using a notebook, we will add these packages while creating a spark session in the following config:\n", - "\n", - "- `spark.jars.packages` for Maven coordinates or `spark.jar` for FAT JAR\n", - "4. 
We recommend also adding the parameters described in creating manually a spark session in requirements section on [Spark NLP documentation](https://github.com/JohnSnowLabs/spark-nlp#requirements)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Spark ML Settings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This configuration will depend on your S3 bucket and AWS configuration. In this notebook a connection through **Temporary Security Credentials** is showcased. **Please contact your administrator to choose the right setup, as well as, the required keys/tokens.**\n", - "\n", - "Spark ML requires the following configuration to load a model from S3 using *Temporary Security Credentials*:\n", - "\n", - "1. Authenticating with S3: This is needed to interact with external S3 buckets, and it will require an access key, a secret key, and a session token. Define the values in these configs:\n", - "\n", - "- `spark.hadoop.fs.s3a.access.key`\n", - "- `spark.hadoop.fs.s3a.secret.key`\n", - "- `spark.hadoop.fs.s3a.session.token`\n", - "2. Credential Provider: You need to define the Hadoop provider that will handle this connection. Since in this notebook, *Temporary Security Credentials* is used we need to use the provider `TemporaryAWSCredentialsProvider` from `hadoop-aws` package, and set it up in the config below:\n", - "\n", - "- `spark.hadoop.fs.s3a.aws.credentials.provider`\n", - "3. AWS packages: S3A depends upon two JARs, alongside `hadoop-common` and its dependencies, which are `hadoop-aws` and `aws-java-sdk` packages. So, you will need to either add these dependencies in your application or to your spark session. Since we are using a notebook, we will add these packages while creating the spark session in the following config:\n", - "\n", - "- `spark.jars.packages`\n", - "4. AWS File System: Defining S3AFileSystem it's also required for interacting S3 with AWS SDK. 
Define the value in this config:\n", - "\n", - "- `spark.hadoop.fs.s3a.impl`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, let's take a look at the spark session creation below to see how to define each of the configurations with its values for **Temporary Security Credentials**:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Enter your AWS Access Key:\")\n", - "MY_ACCESS_KEY = input()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Enter your AWS Secret Key:\")\n", - "MY_SECRET_KEY = input()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Enter your AWS Session Key:\")\n", - "MY_SESSION_KEY = input()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 219 - }, - "id": "XSCAf1NOe7rC", - "outputId": "12014be5-e174-42c1-ad37-9f97f64652aa" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "

SparkSession - in-memory

\n", - " \n", - "
\n", - "

SparkContext

\n", - "\n", - "

Spark UI

\n", - "\n", - "
\n", - "
Version
\n", - "
v3.2.1
\n", - "
Master
\n", - "
local[*]
\n", - "
AppName
\n", - "
SparkNLP
\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pyspark\n", - "from pyspark.sql import SparkSession\n", - "\n", - "spark = SparkSession.builder \\\n", - " .appName(\"SparkNLP\") \\\n", - " .master(\"local[*]\") \\\n", - " .config(\"spark.driver.memory\", \"12G\") \\\n", - " .config(\"spark.serializer\", \"org.apache.spark.serializer.KryoSerializer\") \\\n", - " .config(\"spark.kryoserializer.buffer.max\", \"2000M\") \\\n", - " .config(\"spark.driver.maxResultSize\", \"0\") \\\n", - " .config(\"spark.hadoop.fs.s3a.access.key\", MY_ACCESS_KEY) \\\n", - " .config(\"spark.hadoop.fs.s3a.secret.key\", MY_SECRET_KEY) \\\n", - " .config(\"spark.hadoop.fs.s3a.session.token\", MY_SESSION_KEY) \\\n", - " .config(\"spark.hadoop.fs.s3a.aws.credentials.provider\", \"org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider\") \\\n", - " .config(\"spark.hadoop.fs.s3a.impl\", \"org.apache.hadoop.fs.s3a.S3AFileSystem\") \\\n", - " .config(\"spark.jars.packages\", \"com.johnsnowlabs.nlp:spark-nlp_2.12:4.1.0,org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk:1.11.901\") \\\n", - " .config(\"spark.hadoop.fs.s3a.path.style.access\", \"true\") \\\n", - " .config(\"spark.jsl.settings.pretrained.cache_folder\", \"s3://my_bucket/my/models/\") \\\n", - " .config(\"spark.jsl.settings.aws.region\", \"us-east-1\") \\\n", - " .getOrCreate()\n", - "\n", - "spark" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Disclaimer: \n", - "- Interaction with S3 depends on Spark/Hadoop/AWS implementations, which is out of our scope. Keep in mind that the configuration requirements or formats could change in other releases. 
For addidional information and details, we recommend checking their up to date official documentation, like this one from [Hadoop-AWS Integration with AWS](https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html)\n", - "- It's important to stand out that `hadoop-aws` and `aws-java-sdk` package versions must be compatible. Otherwise, it won't work. The example of this notebook uses Hadoop 3.3.1. So, you must modify those versions based on your Hadoop version." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "FQ8jfnOR39DQ", - "outputId": "6800b159-2ada-4eb0-f8f2-06aaae482435" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hadoop version = 3.3.1\n" - ] - } - ], - "source": [ - "print(f\"Hadoop version = {spark.sparkContext._jvm.org.apache.hadoop.util.VersionInfo.getVersion()}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Starting at spark-nlp 4.3.0, if you have control over spark session creation. 
You can also use sparknlp.start() with params argument:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sparknlp\n", - "\n", - "params = {\n", - " \"spark.jsl.settings.pretrained.cache_folder\": \"s3://auxdata.johnsnowlabs.com/public/tmp/danilo/models/\",\n", - " \"spark.hadoop.fs.s3a.access.key\": AWS_ACCESS_KEY_ID,\n", - " \"spark.hadoop.fs.s3a.secret.key\": AWS_SECRET_ACCESS_KEY,\n", - " \"spark.hadoop.fs.s3a.session.token\": AWS_SESSION_TOKEN,\n", - " \"spark.jsl.settings.aws.region\": \"us-east-1\",\n", - " \"spark.hadoop.fs.s3a.aws.credentials.provider\": \"org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider\",\n", - " \"spark.hadoop.fs.s3a.impl\": \"org.apache.hadoop.fs.s3a.S3AFileSystem\",\n", - " \"spark.jars.packages\": \"org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk:1.11.901\",\n", - " \"spark.hadoop.fs.s3a.path.style.access\": \"true\"\n", - "}\n", - "\n", - "spark = sparknlp.start(params=params)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "oz4bRCvRnPWz" - }, - "outputs": [], - "source": [ - "import sparknlp\n", - "from sparknlp.base import *\n", - "from sparknlp.annotator import *\n", - "from pyspark.ml import Pipeline\n", - "from pyspark.ml import PipelineModel" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_qiC18IvnhIA", - "outputId": "2206db7e-2012-4041-b23e-96e04f59c89f" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sentence_detector_dl download started this may take some time.\n", - "Approximate size to download 354.6 KB\n", - "[OK!]\n" - ] - } - ], - "source": [ - "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", - "\n", - "sentence_detector = SentenceDetectorDLModel.pretrained() \\\n", - " .setInputCols([\"document\"]) \\\n", - " 
.setOutputCol(\"sentence\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "iCFm_eIwoA0P" - }, - "outputs": [], - "source": [ - "pipeline = Pipeline(stages=[\n", - " document_assembler,\n", - " sentence_detector\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "F_Vin105oH2W" - }, - "outputs": [], - "source": [ - "test_df = spark.createDataFrame([[\"This is a simple example. This is another sentence\"]]).toDF(\"text\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "7wPFZJadoD-N" - }, - "outputs": [], - "source": [ - "model = pipeline.fit(test_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "S-jN9LtwolmW", - "outputId": "0d676204-78b6-4460-fde3-0a0dfdcb8d5d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------------------------------------+--------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+\n", - "|text |document |sentence |\n", - "+--------------------------------------------------+--------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+\n", - "|This is a simple example. This is another sentence|[{document, 0, 49, This is a simple example. 
This is another sentence, {sentence -> 0}, []}]|[{document, 0, 24, This is a simple example., {sentence -> 0}, []}, {document, 25, 49, This is another sentence, {sentence -> 1}, []}]|\n", - "+--------------------------------------------------+--------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+\n", - "\n" - ] - } - ], - "source": [ - "model.transform(test_df).show(truncate=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "XvOWCR6EXrss", - "outputId": "96cda5f0-55e4-442d-a4d3-780201647331" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "explain_document_ml download started this may take some time.\n", - "Approx size to download 9.2 MB\n", - "[OK!]\n" - ] - } - ], - "source": [ - "from sparknlp.pretrained import PretrainedPipeline\n", - "\n", - "pipeline_model = PretrainedPipeline('explain_document_ml', lang = 'en')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tz1Y8DKRX4sS", - "outputId": "7bf91165-7912-4028-ad23-a229942572d5" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"+--------------------------------------------------+--------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|text |document |sentence |token |spell |lemmas |stems |pos |\n", - 
"+--------------------------------------------------+--------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|This is a simple example. This is another sentence|[{document, 0, 49, This is a simple example. 
This is another sentence, {sentence -> 0}, []}]|[{document, 0, 24, This is a simple example., {sentence -> 0}, []}, {document, 26, 49, This is another sentence, {sentence -> 1}, []}]|[{token, 0, 3, This, {sentence -> 0}, []}, {token, 5, 6, is, {sentence -> 0}, []}, {token, 8, 8, a, {sentence -> 0}, []}, {token, 10, 15, simple, {sentence -> 0}, []}, {token, 17, 23, example, {sentence -> 0}, []}, {token, 24, 24, ., {sentence -> 0}, []}, {token, 26, 29, This, {sentence -> 1}, []}, {token, 31, 32, is, {sentence -> 1}, []}, {token, 34, 40, another, {sentence -> 1}, []}, {token, 42, 49, sentence, {sentence -> 1}, []}]|[{token, 0, 3, This, {confidence -> 1.0, sentence -> 0}, []}, {token, 5, 6, is, {confidence -> 1.0, sentence -> 0}, []}, {token, 8, 8, a, {confidence -> 1.0, sentence -> 0}, []}, {token, 10, 15, simple, {confidence -> 1.0, sentence -> 0}, []}, {token, 17, 23, example, {confidence -> 1.0, sentence -> 0}, []}, {token, 24, 24, ., {confidence -> 0.0, sentence -> 0}, []}, {token, 26, 29, This, {confidence -> 1.0, sentence -> 1}, []}, {token, 31, 32, is, {confidence -> 1.0, sentence -> 1}, []}, {token, 34, 40, another, {confidence -> 1.0, sentence -> 1}, []}, {token, 42, 49, sentence, {confidence -> 1.0, sentence -> 1}, []}]|[{token, 0, 3, This, {confidence -> 1.0, sentence -> 0}, []}, {token, 5, 6, be, {confidence -> 1.0, sentence -> 0}, []}, {token, 8, 8, a, {confidence -> 1.0, sentence -> 0}, []}, {token, 10, 15, simple, {confidence -> 1.0, sentence -> 0}, []}, {token, 17, 23, example, {confidence -> 1.0, sentence -> 0}, []}, {token, 24, 24, ., {confidence -> 0.0, sentence -> 0}, []}, {token, 26, 29, This, {confidence -> 1.0, sentence -> 1}, []}, {token, 31, 32, be, {confidence -> 1.0, sentence -> 1}, []}, {token, 34, 40, another, {confidence -> 1.0, sentence -> 1}, []}, {token, 42, 49, sentence, {confidence -> 1.0, sentence -> 1}, []}]|[{token, 0, 3, thi, {confidence -> 1.0, sentence -> 0}, []}, {token, 5, 6, i, {confidence -> 1.0, sentence -> 0}, []}, 
{token, 8, 8, a, {confidence -> 1.0, sentence -> 0}, []}, {token, 10, 15, simpl, {confidence -> 1.0, sentence -> 0}, []}, {token, 17, 23, exampl, {confidence -> 1.0, sentence -> 0}, []}, {token, 24, 24, ., {confidence -> 0.0, sentence -> 0}, []}, {token, 26, 29, thi, {confidence -> 1.0, sentence -> 1}, []}, {token, 31, 32, i, {confidence -> 1.0, sentence -> 1}, []}, {token, 34, 40, anoth, {confidence -> 1.0, sentence -> 1}, []}, {token, 42, 49, sentenc, {confidence -> 1.0, sentence -> 1}, []}]|[{pos, 0, 3, DT, {word -> This, sentence -> 0}, []}, {pos, 5, 6, VBZ, {word -> is, sentence -> 0}, []}, {pos, 8, 8, DT, {word -> a, sentence -> 0}, []}, {pos, 10, 15, JJ, {word -> simple, sentence -> 0}, []}, {pos, 17, 23, NN, {word -> example, sentence -> 0}, []}, {pos, 24, 24, ., {word -> ., sentence -> 0}, []}, {pos, 26, 29, DT, {word -> This, sentence -> 1}, []}, {pos, 31, 32, VBZ, {word -> is, sentence -> 1}, []}, {pos, 34, 40, DT, {word -> another, sentence -> 1}, []}, {pos, 42, 49, NN, {word -> sentence, sentence -> 1}, []}]|\n", - 
"+--------------------------------------------------+--------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "\n" - ] - } - ], - "source": [ - "pipeline_model.transform(test_df).show(truncate=False)" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/examples/python/prediction/english/Load_Model_from_GCP_Storage.ipynb b/examples/python/prediction/english/Load_Model_from_GCP_Storage.ipynb deleted file mode 100644 index 0063fc26d980ea..00000000000000 --- a/examples/python/prediction/english/Load_Model_from_GCP_Storage.ipynb +++ 
/dev/null @@ -1,312 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/Load_Model_From_GCP_Storage.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DfdkWg6LThJP" - }, - "source": [ - "## Loading Pretrained Models from GCP" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-GmZvE5oTku4" - }, - "outputs": [], - "source": [ - "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r44X4OKlToLC" - }, - "source": [ - "## Defining GCP Storage URI in cache_pretrained" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Cy14aeXATt0S" - }, - "source": [ - "In this notebook, we are going to see the steps required to use an external GCP Storage URI as cache_pretrained folder\n", - "\n", - "In Spark NLP you can configure the location to download the pre-trained models. Starting at Spark NLP 4.2.4, you can set a GCP Storage URI. To do this, we need to configure the spark session with the required settings for Spark NLP and Spark ML." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CKVkaiTaULve" - }, - "source": [ - "### Spark NLP Settings" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "G0DgEpU7UhBw" - }, - "source": [ - "\n", - "\n", - "1. `cache_folder`: Here you must define your GCP storage URI (using gs prefix) that will store Spark NLP pre-trained models. This is defined in the config spark.jsl.settings.pretrained.cache_folder\n", - "2. `project_id`: We need to know the ProjectId of our GCP Storage. 
This is defined in `spark.jsl.settings.gcp`\n", - "\n", - "To integrage with GCP, we need to setup Application Default Credentials (ADC) for GCP. You can check how to configure it in the official [GCP documentation](https://cloud.google.com/docs/authentication/provide-credentials-adc)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NdluR0wzVVM_" - }, - "source": [ - "### Spark ML Settings" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gUeUonSiVkQj" - }, - "source": [ - "Spark ML requires the following configuration to load a model from GCP using ADC:\n", - "\n", - "\n", - "\n", - "1. GCP connector: You need to identify your hadoop versio and set the required dependency in `spark.jars.packages`\n", - "2. ADC credentials: After following the instructions to setup ADC, you will have a JSON file that holds your authenticiation information. This file is setup in `spark.hadoop.google.cloud.auth.service.account.json.keyfile`\n", - "3. Hadoop File System: You also need to setup the Hadoop implementation to work with GCP Storage as file system. This is define in `spark.hadoop.fs.gs.impl`\n", - "3. Finally, to mitigate conflicts between Spark's dependencies and user dependencies. You must define `spark.driver.userClassPathFirst` as true. 
You may also need to define `spark.executor.userClassPathFirst` as true.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FEQKV1IRYhg0" - }, - "source": [ - "Now, let's take a look at a simple ecxample the spark session creation below to see how to define each of the configurations with its values:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4JfeD8Rj-as2", - "outputId": "437ae866-f63e-43e0-b898-0860e3b19b7d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apache Spark version: 3.2.1\n" - ] - } - ], - "source": [ - "mport pyspark\n", - "from pyspark.sql import SparkSession\n", - "\n", - "#GCP Storage configuration\n", - "spark = SparkSession.builder \\\n", - " .appName(\"SparkNLP\") \\\n", - " .master(\"local[*]\") \\\n", - " .config(\"spark.driver.memory\", \"12G\") \\\n", - " .config(\"spark.serializer\", \"org.apache.spark.serializer.KryoSerializer\") \\\n", - " .config(\"spark.kryoserializer.buffer.max\", \"2000M\") \\\n", - " .config(\"spark.driver.maxResultSize\", \"0\") \\\n", - " .config(\"spark.jars\", \"./sparknlp.jar\") \\\n", - " .config(\"spark.jars.packages\", \"com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.4,com.google.cloud.bigdataoss:gcs-connector:hadoop3-2.2.8\") \\\n", - " .config(\"spark.hadoop.fs.gs.impl\", \"com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem\") \\\n", - " .config(\"spark.driver.userClassPathFirst\", \"true\") \\\n", - " .config(\"spark.hadoop.google.cloud.auth.service.account.json.keyfile\", \"/content/.config/application_default_credentials.json\") \\\n", - " .config(\"spark.jsl.settings.gcp.project_id\", \"my_project_id\") \\\n", - " .config(\"spark.jsl.settings.pretrained.cache_folder\", \"gs://my-bucket/models\") \\\n", - " .getOrCreate()\n", - "\n", - "print(\"Apache Spark version: {}\".format(spark.version))" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "Starting at spark-nlp 4.3.0, if you have control over spark session creation. You can also use sparknlp.start() with params argument:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "params = {\n", - " \"spark.jars.packages\": \"com.google.cloud.bigdataoss:gcs-connector:hadoop3-2.2.8\",\n", - " \"spark.hadoop.fs.gs.impl\": \"com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem\",\n", - " \"spark.driver.userClassPathFirst\", \"true\",\n", - " \"spark.hadoop.google.cloud.auth.service.account.json.keyfile\", \"/content/.config/application_default_credentials.json\",\n", - " \"spark.jsl.settings.gcp.project_id\", \"my_project_id\",\n", - " \"spark.jsl.settings.pretrained.cache_folder\", \"gs://my-bucket/models\"\n", - "}\n", - "\n", - "spark = sparknlp.start(params=params)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XLNO3Z9r6HgR" - }, - "outputs": [], - "source": [ - "import sparknlp\n", - "from sparknlp.base import *\n", - "from sparknlp.annotator import *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_eB72Yzg8_Jx" - }, - "outputs": [], - "source": [ - "sample_text = \"This is a sentence. 
This is another sentence\"\n", - "data_df = spark.createDataFrame([[sample_text]]).toDF(\"text\").cache()\n", - "\n", - "empty_df = spark.createDataFrame([[\"\"]]).toDF(\"text\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tRyju8D-6XJ1" - }, - "outputs": [], - "source": [ - "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", - "tokenizer = Tokenizer().setInputCols([\"document\"]).setOutputCol(\"token\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "X5G4_BXwOYtC", - "outputId": "7f15118f-6c8e-46c0-c432-48de09bd72b0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sentence_detector_dl download started this may take some time.\n", - "Approximate size to download 354.6 KB\n", - "[OK!]\n" - ] - } - ], - "source": [ - "sentence_detector_dl = SentenceDetectorDLModel() \\\n", - ".pretrained() \\\n", - ".setInputCols([\"document\"]) \\\n", - ".setOutputCol(\"sentence\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FhKPEMb09w6a" - }, - "outputs": [], - "source": [ - "pipeline = Pipeline(stages=[document_assembler, sentence_detector_dl, tokenizer])\n", - "pipeline_model = pipeline.fit(empty_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0CAp_AtrssPj", - "outputId": "4d579436-d3e5-429d-dabb-0d321dca1f0a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+--------------------+--------------------+--------------------+\n", - "| text| document| sentence| token|\n", - "+--------------------+--------------------+--------------------+--------------------+\n", - "|This is a sentenc...|[{document, 0, 43...|[{document, 0, 18...|[{token, 0, 3, Th...|\n", - 
"+--------------------+--------------------+--------------------+--------------------+\n", - "\n" - ] - } - ], - "source": [ - "result = pipeline_model.transform(data_df)\n", - "result.show()" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/examples/python/prediction/english/ViTForImageClassification.ipynb b/examples/python/prediction/english/ViTForImageClassification.ipynb deleted file mode 100644 index e9b2e8876097ee..00000000000000 --- a/examples/python/prediction/english/ViTForImageClassification.ipynb +++ /dev/null @@ -1,395 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "v8xIEZ07QpRM", - "outputId": "b5f5db4b-bce4-4b62-883f-3b3e90a3f1cd" - }, - "source": [ - "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/ViTForImageClassification.ipynb)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mz6G5fxae3HW" - }, - "outputs": [], - "source": [ - "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.1 -s 4.1.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6KvNW4MU5rrF", - "outputId": "36cf722b-f3a6-4566-8217-615cc58dc549" - }, - "source": [ - "## ViTForImageClassification Annotator" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BshxwBPTe3Hc" - }, - "source": [ - "In this notebok we 
are going to classify images using spark-nlp." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FaN1OWV0NQ5T" - }, - "source": [ - "### Downloading Images" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "jEHkswUjUfaU" - }, - "outputs": [], - "source": [ - "!wget -q https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/images/images.zip" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "k9F8WstLNXnS" - }, - "outputs": [], - "source": [ - "import shutil\n", - "shutil.unpack_archive(\"images.zip\", \"images\", \"zip\")" - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Start Spark Session" - ], - "metadata": { - "id": "3a_shOYHfpOn" - } - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "XLNO3Z9r6HgR" - }, - "outputs": [], - "source": [ - "import sparknlp\n", - "from sparknlp.base import *\n", - "from sparknlp.annotator import *\n", - "from pyspark.sql import SparkSession" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "4JfeD8Rj-as2" - }, - "outputs": [], - "source": [ - "spark = sparknlp.start()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "99AqJEThSBuT" - }, - "outputs": [], - "source": [ - "data_df = spark.read.format(\"image\").option(\"dropInvalid\", value = True).load(path=\"/content/images/images/\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "J86YU794UYEG" - }, - "source": [ - "### Pipeline with ViTForImageClassification" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tRyju8D-6XJ1", - "outputId": "ad8658bb-8170-488a-f9a1-680c63ad0f80" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "image_classifier_vit_base_patch16_224 download started this may take some time.\n", - "Approximate size to 
download 309.7 MB\n", - "[OK!]\n" - ] - } - ], - "source": [ - "image_assembler = ImageAssembler() \\\n", - " .setInputCol(\"image\") \\\n", - " .setOutputCol(\"image_assembler\")\n", - "\n", - "image_classifier = ViTForImageClassification \\\n", - " .pretrained() \\\n", - " .setInputCols(\"image_assembler\") \\\n", - " .setOutputCol(\"class\")\n", - "\n", - "pipeline = Pipeline(stages=[\n", - " image_assembler,\n", - " image_classifier,\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "XIYjEhW3O_Uc" - }, - "outputs": [], - "source": [ - "model = pipeline.fit(data_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gIZFLaUOPBnd", - "outputId": "a8cfe0c5-fe6a-4f0b-a4c1-e9cf5d1f22c0" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+--------------------+--------------------+--------------------+\n", - "| image| image_assembler| class|\n", - "+--------------------+--------------------+--------------------+\n", - "|{file:///content/...|[{image, file:///...|[{category, 0, 5,...|\n", - "|{file:///content/...|[{image, file:///...|[{category, 0, 11...|\n", - "|{file:///content/...|[{image, file:///...|[{category, 0, 55...|\n", - "|{file:///content/...|[{image, file:///...|[{category, 0, 2,...|\n", - "|{file:///content/...|[{image, file:///...|[{category, 0, 24...|\n", - "|{file:///content/...|[{image, file:///...|[{category, 0, 14...|\n", - "|{file:///content/...|[{image, file:///...|[{category, 0, 7,...|\n", - "|{file:///content/...|[{image, file:///...|[{category, 0, 8,...|\n", - "|{file:///content/...|[{image, file:///...|[{category, 0, 6,...|\n", - "|{file:///content/...|[{image, file:///...|[{category, 0, 1,...|\n", - "+--------------------+--------------------+--------------------+\n", - "\n" - ] - } - ], - "source": [ - "image_df = model.transform(data_df)\n", - "image_df.show()" - ] - }, 
- { - "cell_type": "markdown", - "metadata": { - "id": "Rfp5MK1UxoNt" - }, - "source": [ - "### Light Pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-_6VJPS9xvfV" - }, - "source": [ - "To use light pipeline in ViT transformer, we need to use the new method `fullAnnotateImage`, which can receive 3 kind of inputs:\n", - "1. A path to a single image\n", - "2. A path to a list of images" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "XDQ6PrgbSJ8W", - "outputId": "a2b3159d-f929-429b-d7be-fe119470fea4" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "dict_keys(['image_assembler', 'class'])" - ] - }, - "metadata": {}, - "execution_count": 11 - } - ], - "source": [ - "light_pipeline = LightPipeline(model)\n", - "annotations_result = light_pipeline.fullAnnotateImage(\"images/images/hippopotamus.JPEG\")\n", - "annotations_result[0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "73PV--LdSU5-", - "outputId": "4a5f8730-f515-413d-ed0a-010b98d2d844" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "annotator_type: image\n", - "origin: images/images/hippopotamus.JPEG\n", - "height: 333\n", - "width: 500\n", - "nChannels: 3\n", - "mode: 16\n", - "result size: 499500\n", - "metadata: Map()\n", - "[Annotation(category, 0, 55, hippopotamus, hippo, river horse, Hippopotamus amphibius, Map(nChannels -> 3, Some(lumbermill, sawmill) -> 7.2882756E-8, Some(beer glass) -> 9.0488925E-8, image -> 0, Some(damselfly) -> 1.9379786E-7, Some(turnstile) -> 6.8434524E-8, Some(cockroach, roach) -> 1.6622849E-7, height -> 333, Some(bulbul) -> 1.6930231E-7, Some(sea snake) -> 8.89582E-8, origin -> images/images/hippopotamus.JPEG, Some(mixing bowl) -> 1.2995402E-7, mode -> 16, None -> 1.3814622E-7, 
Some(whippet) -> 3.894023E-8, width -> 500, Some(buckle) -> 1.0061492E-7))]\n" - ] - } - ], - "source": [ - "for result in annotations_result:\n", - " image_assembler = result['image_assembler'][0]\n", - " print(f\"annotator_type: {image_assembler.annotator_type}\")\n", - " print(f\"origin: {image_assembler.origin}\")\n", - " print(f\"height: {image_assembler.height}\")\n", - " print(f\"width: {image_assembler.width}\")\n", - " print(f\"nChannels: {image_assembler.nChannels}\")\n", - " print(f\"mode: {image_assembler.mode}\")\n", - " print(f\"result size: {str(len(image_assembler.result))}\")\n", - " print(f\"metadata: {image_assembler.metadata}\")\n", - " print(result['class'])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "V37k8GQFySRW" - }, - "source": [ - "To send a list of images, we just difine a set of images" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "asf3MZGzyXl5", - "outputId": "03db32ad-2ac2-4bb9-dd38-7c06c5d6a4b8" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "dict_keys(['image_assembler', 'class'])" - ] - }, - "metadata": {}, - "execution_count": 14 - } - ], - "source": [ - "images = [\"images/images/bluetick.jpg\", \"images/images/palace.JPEG\", \"images/images/hen.JPEG\"]\n", - "annotations_result = light_pipeline.fullAnnotateImage(images)\n", - "annotations_result[0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "dfby3MJlymNV", - "outputId": "ef63a544-c995-429e-e965-302bc8781851" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[Annotation(category, 0, 7, bluetick, Map(nChannels -> 3, Some(lumbermill, sawmill) -> 1.3846728E-6, Some(beer glass) -> 1.1807944E-6, image -> 0, Some(damselfly) -> 3.6875622E-7, Some(turnstile) -> 2.023695E-6, 
Some(cockroach, roach) -> 6.2982855E-7, height -> 500, Some(bulbul) -> 5.417509E-7, Some(sea snake) -> 5.7421556E-7, origin -> images/images/bluetick.jpg, Some(mixing bowl) -> 5.4001305E-7, mode -> 16, None -> 4.5454306E-7, Some(whippet) -> 1.2101438E-6, width -> 333, Some(buckle) -> 1.1306514E-6))]\n", - "[Annotation(category, 0, 5, palace, Map(nChannels -> 3, Some(lumbermill, sawmill) -> 6.3918545E-5, Some(beer glass) -> 8.879939E-6, image -> 0, Some(damselfly) -> 9.565577E-6, Some(turnstile) -> 6.315168E-5, Some(cockroach, roach) -> 1.125408E-5, height -> 334, Some(bulbul) -> 3.321073E-5, Some(sea snake) -> 1.0886038E-5, origin -> images/images/palace.JPEG, Some(mixing bowl) -> 2.6202975E-5, mode -> 16, None -> 2.6134943E-5, Some(whippet) -> 1.3805137E-5, width -> 500, Some(buckle) -> 3.121459E-5))]\n", - "[Annotation(category, 0, 2, hen, Map(nChannels -> 3, Some(lumbermill, sawmill) -> 2.1663836E-5, Some(beer glass) -> 3.062036E-6, image -> 0, Some(damselfly) -> 5.8477954E-6, Some(turnstile) -> 1.8546416E-6, Some(cockroach, roach) -> 2.5356887E-6, height -> 375, Some(bulbul) -> 3.2049334E-6, Some(sea snake) -> 2.8824059E-6, origin -> images/images/hen.JPEG, Some(mixing bowl) -> 6.9148127E-6, mode -> 16, None -> 2.824775E-6, Some(whippet) -> 4.5998115E-7, width -> 500, Some(buckle) -> 1.6334545E-5))]\n" - ] - } - ], - "source": [ - "for result in annotations_result:\n", - " print(result['class'])" - ] - } - ], - "metadata": { - "colab": { - "name": "ViTForImageClassification-LightPipeline.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/examples/python/quick_start.ipynb 
b/examples/python/quick_start.ipynb index cb3ebb4efa4ab2..d264827f268872 100644 --- a/examples/python/quick_start.ipynb +++ b/examples/python/quick_start.ipynb @@ -2,22 +2,16 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Z8JHELbJTRlw" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/quick_start.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "C2gbyYe9TRly" - }, + "metadata": {}, "source": [ "# Spark NLP Quick Start\n", "### How to use Spark NLP pretrained pipelines" @@ -25,27 +19,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 13787, - "status": "ok", - "timestamp": 1589692712124, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "cx8GKMsUbbSw", - "outputId": "6419ad3f-df5e-45d3-c071-c5c439076d40" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -58,33 +33,14 @@ } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 13779, - 
"status": "ok", - "timestamp": 1589692712125, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "Y1_3lypqTRl4", - "outputId": "49ddb0b6-cef2-4e8a-b0ce-2934931d9f6f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -106,12 +62,8 @@ }, { "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "piF4liZZTRmD" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.pretrained import PretrainedPipeline " @@ -119,10 +71,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "w1OadFjMTRmo" - }, + "metadata": {}, "source": [ "Let's use Spark NLP pre-trained pipeline for `named entity recognition`\n", "\n", @@ -131,27 +80,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 16622, - "status": "ok", - "timestamp": 1589692715027, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "zA1Saw3qTRmq", - "outputId": "5923abe3-0d9d-4f6f-8a73-3d635fc9f476" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -169,12 +99,8 @@ }, { "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "TtSviSpXTRm1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "result = pipeline.annotate('Google has announced the release of a beta version of the popular TensorFlow machine learning library.') " @@ -182,27 +108,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 
16600, - "status": "ok", - "timestamp": 1589692715029, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "AoBxRzGBTRm8", - "outputId": "551ba735-a363-4fae-a6b7-518c64e5a0e8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -218,27 +125,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 16588, - "status": "ok", - "timestamp": 1589692715030, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "pANHeKPVTRnD", - "outputId": "75abb2f8-2ff0-4a53-ca0a-69fedc30a84a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -254,37 +142,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "BiIpiZHkTRnL" - }, + "metadata": {}, "source": [ "Let's use Spark NLP pre-trained pipeline for `sentiment` analysis" ] }, { "cell_type": "code", - "execution_count": 31, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 19347, - "status": "ok", - "timestamp": 1589692717802, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "z1NgO9-vTRnM", - "outputId": "7c0e0175-c446-4f24-b2ca-427917fed031" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -302,12 +168,8 @@ }, { "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "zh3isgBmTRnS" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "result = pipeline.annotate('This is a very boring movie. 
I recommend others to awoid this movie is not good..')" @@ -315,27 +177,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 19334, - "status": "ok", - "timestamp": 1589692717810, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "UDsuVeL7TRnZ", - "outputId": "4cfc3157-4e07-4627-bc24-d18a2193dfc7" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -351,27 +194,8 @@ }, { "cell_type": "code", - "execution_count": 34, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 19324, - "status": "ok", - "timestamp": 1589692717813, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "X942sLnUTRne", - "outputId": "56f25579-7e1c-48f1-96ff-0e13dc2d5098" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -387,10 +211,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "wRgDI3k2TRnj" - }, + "metadata": {}, "source": [ "The word `awoid` has been corrected to `avoid` by spell checker insdie this pipeline" ] @@ -416,8 +237,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/quick_start_google_colab.ipynb b/examples/python/quick_start_google_colab.ipynb index 17159a1ba057b6..0c3f4ea015cf24 100644 --- a/examples/python/quick_start_google_colab.ipynb +++ b/examples/python/quick_start_google_colab.ipynb @@ -1,46 +1,15 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": 
"quick_start_google_colab.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "ph3bDypIEXdd" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "aaVmDt1TEXdh" - }, + "metadata": {}, "source": [ "# Spark NLP Quick Start\n", "### How to use Spark NLP pretrained pipelines" @@ -48,37 +17,25 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "YkbpOBs6DasA" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/quick_start_google_colab.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "XtCa0sZ8EXdj" - }, + "metadata": {}, "source": [ "We will first set up the runtime environment and then load pretrained Entity Recognition model and Sentiment analysis model and give it a quick test. Feel free to test the models on your own sentences / datasets." 
] }, { "cell_type": "code", - "metadata": { - "id": "tyMMD_upEfIa", - "outputId": "39b0ad03-5be5-4f61-e87e-9724293deb4a", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ], - "execution_count": 1, + "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "--2021-04-13 17:55:16-- http://setup.johnsnowlabs.com/colab.sh\n", @@ -93,158 +50,123 @@ "Length: 1594 (1.6K) [text/plain]\n", "Saving to: ‘STDOUT’\n", "\n", - "\r- 0%[ ] 0 --.-KB/s \r- 100%[===================>] 1.56K --.-KB/s in 0s \n", + "\r", + "- 0%[ ] 0 --.-KB/s \r", + "- 100%[===================>] 1.56K --.-KB/s in 0s \n", "\n", "2021-04-13 17:55:16 (32.4 MB/s) - written to stdout [1594/1594]\n", "\n", "setup Colab for PySpark 3.0.2 and Spark NLP 3.0.1\n", - "\u001B[K |████████████████████████████████| 204.8MB 64kB/s \n", - "\u001B[K |████████████████████████████████| 153kB 39.5MB/s \n", - "\u001B[K |████████████████████████████████| 204kB 22.0MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ], - "name": "stdout" + "\u001b[K |████████████████████████████████| 204.8MB 64kB/s \n", + "\u001b[K |████████████████████████████████| 153kB 39.5MB/s \n", + "\u001b[K |████████████████████████████████| 204kB 22.0MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" + ] } + ], + "source": [ + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5in-TmzGEXdk", - "outputId": "e97246ab-5735-4059-9ba7-050e5f817643" - }, - "source": [ - "import sparknlp\n", - "spark = sparknlp.start()\n", - "\n", - "print(\"Spark NLP version: {}\".format(sparknlp.version()))\n", - "print(\"Apache Spark version: {}\".format(spark.version))" - ], - "execution_count": 2, + "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Spark NLP version: 3.0.1\n", "Apache Spark version: 3.0.2\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "import sparknlp\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Spark NLP version: {}\".format(sparknlp.version()))\n", + "print(\"Apache Spark version: {}\".format(spark.version))" ] }, { "cell_type": "code", - "metadata": { - "id": "Wt1KiTMFEXdp" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from sparknlp.pretrained import PretrainedPipeline " - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "metadata": { - "id": "RtGm-OY4EXds" - }, + "metadata": {}, "source": [ "Let's use Spark NLP pre-trained pipeline for `named entity recognition`" ] }, { "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "lNu3meQKEXdu", - "outputId": "08a9d3fe-e9f1-4543-fde4-dea56e3f9a90" - }, - "source": [ - "pipeline = PretrainedPipeline('recognize_entities_dl', 'en')" - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "recognize_entities_dl download started this may take some time.\n", "Approx size to download 160.1 MB\n", "[OK!]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "pipeline = PretrainedPipeline('recognize_entities_dl', 'en')" ] }, 
{ "cell_type": "code", - "metadata": { - "id": "iMzyLyftEXdy" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "result = pipeline.annotate('President Biden represented Delaware for 36 years in the U.S. Senate before becoming the 47th Vice President of the United States.') " - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5Ark1N0GEXd1", - "outputId": "bbb74761-e5bd-4e86-8c7f-9da99c8ddc7b" - }, - "source": [ - "print(result['ner'])\n", - "print(result['entities'])" - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "['O', 'B-PER', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O']\n", "['Biden', 'Delaware', 'U.S', 'Senate', 'United States']\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(result['ner'])\n", + "print(result['entities'])" ] }, { "cell_type": "markdown", - "metadata": { - "id": "h5ivlUOaXQVl" - }, + "metadata": {}, "source": [ "Let's try another Spark NLP pre-trained pipeline for `named entity recognition`" ] }, { "cell_type": "code", - "metadata": { - "id": "XxWfmz_sXWWv", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "1a19f337-b281-4b89-fd02-ab12d2e0055f" - }, - "source": [ - "pipeline = PretrainedPipeline('onto_recognize_entities_bert_tiny', 'en')\n", - "\n", - "result = pipeline.annotate(\"Johnson first entered politics when elected in 2001 as a member of Parliament. 
He then served eight years as the mayor of London, from 2008 to 2016, before rejoining Parliament.\")\n", - "\n", - "print(result['ner'])\n", - "print(result['entities'])" - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "onto_recognize_entities_bert_tiny download started this may take some time.\n", @@ -252,96 +174,102 @@ "[OK!]\n", "['B-PERSON', 'B-ORDINAL', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'B-DATE', 'O', 'B-DATE', 'O', 'O', 'O', 'B-ORG']\n", "['Johnson', 'first', '2001', 'Parliament.', 'eight years', 'London,', '2008', '2016', 'Parliament.']\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "pipeline = PretrainedPipeline('onto_recognize_entities_bert_tiny', 'en')\n", + "\n", + "result = pipeline.annotate(\"Johnson first entered politics when elected in 2001 as a member of Parliament. He then served eight years as the mayor of London, from 2008 to 2016, before rejoining Parliament.\")\n", + "\n", + "print(result['ner'])\n", + "print(result['entities'])" ] }, { "cell_type": "markdown", - "metadata": { - "id": "0EKcEN_oEXd9" - }, + "metadata": {}, "source": [ "Let's use Spark NLP pre-trained pipeline for `sentiment` analysis" ] }, { "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "n4ZXQDnlEXd-", - "outputId": "bbee27fd-7834-4c35-e21e-91357fd66722" - }, - "source": [ - "pipeline = PretrainedPipeline('analyze_sentimentdl_glove_imdb', 'en')" - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "analyze_sentimentdl_glove_imdb download started this may take some time.\n", "Approx size to download 155.3 MB\n", "[OK!]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "pipeline = PretrainedPipeline('analyze_sentimentdl_glove_imdb', 'en')" ] }, { "cell_type": "code", - "metadata": { 
- "id": "73O-w8IYEXeC" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "result = pipeline.annotate(\"Harry Potter is a great movie.\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "joIUX2P4EXeJ", - "outputId": "c275c0bd-5ba9-4054-cb0e-c477fcf3ae24" - }, - "source": [ - "print(result['sentiment'])" - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "['pos']\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(result['sentiment'])" ] }, { "cell_type": "markdown", - "metadata": { - "id": "DI57n5vNYY6M" - }, + "metadata": {}, "source": [ "### Please check our [Models Hub](https://nlp.johnsnowlabs.com/models) for more pretrained models and pipelines! 😊 " ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "quick_start_google_colab.ipynb", + "provenance": [], + "toc_visible": true }, - { - "cell_type": "code", - "metadata": { - "id": "U8h-9Q32YZRG" + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 }, - "source": [], - "execution_count": null, - "outputs": [] + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" } - ] + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/examples/python/quick_start_offline.ipynb b/examples/python/quick_start_offline.ipynb index 454f02d8d52459..0fd0150b002594 100644 --- a/examples/python/quick_start_offline.ipynb +++ b/examples/python/quick_start_offline.ipynb @@ -1,24 +1,8 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "SparkNLP_offline_installation.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - 
"kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "PA-GQ-icbc4l" - }, + "metadata": {}, "source": [ "# Description\n", "## This notebok provides set of commands to install Spark NLP for offline usage. It contains 4 sections:\n", @@ -35,52 +19,43 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "JQfDxcj_cHfB" - }, + "metadata": {}, "source": [ "## 1) Download all dependencies for Spark NLP" ] }, { "cell_type": "code", - "metadata": { - "id": "gksUrPmN6uk7", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 54 - }, - "outputId": "05e6b829-2faa-4a1e-f666-00b3279f0052" - }, - "source": [ - "import json\n", - "\n", - "with open('workshop_license_keys_365.json') as f:\n", - " license_keys = json.load(f)\n", - "\n", - "license_keys.keys()\n" - ], "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "dict_keys(['PUBLIC_VERSION', 'JSL_VERSION', 'SECRET', 'SPARK_NLP_LICENSE', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'SPARK_OCR_LICENSE', 'SPARK_OCR_SECRET'])" ] }, + "execution_count": null, "metadata": { "tags": [] }, - "execution_count": 1 + "output_type": "execute_result" } + ], + "source": [ + "import json\n", + "\n", + "with open('workshop_license_keys_365.json') as f:\n", + " license_keys = json.load(f)\n", + "\n", + "license_keys.keys()\n" ] }, { "cell_type": "code", - "metadata": { - "id": "5ej26v-R7PA4" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "\n", "os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']\n", @@ -90,404 +65,310 @@ "\n", "version = license_keys['PUBLIC_VERSION']\n", "jsl_version = license_keys['JSL_VERSION']\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "lYQa6btGcA3K" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "! 
apt-get update -qq\n", "! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "VKzKoFqYeuXV", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "outputId": "e43e3d88-a357-4b0d-86ca-ecd14e4de0a1" - }, - "source": [ - "import os\n", - "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n", - "os.environ[\"PATH\"] = os.environ[\"JAVA_HOME\"] + \"/bin:\" + os.environ[\"PATH\"]\n", - "!java -version" - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "openjdk version \"1.8.0_265\"\n", "OpenJDK Runtime Environment (build 1.8.0_265-8u265-b01-0ubuntu2~18.04-b01)\n", "OpenJDK 64-Bit Server VM (build 25.265-b01, mixed mode)\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "import os\n", + "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n", + "os.environ[\"PATH\"] = os.environ[\"JAVA_HOME\"] + \"/bin:\" + os.environ[\"PATH\"]\n", + "!java -version" ] }, { "cell_type": "code", - "metadata": { - "id": "wFLhAlrrekgY", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "outputId": "3645e311-51d5-4175-8562-0d80e1c33b72" - }, - "source": [ - "!pip install --ignore-installed -q pyspark==2.4.4" - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ - "\u001B[K |████████████████████████████████| 215.7MB 65kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 44.9MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ], - "name": "stdout" + "\u001b[K |████████████████████████████████| 215.7MB 65kB/s \n", + "\u001b[K |████████████████████████████████| 204kB 44.9MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" + ] } + ], + "source": [ + "!pip install --ignore-installed -q pyspark==2.4.4" ] }, { "cell_type": "code", - "metadata": { - "id": "JcCaD_opjW2j", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "outputId": "40ab71c2-7d13-4550-dc1a-1bc1de9aff3d" - }, - "source": [ - "!pip list | grep spark" - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "pyspark 2.4.4 \n" - ], - "name": "stdout" + ] } + ], + "source": [ + "!pip list | grep spark" ] }, { "cell_type": "code", - "metadata": { - "id": "amPNFytRe1oK" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!sudo apt install awscli" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "c1c6LBDRi94D" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# spark-nlp jar\n", "!wget -q https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/spark-nlp-assembly-2.7.3.jar\n", "\n", "# spark-nlp wheel\n", "!wget -q https://github.com/JohnSnowLabs/spark-nlp/archive/2.7.3.tar.gz" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "nouKIWjKzy8x" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!tar -xvf spark-nlp-2.7.3.tar.gz" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "Dd0s4_2fz_Wh", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "outputId": "00e5793d-c84c-4e74-f7cf-afce4c3f3a63" - }, - "source": [ - "!pip install -q spark-nlp-2.7.3/ " - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ - " Building wheel for spark-nlp (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ], - "name": "stdout" + " Building wheel for spark-nlp (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" + ] } + ], + "source": [ + "!pip install -q spark-nlp-2.7.3/ " ] }, { "cell_type": "markdown", - "metadata": { - "id": "S1lC_kgv0QU8" - }, + "metadata": {}, "source": [ "## 2) Download all dependencies for Spark NLP (enterprise/licensed)" ] }, { "cell_type": "code", - "metadata": { - "id": "OwjjLwbUJp-d" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# here you need to enter your AWS KEY and AWS SECRET KEY.\n", "# As a region enter \"ohio\"\n", "# As a language enter \"en\"\n", "!aws configure" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "1sraWBjHJEis" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "jsl_secret = license_keys['SECRET']\n", "jsl_jar = jsl_version+'.jar'\n", "jsl_tar = jsl_version+'.tar.gz'" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "MVP0_TeTIVB9" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# spark nlp JSL wheel\n", "!sudo aws s3 cp --region us-east-2 s3://pypi.johnsnowlabs.com/$jsl_secret/spark-nlp-jsl-$jsl_jar spark-nlp-jsl-$jsl_jar\n", "!sudo aws s3 cp --region us-east-2 s3://pypi.johnsnowlabs.com/$secret/spark-nlp-jsl/spark-nlp-jsl-$jsl_tar spark-nlp-jsl-$jsl_tar" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "Re8Nz55gGINz" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!tar -xvf spark-nlp-jsl-$jsl_tar" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "MZ9ZXoNZGZfv" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!pip install -q /content/spark-nlp-jsl-$jsl_version/ " - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "eMz1WnR-GbdS", - "colab": { - "base_uri": 
"https://localhost:8080/", - "height": 68 - }, - "outputId": "db7697f5-9742-4da8-8c39-981701fd3810" - }, - "source": [ - "!pip list | grep spark" - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "pyspark 2.4.4 \n", "spark-nlp 2.6.0 \n", "spark-nlp-jsl 2.6.0 \n" - ], - "name": "stdout" + ] } + ], + "source": [ + "!pip list | grep spark" ] }, { "cell_type": "markdown", - "metadata": { - "id": "O40K3dk0HTTJ" - }, + "metadata": {}, "source": [ "## 3) Download all dependencies for Spark NLP OCR" ] }, { "cell_type": "code", - "metadata": { - "id": "0jnh1V1bB-5z" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "ocr_secret = license_keys['SPARK_OCR_SECRET']\n", "ocr_version = ocr_secret.split('-')[0]\n", - "ocr_jar = ocr_version+'.spark24.jar'", - "ocr_tar = ocr_version+'.spark24.tar.gz'" - ], - "execution_count": null, - "outputs": [] + "ocr_jar = ocr_version+'.spark24.jar'ocr_tar = ocr_version+'.spark24.tar.gz'" + ] }, { "cell_type": "code", - "metadata": { - "id": "U2wt9j78CM2j" - }, - "source": [ - "!wget -q https://pypi.johnsnowlabs.com/$ocr_secret/jars/spark-ocr-assembly-$ocr_jar", - "!wget -q https://pypi.johnsnowlabs.com/$ocr_secret/spark-ocr/spark-ocr-$ocr_tar" - ], "execution_count": null, - "outputs": [] + "metadata": {}, + "outputs": [], + "source": [ + "!wget -q https://pypi.johnsnowlabs.com/$ocr_secret/jars/spark-ocr-assembly-$ocr_jar!wget -q https://pypi.johnsnowlabs.com/$ocr_secret/spark-ocr/spark-ocr-$ocr_tar" + ] }, { "cell_type": "code", - "metadata": { - "id": "FZRvMXzxHyGh" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# unpack wheel OCR\n", "!tar -xvf /content/spark-ocr-$ocr_tar" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "lvRdkgezH3ZG" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!pip install -q 
/content/spark-ocr-$ocr_version/" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "_jay9Oo4H4vm", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 85 - }, - "outputId": "506068a1-6675-4107-a589-153d99cf0ac8" - }, - "source": [ - "#sanity check\n", - "!pip list | grep spark" - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "pyspark 2.4.4 \n", "spark-nlp 2.6.0 \n", "spark-nlp-jsl 2.6.0 \n", "spark-ocr 1.5.0 \n" - ], - "name": "stdout" + ] } + ], + "source": [ + "#sanity check\n", + "!pip list | grep spark" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Q-8LX-epIq2N" - }, + "metadata": {}, "source": [ "## Installation completed. Let's download models using AWS keys" ] }, { "cell_type": "markdown", - "metadata": { - "id": "rlSvYKY4I2jk" - }, + "metadata": {}, "source": [ "## 4) Download all models/embeddings for offline usage" ] }, { "cell_type": "code", - "metadata": { - "id": "a3v-bdXnI15g" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# This code will download >100 GB of Spark NLP models to your local disk\n", "# !sudo aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/public/models/ public_models/ --recursive " - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "Dnt54kUxK_1m" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# This code also will download >100 GB of clinical embeddings from Spark NLP models\n", "# !sudo aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/clinical/models/ clinical_models/ --recursive " - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "QC1ysJefIVAn", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 326 - }, - "outputId": "61391f2e-e1b5-4ac5-a71f-a521c4d1a96b" - }, - "source": [ - "# For 
example purposes let's download only subset for NER and glove\n", - "!sudo aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/public/models/ public_models/ --recursive --exclude \"*\" --include \"ner_dl*\"" - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "download: s3://auxdata.johnsnowlabs.com/public/models/ner_dl_bert_base_cased_en_2.6.0_2.4_1599550960441.zip to public_models/ner_dl_bert_base_cased_en_2.6.0_2.4_1599550960441.zip\n", @@ -507,27 +388,21 @@ "download: s3://auxdata.johnsnowlabs.com/public/models/ner_dl_sentence_en_2.0.2_2.4_1556666842347.zip to public_models/ner_dl_sentence_en_2.0.2_2.4_1556666842347.zip\n", "download: s3://auxdata.johnsnowlabs.com/public/models/ner_dl_en_2.4.3_2.4_1584624950746.zip to public_models/ner_dl_en_2.4.3_2.4_1584624950746.zip\n", "download: s3://auxdata.johnsnowlabs.com/public/models/ner_dl_en_2.0.2_2.4_1558802205173.zip to public_models/ner_dl_en_2.0.2_2.4_1558802205173.zip\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "# For example purposes let's download only subset for NER and glove\n", + "!sudo aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/public/models/ public_models/ --recursive --exclude \"*\" --include \"ner_dl*\"" ] }, { "cell_type": "code", - "metadata": { - "id": "uwljAlhVKTPL", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "outputId": "6c843e35-6f26-4caf-a87f-f4ff06bfad2c" - }, - "source": [ - "!sudo aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/public/models/ public_models/ --recursive --exclude \"*\" --include \"glove*\"" - ], "execution_count": null, + "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "download: s3://auxdata.johnsnowlabs.com/public/models/glove_6B_100_xx_2.4.0_2.4_1579690037117.zip to public_models/glove_6B_100_xx_2.4.0_2.4_1579690037117.zip\n", @@ -539,83 +414,73 @@ "download: 
s3://auxdata.johnsnowlabs.com/public/models/glove_6B_300_xx_2.1.0_2.4_1564760779318.zip to public_models/glove_6B_300_xx_2.1.0_2.4_1564760779318.zip\n", "download: s3://auxdata.johnsnowlabs.com/public/models/glove_840B_300_xx_2.0.2_2.4_1558645003344.zip to public_models/glove_840B_300_xx_2.0.2_2.4_1558645003344.zip\n", "download: s3://auxdata.johnsnowlabs.com/public/models/glove_840B_300_xx_2.4.0_2.4_1579698926752.zip to public_models/glove_840B_300_xx_2.4.0_2.4_1579698926752.zip\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "!sudo aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/public/models/ public_models/ --recursive --exclude \"*\" --include \"glove*\"" ] }, { "cell_type": "code", - "metadata": { - "id": "QixCIfYFKqXk" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# !sudo aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/clinical/models/ clinical_models/ --recursive --exclude \"*\" --include \"embeddings_clinical*\"" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "metadata": { - "id": "raLv3rlIyLUW" - }, + "metadata": {}, "source": [ "## 5) Example on NER" ] }, { "cell_type": "code", - "metadata": { - "id": "SGI4DSrjLfhJ" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!unzip -q /content/public_models/ner_dl_en_2.4.3_2.4_1584624950746.zip -d ner_dl_glove/" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "-uv0nAgB323j" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!unzip -q /content/public_models/glove_100d_en_2.4.0_2.4_1579690104032.zip -d glove_embeddings/" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "dSy25ADGyXGS" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "ner_local_path = 'ner_dl_glove'\n", "embeddings_local_path = 'glove_embeddings'" - ], - 
"execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "61tSeaUKzAKb" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "spark_nlp_jar_path = \"/content/spark-nlp-assembly-\"+version+\".jar\"\n", "spark_nlp_internal = \"/content/spark-nlp-jsl-\"+jsl_jar\n", "spark_nlp_jar_path = spark_nlp_jar_path+\",\"+spark_nlp_internal" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "OPBk1kOizK9u" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import json\n", "import os\n", @@ -626,15 +491,13 @@ "from sparknlp_jsl.annotator import *\n", "from sparknlp.base import *\n", "import sparknlp_jsl" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "FVzd5SJFzOAA" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "def start():\n", " builder = SparkSession.builder \\\n", @@ -647,15 +510,13 @@ " return builder.getOrCreate()\n", "\n", "spark = start()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "uRs4AIfry7dQ" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "documentAssembler = DocumentAssembler()\\\n", " .setInputCol(\"text\")\\\n", @@ -685,30 +546,15 @@ "empty_df = spark.createDataFrame([['']]).toDF(\"text\")\n", "\n", "pipelineModel = nlpPipeline.fit(empty_df)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": { - "id": "-F9F8e7CMgYM", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "outputId": "09c1e8d6-b77f-4ebf-8aa7-b6eb2155660e" - }, - "source": [ - "df = spark.createDataFrame([['Peter Parker lives in New York.']]).toDF(\"text\")\n", - "\n", - "result = pipelineModel.transform(df)\n", - "\n", - "result.select('token.result','ner.result').show(truncate=False)" - ], "execution_count": null, + 
"metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "+----------------------------------------+-------------------------------------+\n", @@ -717,34 +563,23 @@ "|[Peter, Parker, lives, in, New, York, .]|[B-PER, I-PER, O, O, B-LOC, I-LOC, O]|\n", "+----------------------------------------+-------------------------------------+\n", "\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "df = spark.createDataFrame([['Peter Parker lives in New York.']]).toDF(\"text\")\n", + "\n", + "result = pipelineModel.transform(df)\n", + "\n", + "result.select('token.result','ner.result').show(truncate=False)" ] }, { "cell_type": "code", - "metadata": { - "id": "xoboVMnO4KaD", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "outputId": "a586aeb4-5431-42c6-b543-bcbba503e2cd" - }, - "source": [ - "light_model = LightPipeline(pipelineModel)\n", - "\n", - "text = 'Peter Parker lives in New York.'\n", - "\n", - "light_result = light_model.annotate(text)\n", - "\n", - "list(zip(light_result['token'], light_result['ner']))" - ], "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[('Peter', 'B-PER'),\n", @@ -756,23 +591,36 @@ " ('.', 'O')]" ] }, + "execution_count": null, "metadata": { "tags": [] }, - "execution_count": 97 + "output_type": "execute_result" } + ], + "source": [ + "light_model = LightPipeline(pipelineModel)\n", + "\n", + "text = 'Peter Parker lives in New York.'\n", + "\n", + "light_result = light_model.annotate(text)\n", + "\n", + "list(zip(light_result['token'], light_result['ner']))" ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "SparkNLP_offline_installation.ipynb", + "provenance": [], + "toc_visible": true }, - { - "cell_type": "code", - "metadata": { - "id": "-Echfpd-4jhc" - }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + "kernelspec": { + "display_name": "Python 3", + "name": 
"python3" } - ] + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/examples/python/training/chinese/word-segmentation/WordSegmenter_train_chinese_segmentation.ipynb b/examples/python/training/chinese/word-segmentation/WordSegmenter_train_chinese_segmentation.ipynb index b3d4057f6e7de9..33dfd8feb75251 100644 --- a/examples/python/training/chinese/word-segmentation/WordSegmenter_train_chinese_segmentation.ipynb +++ b/examples/python/training/chinese/word-segmentation/WordSegmenter_train_chinese_segmentation.ipynb @@ -2,28 +2,22 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "cpYpeEfnmWKd" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "xl3k8bt-mZIc" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/chinese/word-segmentation/WordSegmenter_train_chinese_segmentation.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/training/chinese/word-segmentation/WordSegmenter_train_chinese_segmentation.ipynb)\n", "\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "xluzxinzKK-L" - }, + "metadata": {}, "source": [ "# [Word Segmenter](https://nlp.johnsnowlabs.com/docs/en/annotators#wordsegmenter)\n", "\n", @@ -45,10 +39,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "h4fQwZ46x4fu" - }, + "metadata": {}, "source": [ "Only run this block if you are inside Google Colab to set up Spark NLP otherwise\n", "skip it." 
@@ -56,10 +47,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "MdE588BiY3z1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" @@ -81,17 +70,15 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "SBtn9YsW0eHz" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 3.4.2\n", - "Apache Spark version: 3.0.2\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -125,45 +112,12 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2022-03-29 17:26:03-- https://raw.githubusercontent.com/taotao033/conll-formatted-ontonotes-5.0_for_chinese_language/master/onto.train.ner\n", - "SSL_INIT\n", - "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 960 [text/plain]\n", - "Saving to: ‘onto.train.ner’\n", - "\n", - "onto.train.ner 100%[===================>] 960 --.-KB/s in 0s \n", - "\n", - "2022-03-29 17:26:03 (178 MB/s) - ‘onto.train.ner’ saved [960/960]\n", - "\n", - "--2022-03-29 17:26:03-- https://raw.githubusercontent.com/taotao033/conll-formatted-ontonotes-5.0_for_chinese_language/master/onto.test.ner\n", - "SSL_INIT\n", - "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 974 [text/plain]\n", - "Saving to: ‘onto.test.ner’\n", - "\n", - "onto.test.ner 100%[===================>] 974 --.-KB/s in 0s \n", - "\n", - "2022-03-29 17:26:03 (191 MB/s) - ‘onto.test.ner’ saved [974/974]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "!wget https://raw.githubusercontent.com/taotao033/conll-formatted-ontonotes-5.0_for_chinese_language/master/onto.train.ner\n", - "!wget https://raw.githubusercontent.com/taotao033/conll-formatted-ontonotes-5.0_for_chinese_language/master/onto.test.ner\n" + "!wget https://raw.githubusercontent.com/taotao033/conll-formatted-ontonotes-5.0_for_chinese_language/master/onto.test.ner" ] }, { @@ -176,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -202,14 +156,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hJFV80wXyXiQ", - "outputId": "c1c1ef34-8604-482d-d845-11ed44d48275" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -246,10 +194,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": { - "id": "06z9uTcD1RU8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -293,8 +239,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/classification/ClassifierDL_Train_and_Evaluate.ipynb b/examples/python/training/english/classification/ClassifierDL_Train_and_Evaluate.ipynb index 1b68511b18f0b2..9f52a173d0b902 100644 --- 
a/examples/python/training/english/classification/ClassifierDL_Train_and_Evaluate.ipynb +++ b/examples/python/training/english/classification/ClassifierDL_Train_and_Evaluate.ipynb @@ -1,39 +1,20 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "ph3bDypIEXdd" - }, - "source": [ - "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aaVmDt1TEXdh" - }, - "source": [ - "# Spark NLP\n", - "### Multi-class Text Classification\n", - "#### By using ClassifierDL" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jmo3o-b3MF5W" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/ClassifierDL_Train_and_Evaluate.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/ClassifierDL_Train_and_Evaluate.ipynb)\n", + "\n", + "# Multi-class Text Classification using ClassifierDL" ] }, { "cell_type": "markdown", - "metadata": { - "id": "h4fQwZ46x4fu" - }, + "metadata": {}, "source": [ "Only run this block if you are inside Google Colab otherwise skip it" ] @@ -44,36 +25,40 @@ "metadata": {}, "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Ti2kwlQNyXCh" - }, + "metadata": {}, "source": [ "In this notebook we are going to check the training logs on the fly. 
Thus, we start a session with `real_time_output=True`" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Hmvv5Q4jMF5b", - "outputId": "8d427e29-4079-4c79-ea48-4142545d3e66" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 4.1.0\n", - "Apache Spark version; 3.2.1\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version; 3.3.0\n", + "23/02/20 17:36:52 WARN Utils: Your hostname, duc-manjaro resolves to a loopback address: 127.0.1.1; using 192.168.0.34 instead (on interface enp3s0)\n", + "23/02/20 17:36:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = jar:file:/home/root/.conda/envs/sparknlp/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n", + "23/02/20 17:36:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "23/02/20 17:36:55 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n", + "23/02/20 17:36:55 WARN Utils: Service 'SparkUI' could not bind on port 4041. 
Attempting port 4042.\n" ] } ], @@ -88,38 +73,31 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "xKQcm8R6MF5e" - }, + "metadata": {}, "source": [ "Let's download news category dataset for training our text classifier" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "W0FkrTb4MF5f", - "outputId": "284656a9-2f35-4f08-e1b2-8e40b653c1d8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2022-09-23 17:48:38-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_train.csv\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.160.208\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.160.208|:443... connected.\n", + "--2023-02-20 17:37:03-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_train.csv\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.142.224, 52.216.50.8, 52.216.162.69, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.142.224|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 24032125 (23M) [text/csv]\n", "Saving to: ‘news_category_train.csv’\n", "\n", - "news_category_train 100%[===================>] 22.92M 102MB/s in 0.2s \n", + "news_category_train 100%[===================>] 22,92M 13,1MB/s in 1,7s \n", "\n", - "2022-09-23 17:48:38 (102 MB/s) - ‘news_category_train.csv’ saved [24032125/24032125]\n", + "2023-02-20 17:37:05 (13,1 MB/s) - ‘news_category_train.csv’ saved [24032125/24032125]\n", "\n" ] } @@ -130,29 +108,24 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "QDIQgMv6tuqu", - "outputId": "55fc6a77-2858-4fd9-e359-7cea4c6dfc4b" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2022-09-23 17:48:38-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_test.csv\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.160.208\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.160.208|:443... connected.\n", + "--2023-02-20 17:37:06-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_test.csv\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.21.181, 54.231.130.208, 52.217.205.32, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.21.181|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1504408 (1.4M) [text/csv]\n", + "Length: 1504408 (1,4M) [text/csv]\n", "Saving to: ‘news_category_test.csv’\n", "\n", - "news_category_test. 100%[===================>] 1.43M --.-KB/s in 0.05s \n", + "news_category_test. 
100%[===================>] 1,43M 2,09MB/s in 0,7s \n", "\n", - "2022-09-23 17:48:39 (27.1 MB/s) - ‘news_category_test.csv’ saved [1504408/1504408]\n", + "2023-02-20 17:37:07 (2,09 MB/s) - ‘news_category_test.csv’ saved [1504408/1504408]\n", "\n" ] } @@ -163,14 +136,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "QYolNmBtMF5h", - "outputId": "f90d588d-dc3a-4ada-9a38-3d8f41eaed4c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -195,19 +162,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "zWFUDI6jMF5k" - }, + "metadata": {}, "source": [ "The content is inside `description` column and the labels are inside `category` column" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "S9TRr7iAMF5l" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "trainDataset = spark.read \\\n", @@ -217,14 +180,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nURzgFJ7MF5o", - "outputId": "8c71d015-34ca-45e4-cac5-444fc7389525" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -265,14 +222,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5NIHJuVKx4gk", - "outputId": "0707bf0c-3fcd-4071-d41f-18d99f29c8ad" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -280,7 +231,7 @@ "120000" ] }, - "execution_count": 11, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -291,10 +242,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "0oKvNZaEMF5q" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", @@ -306,9 +255,7 @@ }, { "cell_type": "markdown", - 
"metadata": { - "id": "UoJH3kA7RJpD" - }, + "metadata": {}, "source": [ "# Prepare TestDataset for Evaluation \n", "\n", @@ -317,10 +264,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "E5Tx0-Axarh2" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "news_test_dataset = spark.read \\\n", @@ -330,14 +275,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "msvqqFHOaL1m", - "outputId": "28a2c6d5-db54-4d8f-cc3e-91c85c688845" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -347,7 +286,7 @@ "Approximate size to download 923.7 MB\n", "[ | ]tfhub_use download started this may take some time.\n", "Approximate size to download 923.7 MB\n", - "[ | ]Download done! Loading the resource.\n", + "[ / ]Download done! Loading the resource.\n", "[OK!]\n" ] } @@ -368,14 +307,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "taJTgrntaz1Z", - "outputId": "91eac9f2-690e-4b6d-9232-357f026a30e4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -398,19 +331,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-MAxy_pVzZh-" - }, + "metadata": {}, "source": [ "Now, that out test dataset has the required embeddings, we save it as parquet and use it while training our ClassifierDL model." 
] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "SLDJCelza2Ac" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "test_dataset.write.parquet(\"./test_news.parquet\")" @@ -418,19 +347,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "f6Dhbj78a7qZ" - }, + "metadata": {}, "source": [ "Now let's train it and use a validation and the test dataset above for evaluation" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "H30A4FgNMF5t" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "classsifierdl = ClassifierDLApproach()\\\n", @@ -453,125 +378,119 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kia7NpRJMF5v", - "outputId": "5659d234-da46-4da7-8102-693c8225c688" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 96000 - classes: 4\n", - "Epoch 1/5 - 21.56s - loss: 1294.569 - acc: 0.8790208 - batches: 1500\n", + "Epoch 1/5 - 6.34s - loss: 1305.07 - acc: 0.8793854 - batches: 1500\n", "Quality on validation dataset (20.0%), validation examples = 24000\n", - "time to finish evaluation: 1.38s\n", + "time to finish evaluation: 0.44s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "Sci/Tech 5158\t 945\t 807\t 0.8451581\t 0.8647108\t 0.85482264\n", - "Business 5003\t 952\t 1004\t 0.8401343\t 0.83286166\t 0.83648217\n", - "Sports 5901\t 270\t 147\t 0.956247\t 0.9756944\t 0.9658728\n", - "World 5235\t 536\t 745\t 0.90712184\t 0.87541807\t 0.89098805\n", - "tp: 21297 fp: 2703 fn: 2703 labels: 4\n", - "Macro-average\t prec: 0.88716537, rec: 0.88717127, f1: 0.8871684\n", - "Micro-average\t prec: 0.887375, recall: 0.887375, f1: 0.887375\n", + "Sci/Tech 4907\t 753\t 1013\t 0.8669611\t 0.82888514\t 0.8474956\n", + "Business 
5201\t 1162\t 806\t 0.81738174\t 0.8658232\t 0.8409054\n", + "Sports 5891\t 350\t 118\t 0.94391924\t 0.9803628\t 0.96179587\n", + "World 5272\t 464\t 792\t 0.9191074\t 0.8693931\t 0.8935593\n", + "tp: 21271 fp: 2729 fn: 2729 labels: 4\n", + "Macro-average\t prec: 0.88684237, rec: 0.8861161, f1: 0.886479\n", + "Micro-average\t prec: 0.8862917, recall: 0.8862917, f1: 0.8862917\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.35s\n", + "time to finish evaluation: 0.07s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "Sci/Tech 1658\t 322\t 242\t 0.83737373\t 0.87263155\t 0.8546392\n", - "Business 1569\t 306\t 331\t 0.8368\t 0.82578945\t 0.8312583\n", - "Sports 1840\t 81\t 60\t 0.9578345\t 0.96842104\t 0.9630987\n", - "World 1661\t 163\t 239\t 0.91063595\t 0.87421054\t 0.8920516\n", - "tp: 6728 fp: 872 fn: 872 labels: 4\n", - "Macro-average\t prec: 0.885661, rec: 0.88526314, f1: 0.8854621\n", - "Micro-average\t prec: 0.88526314, recall: 0.88526314, f1: 0.88526314\n", - "Epoch 2/5 - 25.97s - loss: 1279.9918 - acc: 0.8916354 - batches: 1500\n", + "Sci/Tech 1579\t 247\t 321\t 0.86473167\t 0.83105266\t 0.84755766\n", + "Business 1631\t 390\t 269\t 0.8070262\t 0.858421\t 0.83193064\n", + "Sports 1858\t 100\t 42\t 0.94892746\t 0.9778947\t 0.96319336\n", + "World 1639\t 156\t 261\t 0.9130919\t 0.86263156\t 0.8871448\n", + "tp: 6707 fp: 893 fn: 893 labels: 4\n", + "Macro-average\t prec: 0.8834443, rec: 0.8825, f1: 0.88297194\n", + "Micro-average\t prec: 0.8825, recall: 0.8825, f1: 0.88249993\n", + "Epoch 2/5 - 5.87s - loss: 1282.2052 - acc: 0.8911354 - batches: 1500\n", "Quality on validation dataset (20.0%), validation examples = 24000\n", - "time to finish evaluation: 1.04s\n", + "time to finish evaluation: 0.23s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "Sci/Tech 5151\t 891\t 814\t 0.85253227\t 0.8635373\t 0.8579995\n", - "Business 4991\t 899\t 1016\t 0.8473684\t 0.830864\t 0.83903503\n", - "Sports 5917\t 273\t 131\t 0.9558966\t 0.97833997\t 
0.96698815\n", - "World 5294\t 584\t 686\t 0.9006465\t 0.8852843\t 0.89289933\n", - "tp: 21353 fp: 2647 fn: 2647 labels: 4\n", - "Macro-average\t prec: 0.8891109, rec: 0.88950634, f1: 0.88930863\n", - "Micro-average\t prec: 0.88970834, recall: 0.88970834, f1: 0.88970834\n", + "Sci/Tech 5165\t 975\t 755\t 0.84120524\t 0.8724662\t 0.8565506\n", + "Business 4938\t 802\t 1069\t 0.8602787\t 0.822041\t 0.84072536\n", + "Sports 5892\t 355\t 117\t 0.9431727\t 0.9805292\t 0.96148825\n", + "World 5345\t 528\t 719\t 0.91009706\t 0.8814314\t 0.89553493\n", + "tp: 21340 fp: 2660 fn: 2660 labels: 4\n", + "Macro-average\t prec: 0.88868845, rec: 0.88911694, f1: 0.88890266\n", + "Micro-average\t prec: 0.88916665, recall: 0.88916665, f1: 0.88916665\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.35s\n", + "time to finish evaluation: 0.06s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "Sci/Tech 1657\t 290\t 243\t 0.8510529\t 0.87210524\t 0.8614505\n", - "Business 1573\t 286\t 327\t 0.84615386\t 0.82789475\t 0.83692473\n", - "Sports 1851\t 87\t 49\t 0.95510834\t 0.9742105\t 0.96456486\n", - "World 1681\t 175\t 219\t 0.90571123\t 0.88473684\t 0.8951012\n", - "tp: 6762 fp: 838 fn: 838 labels: 4\n", - "Macro-average\t prec: 0.8895066, rec: 0.88973683, f1: 0.8896217\n", - "Micro-average\t prec: 0.88973683, recall: 0.88973683, f1: 0.88973683\n", - "Epoch 3/5 - 24.51s - loss: 1271.581 - acc: 0.89664584 - batches: 1500\n", + "Sci/Tech 1671\t 315\t 229\t 0.8413897\t 0.8794737\t 0.8600103\n", + "Business 1545\t 268\t 355\t 0.8521787\t 0.8131579\t 0.83221114\n", + "Sports 1860\t 108\t 40\t 0.94512194\t 0.97894734\t 0.9617373\n", + "World 1667\t 166\t 233\t 0.9094381\t 0.87736845\t 0.89311546\n", + "tp: 6743 fp: 857 fn: 857 labels: 4\n", + "Macro-average\t prec: 0.88703215, rec: 0.88723683, f1: 0.88713443\n", + "Micro-average\t prec: 0.88723683, recall: 0.88723683, f1: 0.88723683\n", + "Epoch 3/5 - 5.91s - loss: 1276.7024 - acc: 0.8959896 - batches: 1500\n", "Quality on 
validation dataset (20.0%), validation examples = 24000\n", - "time to finish evaluation: 1.13s\n", + "time to finish evaluation: 1.38s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "Sci/Tech 5219\t 923\t 746\t 0.8497232\t 0.8749371\t 0.86214584\n", - "Business 4989\t 863\t 1018\t 0.85252905\t 0.83053106\t 0.84138626\n", - "Sports 5930\t 280\t 118\t 0.9549114\t 0.98048943\t 0.9675314\n", - "World 5274\t 522\t 706\t 0.90993786\t 0.8819398\t 0.89572006\n", - "tp: 21412 fp: 2588 fn: 2588 labels: 4\n", - "Macro-average\t prec: 0.89177537, rec: 0.8919744, f1: 0.89187485\n", - "Micro-average\t prec: 0.8921667, recall: 0.8921667, f1: 0.8921667\n", + "Sci/Tech 5278\t 1094\t 642\t 0.8283114\t 0.89155406\t 0.85876995\n", + "Business 4908\t 762\t 1099\t 0.86560845\t 0.81704676\t 0.8406269\n", + "Sports 5901\t 364\t 108\t 0.9418994\t 0.98202693\t 0.96154475\n", + "World 5274\t 419\t 790\t 0.92640084\t 0.86972296\t 0.8971677\n", + "tp: 21361 fp: 2639 fn: 2639 labels: 4\n", + "Macro-average\t prec: 0.8905551, rec: 0.8900876, f1: 0.89032125\n", + "Micro-average\t prec: 0.89004165, recall: 0.89004165, f1: 0.89004165\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.36s\n", + "time to finish evaluation: 0.07s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "Sci/Tech 1673\t 302\t 227\t 0.84708863\t 0.8805263\t 0.8634839\n", - "Business 1569\t 275\t 331\t 0.8508677\t 0.82578945\t 0.83814096\n", - "Sports 1855\t 89\t 45\t 0.9542181\t 0.9763158\t 0.9651405\n", - "World 1673\t 164\t 227\t 0.910724\t 0.8805263\t 0.8953706\n", - "tp: 6770 fp: 830 fn: 830 labels: 4\n", - "Macro-average\t prec: 0.8907246, rec: 0.89078945, f1: 0.890757\n", - "Micro-average\t prec: 0.89078945, recall: 0.89078945, f1: 0.89078945\n", - "Epoch 4/5 - 24.66s - loss: 1271.1621 - acc: 0.9004167 - batches: 1500\n", + "Sci/Tech 1695\t 344\t 205\t 0.8312898\t 0.8921053\t 0.86062455\n", + "Business 1537\t 261\t 363\t 0.8548387\t 0.8089474\t 0.83126014\n", + "Sports 1862\t 110\t 38\t 0.94421905\t 
0.98\t 0.96177685\n", + "World 1645\t 146\t 255\t 0.9184813\t 0.8657895\t 0.89135736\n", + "tp: 6739 fp: 861 fn: 861 labels: 4\n", + "Macro-average\t prec: 0.88720727, rec: 0.8867105, f1: 0.88695884\n", + "Micro-average\t prec: 0.8867105, recall: 0.8867105, f1: 0.8867105\n", + "Epoch 4/5 - 5.91s - loss: 1269.6151 - acc: 0.89971876 - batches: 1500\n", "Quality on validation dataset (20.0%), validation examples = 24000\n", - "time to finish evaluation: 1.04s\n", + "time to finish evaluation: 0.23s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "Sci/Tech 5224\t 896\t 741\t 0.8535948\t 0.87577534\t 0.86454284\n", - "Business 4994\t 851\t 1013\t 0.85440546\t 0.8313634\t 0.84272695\n", - "Sports 5927\t 271\t 121\t 0.95627624\t 0.9799934\t 0.96798956\n", - "World 5293\t 544\t 687\t 0.90680146\t 0.88511705\t 0.895828\n", - "tp: 21438 fp: 2562 fn: 2562 labels: 4\n", - "Macro-average\t prec: 0.89276946, rec: 0.8930623, f1: 0.89291584\n", - "Micro-average\t prec: 0.89325, recall: 0.89325, f1: 0.89325\n", + "Sci/Tech 5327\t 1144\t 593\t 0.82321125\t 0.89983106\t 0.8598176\n", + "Business 4927\t 762\t 1080\t 0.8660573\t 0.82020974\t 0.8425102\n", + "Sports 5890\t 316\t 119\t 0.94908154\t 0.98019636\t 0.9643881\n", + "World 5261\t 373\t 803\t 0.9337948\t 0.86757916\t 0.89947\n", + "tp: 21405 fp: 2595 fn: 2595 labels: 4\n", + "Macro-average\t prec: 0.8930362, rec: 0.8919541, f1: 0.8924948\n", + "Micro-average\t prec: 0.891875, recall: 0.891875, f1: 0.891875\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.34s\n", + "time to finish evaluation: 0.07s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "Sci/Tech 1667\t 291\t 233\t 0.851379\t 0.87736845\t 0.86417836\n", - "Business 1572\t 278\t 328\t 0.8497297\t 0.82736844\t 0.8384\n", - "Sports 1854\t 86\t 46\t 0.9556701\t 0.9757895\t 0.965625\n", - "World 1680\t 172\t 220\t 0.90712744\t 0.8842105\t 0.8955224\n", - "tp: 6773 fp: 827 fn: 827 labels: 4\n", - "Macro-average\t prec: 0.89097655, rec: 0.8911843, f1: 
0.89108044\n", - "Micro-average\t prec: 0.8911842, recall: 0.8911842, f1: 0.8911842\n", - "Epoch 5/5 - 23.62s - loss: 1266.5956 - acc: 0.90358335 - batches: 1500\n", + "Sci/Tech 1704\t 359\t 196\t 0.82598156\t 0.8968421\t 0.8599546\n", + "Business 1542\t 264\t 358\t 0.8538206\t 0.8115789\t 0.83216405\n", + "Sports 1860\t 99\t 40\t 0.949464\t 0.97894734\t 0.9639803\n", + "World 1639\t 133\t 261\t 0.92494357\t 0.86263156\t 0.89270157\n", + "tp: 6745 fp: 855 fn: 855 labels: 4\n", + "Macro-average\t prec: 0.8885524, rec: 0.8875, f1: 0.8880258\n", + "Micro-average\t prec: 0.8875, recall: 0.8875, f1: 0.8875\n", + "Epoch 5/5 - 5.82s - loss: 1267.1547 - acc: 0.9025625 - batches: 1500\n", "Quality on validation dataset (20.0%), validation examples = 24000\n", - "time to finish evaluation: 1.07s\n", + "time to finish evaluation: 0.24s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "Sci/Tech 5193\t 864\t 772\t 0.8573551\t 0.87057835\t 0.86391616\n", - "Business 5052\t 898\t 955\t 0.8490756\t 0.8410188\t 0.845028\n", - "Sports 5919\t 266\t 129\t 0.95699275\t 0.97867066\t 0.96771026\n", - "World 5278\t 530\t 702\t 0.90874654\t 0.8826087\t 0.89548695\n", - "tp: 21442 fp: 2558 fn: 2558 labels: 4\n", - "Macro-average\t prec: 0.8930425, rec: 0.8932191, f1: 0.89313084\n", - "Micro-average\t prec: 0.89341664, recall: 0.89341664, f1: 0.89341664\n", + "Sci/Tech 5337\t 1168\t 583\t 0.82044584\t 0.90152025\t 0.8590745\n", + "Business 4907\t 734\t 1100\t 0.8698812\t 0.8168803\t 0.8425481\n", + "Sports 5891\t 318\t 118\t 0.948784\t 0.9803628\t 0.96431494\n", + "World 5269\t 376\t 795\t 0.9333924\t 0.8688984\t 0.89999145\n", + "tp: 21404 fp: 2596 fn: 2596 labels: 4\n", + "Macro-average\t prec: 0.8931259, rec: 0.89191544, f1: 0.89252025\n", + "Micro-average\t prec: 0.8918333, recall: 0.8918333, f1: 0.8918333\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.34s\n", + "time to finish evaluation: 0.06s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "Sci/Tech 1662\t 283\t 
238\t 0.85449874\t 0.87473685\t 0.8644994\n", - "Business 1591\t 295\t 309\t 0.8435843\t 0.8373684\t 0.84046483\n", - "Sports 1852\t 83\t 48\t 0.95710593\t 0.97473687\t 0.96584094\n", - "World 1672\t 162\t 228\t 0.9116685\t 0.88\t 0.8955544\n", - "tp: 6777 fp: 823 fn: 823 labels: 4\n", - "Macro-average\t prec: 0.8917144, rec: 0.8917105, f1: 0.8917125\n", - "Micro-average\t prec: 0.8917105, recall: 0.8917105, f1: 0.8917105\n" + "Sci/Tech 1711\t 357\t 189\t 0.82736945\t 0.90052634\t 0.8623993\n", + "Business 1537\t 254\t 363\t 0.8581798\t 0.8089474\t 0.8328367\n", + "Sports 1862\t 101\t 38\t 0.94854814\t 0.98\t 0.9640176\n", + "World 1645\t 133\t 255\t 0.9251968\t 0.8657895\t 0.8945079\n", + "tp: 6755 fp: 845 fn: 845 labels: 4\n", + "Macro-average\t prec: 0.88982356, rec: 0.88881576, f1: 0.88931936\n", + "Micro-average\t prec: 0.88881576, recall: 0.88881576, f1: 0.88881576\n" ] } ], @@ -581,9 +500,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "zMLuwQSCB05e" - }, + "metadata": {}, "source": [ "# How to use already trained ClassifierDL pipeline or its model\n", "\n", @@ -594,19 +511,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "4I7COUCPCPe3" - }, + "metadata": {}, "source": [ "## Save and load pre-trained ClassifierDL pipeline" ] }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "id": "QTDQ3riLD-zW" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Google Colab is free so it comes with a little memory. 
\n", @@ -619,45 +532,32 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "TI9JR8AoLbW3" - }, + "metadata": {}, "source": [ "# Save and load pre-trained ClassifierDL model" ] }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "id": "3r3_q4CJLkZR" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# dbfs:/ or hdfs:/ if you are saving it on distributed file systems\n", - "pipelineModel.stages[-1].write().overwrite().save('./tmp_classifierDL_model')\n", - "\n" + "pipelineModel.stages[-1].write().overwrite().save('./tmp_classifierDL_model')" ] }, { "cell_type": "markdown", - "metadata": { - "id": "3JaclNFsQJ-X" - }, + "metadata": {}, "source": [ "Let's use our pre-trained ClassifierDLModel in a pipeline: " ] }, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "NTJ53PbYQI-f", - "outputId": "1d97b123-f050-4b5a-c18f-362578ff6365" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -694,19 +594,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "VOrjIsKXHea8" - }, + "metadata": {}, "source": [ "Now let's load it back so we can have prediction all together with everything in that pipeline:" ] }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "id": "ccy54HeERCZ1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from pyspark.sql.types import StringType\n", @@ -719,10 +615,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "id": "3BsNAWS4VRkd" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "prediction = pipeline.fit(dfTest).transform(dfTest)" @@ -730,14 +624,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nz32PDjEVUTk", - "outputId": "de3fe9e8-fa6c-4276-c441-918e6f56a960" - }, + "execution_count": null, + 
"metadata": {}, "outputs": [ { "name": "stdout", @@ -750,12 +638,12 @@ "|[Sci/Tech]|\n", "+----------+\n", "\n", - "+-----------------------------------------------------------------------------------------------------------------+\n", - "|metadata |\n", - "+-----------------------------------------------------------------------------------------------------------------+\n", - "|[{Sports -> 2.753349E-6, Business -> 0.99998844, World -> 6.6571633E-6, Sci/Tech -> 2.1566113E-6, sentence -> 0}]|\n", - "|[{Sports -> 1.4710765E-14, Business -> 1.1435716E-13, World -> 2.8883496E-13, Sci/Tech -> 1.0, sentence -> 0}] |\n", - "+-----------------------------------------------------------------------------------------------------------------+\n", + "+------------------------------------------------------------------------------------------------------------------+\n", + "|metadata |\n", + "+------------------------------------------------------------------------------------------------------------------+\n", + "|[{Sports -> 1.09076216E-4, Business -> 0.9996996, World -> 1.05234445E-4, Sci/Tech -> 8.617702E-5, sentence -> 0}]|\n", + "|[{Sports -> 8.139581E-19, Business -> 2.8369764E-17, World -> 3.814643E-17, Sci/Tech -> 1.0, sentence -> 0}] |\n", + "+------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } @@ -787,8 +675,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" }, "nteract": { "version": "0.21.0" diff --git a/examples/python/training/english/classification/ClassifierDL_Train_multi_class_news_category_classifier.ipynb b/examples/python/training/english/classification/ClassifierDL_Train_multi_class_news_category_classifier.ipynb index c2dd5e13badc97..79a96ef2652cf5 100644 --- 
a/examples/python/training/english/classification/ClassifierDL_Train_multi_class_news_category_classifier.ipynb +++ b/examples/python/training/english/classification/ClassifierDL_Train_multi_class_news_category_classifier.ipynb @@ -1,59 +1,28 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ph3bDypIEXdd" - }, - "source": [ - "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "aaVmDt1TEXdh" - }, - "source": [ - "# Spark NLP\n", - "### Multi-class Text Classification\n", - "#### By using ClassifierDL" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "jmo3o-b3MF5W" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/ClassifierDL_Train_multi_class_news_category_classifier.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/ClassifierDL_Train_multi_class_news_category_classifier.ipynb)\n", + "\n", + "# Multi-class Text Classification using ClassifierDL" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "h4fQwZ46x4fu" - }, + "metadata": {}, "source": [ "Only run this block if you are inside Google Colab otherwise skip it" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "MzishpT-MF5X", - "outputId": "6fbc0282-277b-4afc-993c-89f3d633c4b4" - }, + "execution_count": null, + "metadata": {}, 
"outputs": [ { "name": "stdout", @@ -66,29 +35,21 @@ } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "Hmvv5Q4jMF5b", - "outputId": "f4d57658-0eb1-4cf2-d083-45a4e0714470" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 2.5.1\n", - "Apache Spark version; 2.4.4\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version; 3.3.0\n" ] } ], @@ -103,41 +64,31 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "xKQcm8R6MF5e" - }, + "metadata": {}, "source": [ "Let's download news category dataset for training our text classifier" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 204 - }, - "colab_type": "code", - "id": "W0FkrTb4MF5f", - "outputId": "948e3dcd-036c-4d4f-8d90-045220ae1c98" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2020-06-01 14:03:51-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_train.csv\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.108.53\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.108.53|:443... connected.\n", + "--2023-02-20 17:39:41-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_train.csv\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 
52.216.140.206, 52.216.147.5, 54.231.204.0, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.140.206|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 24032125 (23M) [text/csv]\n", "Saving to: ‘news_category_train.csv’\n", "\n", - "news_category_train 100%[===================>] 22.92M 52.0MB/s in 0.4s \n", + "news_category_train 100%[===================>] 22,92M 10,2MB/s in 2,3s \n", "\n", - "2020-06-01 14:03:52 (52.0 MB/s) - ‘news_category_train.csv’ saved [24032125/24032125]\n", + "2023-02-20 17:39:44 (10,2 MB/s) - ‘news_category_train.csv’ saved [24032125/24032125]\n", "\n" ] } @@ -148,31 +99,24 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 204 - }, - "colab_type": "code", - "id": "QDIQgMv6tuqu", - "outputId": "05ac2eba-1bda-4199-e748-e1e5a3c49cc8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2020-06-01 14:03:53-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_test.csv\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.171.149\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.171.149|:443... connected.\n", + "--2023-02-20 17:39:46-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_test.csv\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.131.61, 52.217.231.232, 54.231.235.104, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.131.61|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1504408 (1.4M) [text/csv]\n", + "Length: 1504408 (1,4M) [text/csv]\n", "Saving to: ‘news_category_test.csv’\n", "\n", - "news_category_test. 
100%[===================>] 1.43M --.-KB/s in 0.08s \n", + "news_category_test. 100%[===================>] 1,43M 2,10MB/s in 0,7s \n", "\n", - "2020-06-01 14:03:53 (18.2 MB/s) - ‘news_category_test.csv’ saved [1504408/1504408]\n", + "2023-02-20 17:39:47 (2,10 MB/s) - ‘news_category_test.csv’ saved [1504408/1504408]\n", "\n" ] } @@ -183,16 +127,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 207 - }, - "colab_type": "code", - "id": "QYolNmBtMF5h", - "outputId": "833fb436-b15c-4ca4-ba35-064ef9cfba1c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -217,10 +153,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "zWFUDI6jMF5k" - }, + "metadata": {}, "source": [ "The content is inside `description` column and the labels are inside `category` column" ] @@ -228,11 +161,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "S9TRr7iAMF5l" - }, + "metadata": {}, "outputs": [], "source": [ "trainDataset = spark.read \\\n", @@ -242,16 +171,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 459 - }, - "colab_type": "code", - "id": "nURzgFJ7MF5o", - "outputId": "59fb0534-d38c-4a16-dbd5-6a3183ecd679" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -292,16 +213,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "colab_type": "code", - "id": "5NIHJuVKx4gk", - "outputId": "1d75a4d1-1ee5-4411-9c21-b0d9e932bd4e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -309,10 +222,8 @@ "120000" ] }, - "execution_count": 8, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" 
} ], @@ -323,11 +234,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "0oKvNZaEMF5q" - }, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", @@ -339,16 +246,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "H30A4FgNMF5t", - "outputId": "d044723f-48ec-40e3-b8cd-c6454e0a02f3" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -389,11 +288,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "kia7NpRJMF5v" - }, + "metadata": {}, "outputs": [], "source": [ "pipelineModel = pipeline.fit(trainDataset)" @@ -401,23 +296,51 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "id": "TOLU30ilMF5y", - "outputId": "74177e85-128f-4ae5-b38e-2e7244fe3b3f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "total 4\n", - "-rw-r--r-- 1 root root 529 Jun 1 14:11 ClassifierDLApproach_d82e68030034.log\n" + "total 148\n", + "-rw-r--r-- 1 root root 456 20. Feb 17:41 ClassifierDLApproach_0375e3a8df00.log\n", + "-rw-r--r-- 1 root root 918 20. Feb 17:38 ClassifierDLApproach_6fdb8a569309.log\n", + "-rw-r--r-- 1 root root 446 20. Feb 15:55 ClassifierDLApproach_97ff5c76d735.log\n", + "-rw-r--r-- 1 root root 438 20. Feb 17:38 ClassifierMetrics_09bd6fa2ecf7.log\n", + "-rw-r--r-- 1 root root 317 10. Feb 16:54 ClassifierMetrics_17606bbb7d1f.log\n", + "-rw-r--r-- 1 root root 313 10. Feb 16:54 ClassifierMetrics_1a6c515483ae.log\n", + "-rw-r--r-- 1 root root 441 20. Feb 17:38 ClassifierMetrics_1e0c8ea78e67.log\n", + "-rw-r--r-- 1 root root 323 10. 
Feb 16:54 ClassifierMetrics_2530315112a8.log\n", + "-rw-r--r-- 1 root root 445 20. Feb 17:38 ClassifierMetrics_2b7b458fc84d.log\n", + "-rw-r--r-- 1 root root 314 10. Feb 16:54 ClassifierMetrics_3ccf43933a23.log\n", + "-rw-r--r-- 1 root root 449 20. Feb 17:38 ClassifierMetrics_4a2e4a7dac7c.log\n", + "-rw-r--r-- 1 root root 325 10. Feb 16:54 ClassifierMetrics_55c7e364bf2b.log\n", + "-rw-r--r-- 1 root root 426 20. Feb 17:38 ClassifierMetrics_73fa92fe4be8.log\n", + "-rw-r--r-- 1 root root 433 20. Feb 17:38 ClassifierMetrics_7764aa9b23e3.log\n", + "-rw-r--r-- 1 root root 445 20. Feb 17:38 ClassifierMetrics_890dcfe0db80.log\n", + "-rw-r--r-- 1 root root 444 20. Feb 17:38 ClassifierMetrics_8ecc3f83e12d.log\n", + "-rw-r--r-- 1 root root 325 10. Feb 16:54 ClassifierMetrics_9290b613e8d7.log\n", + "-rw-r--r-- 1 root root 317 10. Feb 16:54 ClassifierMetrics_aa0e2812a3b9.log\n", + "-rw-r--r-- 1 root root 318 10. Feb 16:54 ClassifierMetrics_ad4cb4a650fa.log\n", + "-rw-r--r-- 1 root root 452 20. Feb 17:38 ClassifierMetrics_e0da6952b2c6.log\n", + "-rw-r--r-- 1 root root 312 10. Feb 16:54 ClassifierMetrics_efc7f6345e79.log\n", + "-rw-r--r-- 1 root root 319 10. Feb 16:54 ClassifierMetrics_f571876aaa09.log\n", + "-rw-r--r-- 1 root root 436 20. Feb 17:38 ClassifierMetrics_fdc5fa307baf.log\n", + "-rw-r--r-- 1 root root 320 26. Okt 09:23 NerDL_0f47f69f09e6.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_10e337c8a3ef.log\n", + "-rw-r--r-- 1 root root 320 12. Jan 17:31 NerDL_18e7b1673dab.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_27f18f749174.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_3ae0321ce66a.log\n", + "-rw-r--r-- 1 root root 319 26. Okt 09:13 NerDL_568d747656b8.log\n", + "-rw-r--r-- 1 root root 320 26. Okt 09:03 NerDL_5970e276422f.log\n", + "-rw-r--r-- 1 root root 320 16. Jan 11:10 NerDL_759a68c3769d.log\n", + "-rw-r--r-- 1 root root 320 3. Nov 19:22 NerDL_891f9b941985.log\n", + "-rw-r--r-- 1 root root 320 2. 
Feb 2022 NerDL_8e8184f259cb.log\n", + "-rw-r--r-- 1 root root 320 27. Okt 13:02 NerDL_add5b34b2ecb.log\n", + "-rw-r--r-- 1 root root 320 21. Okt 19:06 NerDL_bc57a96c68c3.log\n", + "-rw-r--r-- 1 root root 320 12. Jan 16:47 NerDL_ff0a43f20378.log\n", + "-rw-r--r-- 1 root root 897 10. Feb 16:54 SentimentDLApproach_98dfd2c1fdee.log\n" ] } ], @@ -427,27 +350,14 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "colab_type": "code", - "id": "c6TAuRqBNs4_", - "outputId": "527beaf3-30ba-4be5-af35-57aa89963731" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Training started - total epochs: 5 - learning rate: 0.005 - batch size: 64 - training examples: 120000\n", - "Epoch 0/5 - 35.876454147%.2fs - loss: 1588.0098 - accuracy: 0.8812917 - batches: 1875\n", - "Epoch 1/5 - 34.984400619%.2fs - loss: 1569.1891 - accuracy: 0.892 - batches: 1875\n", - "Epoch 2/5 - 34.980620721%.2fs - loss: 1560.8793 - accuracy: 0.8966333 - batches: 1875\n", - "Epoch 3/5 - 34.97171791%.2fs - loss: 1556.4751 - accuracy: 0.9005917 - batches: 1875\n", - "Epoch 4/5 - 35.060583703%.2fs - loss: 1550.6415 - accuracy: 0.90370834 - batches: 1875\n" + "cat: /home/root/annotator_logs/ClassifierDLApproach_d82e68030034.log: No such file or directory\n" ] } ], @@ -457,10 +367,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "zMLuwQSCB05e" - }, + "metadata": {}, "source": [ "# How to use already trained ClassifierDL pipeline or its model\n", "\n", @@ -471,10 +378,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "4I7COUCPCPe3" - }, + "metadata": {}, "source": [ "## Save and load pre-trained ClassifierDL pipeline" ] @@ -482,11 +386,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "QTDQ3riLD-zW" - }, + "metadata": {}, 
"outputs": [], "source": [ "# Google Colab is free so it comes with a little memory. \n", @@ -499,10 +399,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "TI9JR8AoLbW3" - }, + "metadata": {}, "source": [ "# Save and load pre-trained ClassifierDL model" ] @@ -510,11 +407,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "3r3_q4CJLkZR" - }, + "metadata": {}, "outputs": [], "source": [ "# dbfs:/ or hdfs:/ if you are saving it on distributed file systems\n", @@ -524,26 +417,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "3JaclNFsQJ-X" - }, + "metadata": {}, "source": [ "Let's use our pre-trained ClassifierDLModel in a pipeline: " ] }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "NTJ53PbYQI-f", - "outputId": "07177371-a8ed-4cd4-ac9d-8de2058a01fd" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -580,10 +462,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "VOrjIsKXHea8" - }, + "metadata": {}, "source": [ "Now let's load it back so we can have prediction all together with everything in that pipeline:" ] @@ -591,11 +470,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "ccy54HeERCZ1" - }, + "metadata": {}, "outputs": [], "source": [ "from pyspark.sql.types import StringType\n", @@ -609,11 +484,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "3BsNAWS4VRkd" - }, + "metadata": {}, "outputs": [], "source": [ "prediction = pipeline.fit(dfTest).transform(dfTest)" @@ -621,16 +492,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 255 
- }, - "colab_type": "code", - "id": "nz32PDjEVUTk", - "outputId": "b4c6ad7d-4fca-4e64-e665-1bc918109297" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -643,12 +506,12 @@ "|[Sci/Tech]|\n", "+----------+\n", "\n", - "+-----------------------------------------------------------------------------------------------------------------+\n", - "|metadata |\n", - "+-----------------------------------------------------------------------------------------------------------------+\n", - "|[[Sports -> 9.916687E-8, Business -> 0.99999917, World -> 4.8718215E-7, Sci/Tech -> 2.1577937E-7, sentence -> 0]]|\n", - "|[[Sports -> 9.949142E-13, Business -> 2.2465226E-11, World -> 6.4324095E-11, Sci/Tech -> 1.0, sentence -> 0]] |\n", - "+-----------------------------------------------------------------------------------------------------------------+\n", + "+----------------------------------------------------------------------------------------------------------------+\n", + "|metadata |\n", + "+----------------------------------------------------------------------------------------------------------------+\n", + "|[{Sports -> 9.7500305E-8, Business -> 0.9999994, World -> 3.060714E-7, Sci/Tech -> 1.7089135E-7, sentence -> 0}]|\n", + "|[{Sports -> 5.5346327E-15, Business -> 1.8843019E-13, World -> 8.666048E-14, Sci/Tech -> 1.0, sentence -> 0}] |\n", + "+----------------------------------------------------------------------------------------------------------------+\n", "\n" ] } @@ -661,10 +524,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "UoJH3kA7RJpD" - }, + "metadata": {}, "source": [ "# Evaluation \n", "\n", @@ -674,11 +534,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "5HkV5BAiWPAo" - }, + "metadata": {}, "outputs": [], "source": [ "testDataset = spark.read \\\n", @@ -689,11 +545,7 @@ { "cell_type": "code", "execution_count": null, - 
"metadata": { - "colab": {}, - "colab_type": "code", - "id": "_aVPZXgst0-V" - }, + "metadata": {}, "outputs": [], "source": [ "preds = pipelineModel.transform(testDataset)" @@ -701,16 +553,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 969 - }, - "colab_type": "code", - "id": "-H9UAWO_t-b9", - "outputId": "eaa3dacb-fbf3-4125-a915-0e6c29c4f59e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -724,7 +568,7 @@ "|Sci/Tech| A company founded by a chemistry researcher at...|[Sci/Tech]|\n", "|Sci/Tech| It's barely dawn when Mike Fitzpatrick starts ...|[Sci/Tech]|\n", "|Sci/Tech| Southern California's smog fighting agency wen...|[Sci/Tech]|\n", - "|Sci/Tech|\"The British Department for Education and Skill...|[Sci/Tech]|\n", + "|Sci/Tech|\"The British Department for Education and Skill...| [World]|\n", "|Sci/Tech|\"confessed author of the Netsky and Sasser viru...|[Sci/Tech]|\n", "|Sci/Tech|\\\\FOAF/LOAF and bloom filters have a lot of in...|[Sci/Tech]|\n", "|Sci/Tech|\"Wiltshire Police warns about \"\"phishing\"\" afte...|[Sci/Tech]|\n", @@ -742,7 +586,7 @@ "|Sci/Tech|By the end of the year, the computing giant pla...|[Sci/Tech]|\n", "|Sci/Tech|Developers get early code for new operating sys...|[Sci/Tech]|\n", "|Sci/Tech|New technology applies electrical fuses to help...|[Sci/Tech]|\n", - "|Sci/Tech|Google has billed its IPO as a way for everyday...|[Sci/Tech]|\n", + "|Sci/Tech|Google has billed its IPO as a way for everyday...|[Business]|\n", "|Sci/Tech|By MICHAEL LIEDTKE SAN FRANCISCO (AP) -- Wi...|[Business]|\n", "|Sci/Tech|Industry cyber security standards fail to reach...|[Sci/Tech]|\n", "| Sports|Michael Phelps won the gold medal in the 400 in...| [Sports]|\n", @@ -782,11 +626,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "8-JF5_Y9uPFj" - }, + "metadata": {}, "outputs": 
[], "source": [ "preds_df = preds.select('category','description',\"class.result\").toPandas()" @@ -795,11 +635,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "CS2q_OajuZyO" - }, + "metadata": {}, "outputs": [], "source": [ "# The result is an array since in Spark NLP you can have multiple sentences.\n", @@ -812,11 +648,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "naAHGWV5ugNX" - }, + "metadata": {}, "outputs": [], "source": [ "# We are going to use sklearn to evalute the results on test dataset\n", @@ -825,26 +657,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "o2BiHF_sR3Cz" - }, + "metadata": {}, "source": [ "Let's use `classification_report` from `sklearn` to evaluate the final scores. (keep in mind due to limited resources on a free Google Colab we only used 5 Epochs :)" ] }, { "cell_type": "code", - "execution_count": 34, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 204 - }, - "colab_type": "code", - "id": "kLeO9u1bunPB", - "outputId": "d72ac4ac-754e-409d-8a99-088fc1838712" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -852,13 +673,13 @@ "text": [ " precision recall f1-score support\n", "\n", - " Business 0.82 0.87 0.84 1801\n", - " Sci/Tech 0.90 0.84 0.87 2036\n", - " Sports 0.98 0.95 0.97 1957\n", - " World 0.87 0.92 0.90 1806\n", + " Business 0.84 0.85 0.85 1860\n", + " Sci/Tech 0.89 0.84 0.87 2001\n", + " Sports 0.98 0.96 0.97 1944\n", + " World 0.87 0.92 0.90 1795\n", "\n", " accuracy 0.89 7600\n", - " macro avg 0.89 0.89 0.89 7600\n", + " macro avg 0.89 0.90 0.89 7600\n", "weighted avg 0.90 0.89 0.89 7600\n", "\n" ] @@ -867,17 +688,6 @@ "source": [ "print (classification_report(preds_df['result'], preds_df['category']))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - 
"colab": {}, - "colab_type": "code", - "id": "3Zlwshvwx4hu" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -901,8 +711,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" }, "nteract": { "version": "0.21.0" diff --git a/examples/python/training/english/classification/MultiClassifierDL_Train_and_Evaluate.ipynb b/examples/python/training/english/classification/MultiClassifierDL_Train_and_Evaluate.ipynb index 9fdd5125ce2ab8..861c6245ab20c0 100644 --- a/examples/python/training/english/classification/MultiClassifierDL_Train_and_Evaluate.ipynb +++ b/examples/python/training/english/classification/MultiClassifierDL_Train_and_Evaluate.ipynb @@ -1,25 +1,15 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "aaVmDt1TEXdh" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", - "# Spark NLP\n", - "## Multi-label Text Classification\n", - "### Toxic Comments\n", - "#### By using MultiClassifierDL" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jSgSzQsusNIQ" - }, - "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/MultiClassifierDL_Train_and_Evaluate.ipynb)" + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/MultiClassifierDL_Train_and_Evaluate.ipynb)\n", + "\n", + "# Multi-label Text Classification of Toxic Comments using MultiClassifierDL" ] }, { @@ -28,29 +18,21 @@ "metadata": {}, "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark 
NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Uuwsnj7VsXm3" - }, + "metadata": {}, "source": [ "Let's download our Toxic comments for tarining and testing:" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Q7me57t41nSe", - "outputId": "4ae5d238-d8dd-4bad-a4ff-d89cd0adcf44" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -58,7 +40,7 @@ "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", - "100 2702k 100 2702k 0 0 1017k 0 0:00:02 0:00:02 --:--:-- 1016k\n" + "100 2702k 100 2702k 0 0 1699k 0 0:00:01 0:00:01 --:--:-- 1699k\n" ] } ], @@ -68,14 +50,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Dc8yON7W1qAq", - "outputId": "f55c2d0e-9d88-439d-ce2c-3390d0e4c456" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -83,7 +59,7 @@ "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", - "100 289k 100 289k 0 0 148k 0 0:00:01 0:00:01 --:--:-- 148k\n" + "100 289k 100 289k 0 0 249k 0 0:00:01 0:00:01 --:--:-- 249k\n" ] } ], @@ -93,24 +69,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "haHMQFsr6ZUt" - }, + "metadata": {}, "source": [ "In this notebook we are going to check the training logs on the fly. 
Thus, we start a session with real_time_output=True" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 53 - }, - "id": "Hmvv5Q4jMF5b", - "outputId": "0e3d7c52-9f76-4e5c-be7c-0636d850b6f3" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -121,16 +88,21 @@ }, { "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, "text/plain": [ - "'4.1.0'" + "'4.3.1'" ] }, - "execution_count": 7, + "execution_count": null, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "23/02/20 17:43:36 WARN Utils: Your hostname, duc-manjaro resolves to a loopback address: 127.0.1.1; using 192.168.0.34 instead (on interface enp3s0)\n", + "23/02/20 17:43:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n" + ] } ], "source": [ @@ -143,35 +115,25 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "pZUlTYncseVF" - }, + "metadata": {}, "source": [ "Let's read our Toxi comments datasets:" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "S9TRr7iAMF5l" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "train_dataset = spark.read.parquet(\"/content/toxic_train.snappy.parquet\").repartition(120)\n", - "toxic_test_dataset = spark.read.parquet(\"/content/toxic_test.snappy.parquet\").repartition(10)" + "train_dataset = spark.read.parquet(\"toxic_train.snappy.parquet\").repartition(120)\n", + "toxic_test_dataset = spark.read.parquet(\"toxic_test.snappy.parquet\").repartition(10)" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nURzgFJ7MF5o", - "outputId": "5f7ce400-b6e5-4a9b-ec4e-7dc074099ad1" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -194,23 +156,15 @@ }, { "cell_type": 
"markdown", - "metadata": { - "id": "aQa57ITfslQr" - }, + "metadata": {}, "source": [ "As you can see, there are lots of new lines in our comments which we can fix them with `DocumentAssembler`" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "iVHvdF481OCG", - "outputId": "532b5560-387a-44e1-9c94-47cee4cdcb31" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -228,9 +182,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_kMkE9zd6sU3" - }, + "metadata": {}, "source": [ "# Evaluation \n", "\n", @@ -239,10 +191,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "0oKvNZaEMF5q" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", @@ -254,14 +204,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "uaMefWq667D5", - "outputId": "d53252d7-57ca-44dd-bf35-8946b7d03964" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -269,6 +213,9 @@ "text": [ "tfhub_use download started this may take some time.\n", "Approximate size to download 923.7 MB\n", + "[ | ]tfhub_use download started this may take some time.\n", + "Approximate size to download 923.7 MB\n", + "[ / ]Download done! 
Loading the resource.\n", "[OK!]\n" ] } @@ -292,14 +239,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "231I290d7Jtg", - "outputId": "1acb5c7e-0fbf-4705-9839-c8771031743a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -322,19 +263,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "i4hN4TgG7LHT" - }, + "metadata": {}, "source": [ "Now, that out test dataset has the required embeddings, we save it as parquet and use it while training our MultiClassifierDL model." ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "iXYZwA567Ps9" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "test_dataset.write.parquet(\"./toxic_test.parquet\")" @@ -342,19 +279,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gcuUw8Ck7ZPS" - }, + "metadata": {}, "source": [ "Now let's train it and use a validation and the test dataset above for evaluation" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "H30A4FgNMF5t" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# We will use MultiClassifierDL built by using Bidirectional GRU and CNNs inside TensorFlow that supports up to 100 classes\n", @@ -383,23 +316,17 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kia7NpRJMF5v", - "outputId": "51fe67b9-31eb-4f68-fa04-57899637c432" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training started - epochs: 5 - learning_rate: 0.001 - batch_size: 128 - training_examples: 13158 - classes: 6\n", - "Epoch 1/5 - 5.53s - loss: 0.38046357 - acc: 0.848714 - batches: 103\n", + "Epoch 1/5 - 4.34s - loss: 0.38046357 - acc: 0.848714 - batches: 103\n", "Quality on validation dataset (10.0%), validation 
examples = 1462 \n", - "time to finish evaluation: 1.66s\n", + "time to finish evaluation: 2.05s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", "toxic 1385\t 77\t 0\t 0.94733244\t 1.0\t 0.97295403\n", "threat 0\t 0\t 47\t 0.0\t 0.0\t 0.0\n", @@ -411,7 +338,7 @@ "Macro-average\t prec: 0.63539594, rec: 0.46277428, f1: 0.5355179\n", "Micro-average\t prec: 0.85334265, recall: 0.77470607, f1: 0.81212527\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.26s\n", + "time to finish evaluation: 0.08s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", "toxic 1504\t 101\t 0\t 0.9370716\t 1.0\t 0.9675137\n", "threat 0\t 1\t 50\t 0.0\t 0.0\t 0.0\n", @@ -422,9 +349,9 @@ "tp: 2612 fp: 498 fn: 801 labels: 6\n", "Macro-average\t prec: 0.59864384, rec: 0.4619752, f1: 0.52150416\n", "Micro-average\t prec: 0.8398714, recall: 0.7653091, f1: 0.8008585\n", - "Epoch 2/5 - 3.11s - loss: 0.30138606 - acc: 0.87715614 - batches: 103\n", + "Epoch 2/5 - 1.16s - loss: 0.30138606 - acc: 0.87715614 - batches: 103\n", "Quality on validation dataset (10.0%), validation examples = 1462 \n", - "time to finish evaluation: 0.21s\n", + "time to finish evaluation: 0.11s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", "toxic 1385\t 77\t 0\t 0.94733244\t 1.0\t 0.97295403\n", "threat 4\t 0\t 43\t 1.0\t 0.08510638\t 0.15686274\n", @@ -436,7 +363,7 @@ "Macro-average\t prec: 0.7648964, rec: 0.5112443, f1: 0.61286175\n", "Micro-average\t prec: 0.85131896, recall: 0.7896409, f1: 0.8193208\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.21s\n", + "time to finish evaluation: 0.08s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", "toxic 1504\t 101\t 0\t 0.9370716\t 1.0\t 0.9675137\n", "threat 4\t 2\t 46\t 0.6666667\t 0.08\t 0.14285715\n", @@ -447,9 +374,9 @@ "tp: 2644 fp: 502 fn: 769 labels: 6\n", "Macro-average\t prec: 0.6991777, rec: 0.49554834, f1: 0.5800097\n", "Micro-average\t prec: 0.8404323, recall: 0.774685, f1: 0.8062205\n", - "Epoch 3/5 - 3.10s - loss: 0.29324573 - acc: 0.87968993 - 
batches: 103\n", + "Epoch 3/5 - 1.08s - loss: 0.29324576 - acc: 0.87968993 - batches: 103\n", "Quality on validation dataset (10.0%), validation examples = 1462 \n", - "time to finish evaluation: 0.19s\n", + "time to finish evaluation: 0.08s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", "toxic 1385\t 77\t 0\t 0.94733244\t 1.0\t 0.97295403\n", "threat 9\t 0\t 38\t 1.0\t 0.19148937\t 0.3214286\n", @@ -461,7 +388,7 @@ "Macro-average\t prec: 0.7737805, rec: 0.5374858, f1: 0.6343426\n", "Micro-average\t prec: 0.8522302, recall: 0.7953607, f1: 0.822814\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.33s\n", + "time to finish evaluation: 0.08s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", "toxic 1504\t 101\t 0\t 0.9370716\t 1.0\t 0.9675137\n", "threat 14\t 4\t 36\t 0.7777778\t 0.28\t 0.41176474\n", @@ -472,9 +399,9 @@ "tp: 2657 fp: 523 fn: 756 labels: 6\n", "Macro-average\t prec: 0.7071623, rec: 0.5317086, f1: 0.60701126\n", "Micro-average\t prec: 0.8355346, recall: 0.778494, f1: 0.8060064\n", - "Epoch 4/5 - 3.30s - loss: 0.28977355 - acc: 0.88131446 - batches: 103\n", + "Epoch 4/5 - 1.09s - loss: 0.28977352 - acc: 0.88131446 - batches: 103\n", "Quality on validation dataset (10.0%), validation examples = 1462 \n", - "time to finish evaluation: 0.15s\n", + "time to finish evaluation: 0.07s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", "toxic 1385\t 77\t 0\t 0.94733244\t 1.0\t 0.97295403\n", "threat 11\t 0\t 36\t 1.0\t 0.23404256\t 0.37931037\n", @@ -486,7 +413,7 @@ "Macro-average\t prec: 0.77932376, rec: 0.54305106, f1: 0.6400796\n", "Micro-average\t prec: 0.85601914, recall: 0.7953607, f1: 0.82457584\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.19s\n", + "time to finish evaluation: 0.08s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", "toxic 1504\t 101\t 0\t 0.9370716\t 1.0\t 0.9675137\n", "threat 17\t 5\t 33\t 0.77272725\t 0.34\t 0.4722222\n", @@ -497,9 +424,9 @@ "tp: 2664 fp: 517 fn: 749 labels: 6\n", "Macro-average\t prec: 
0.712634, rec: 0.5416196, f1: 0.6154681\n", "Micro-average\t prec: 0.8374725, recall: 0.780545, f1: 0.80800736\n", - "Epoch 5/5 - 3.08s - loss: 0.2876302 - acc: 0.88208383 - batches: 103\n", + "Epoch 5/5 - 1.11s - loss: 0.2876302 - acc: 0.88208383 - batches: 103\n", "Quality on validation dataset (10.0%), validation examples = 1462 \n", - "time to finish evaluation: 0.16s\n", + "time to finish evaluation: 0.07s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", "toxic 1385\t 77\t 0\t 0.94733244\t 1.0\t 0.97295403\n", "threat 11\t 0\t 36\t 1.0\t 0.23404256\t 0.37931037\n", @@ -511,7 +438,7 @@ "Macro-average\t prec: 0.7783019, rec: 0.54327005, f1: 0.6398866\n", "Micro-average\t prec: 0.8566541, recall: 0.79567844, f1: 0.8250412\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.17s\n", + "time to finish evaluation: 0.09s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", "toxic 1504\t 101\t 0\t 0.9370716\t 1.0\t 0.9675137\n", "threat 17\t 5\t 33\t 0.77272725\t 0.34\t 0.4722222\n", @@ -531,43 +458,32 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "F2x-_A5ZuSIX" - }, + "metadata": {}, "source": [ "Let's save our trained multi-label classifier model to be loaded in our prediction pipeline:" ] }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "id": "IWkBcvA_1OCV" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "pipelineModel.stages[-1].write().overwrite().save('/content/tmp_multi_classifierDL_model')" + "pipelineModel.stages[-1].write().overwrite().save('tmp_multi_classifierDL_model')" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "ppQ211AW1OCg" - }, + "metadata": {}, "source": [ - "## load saved pipeline" + "## Load the Saved Pipeline" ] }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "RglS6Ohj1OCi", - "outputId": "81ac84d1-a464-4da3-b193-2b46569c9474" - }, + "execution_count": null, + 
"metadata": {}, "outputs": [ { "name": "stdout", @@ -588,7 +504,7 @@ " .setInputCols([\"document\"])\\\n", " .setOutputCol(\"sentence_embeddings\")\n", "\n", - "multiClassifier = MultiClassifierDLModel.load(\"/content/tmp_multi_classifierDL_model\") \\\n", + "multiClassifier = MultiClassifierDLModel.load(\"tmp_multi_classifierDL_model\") \\\n", " .setInputCols([\"sentence_embeddings\"])\\\n", " .setOutputCol(\"category\")\\\n", " .setThreshold(0.5)\n", @@ -623,8 +539,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" }, "name": "MultiClassifierDL_Train_multi_label_toxic_classifier", "notebookId": 1952370652427552, diff --git a/examples/python/training/english/classification/MultiClassifierDL_train_multi_label_E2E_challenge_classifier.ipynb b/examples/python/training/english/classification/MultiClassifierDL_train_multi_label_E2E_challenge_classifier.ipynb index 6670003f7694fc..c16a317a7f46b4 100644 --- a/examples/python/training/english/classification/MultiClassifierDL_train_multi_label_E2E_challenge_classifier.ipynb +++ b/examples/python/training/english/classification/MultiClassifierDL_train_multi_label_E2E_challenge_classifier.ipynb @@ -1,79 +1,39 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "aaVmDt1TEXdh" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", - "# Spark NLP\n", - "## Multi-label Text Classification\n", - "### E2E Challenge\n", - "#### By using MultiClassifierDL" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "jSgSzQsusNIQ" - }, - "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/MultiClassifierDL_train_multi_label_E2E_challenge_classifier.ipynb)" + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/MultiClassifierDL_train_multi_label_E2E_challenge_classifier.ipynb)\n", + "\n", + "\n", + "# Multi-label Text Classification: E2E Challenge using MultiClassifierDL" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "S2XBx14_1tlw", - "outputId": "936a7951-52ad-4f27-9af0-a7b18f1365bd" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "openjdk version \"1.8.0_265\"\n", - "OpenJDK Runtime Environment (build 1.8.0_265-8u265-b01-0ubuntu2~18.04-b01)\n", - "OpenJDK 64-Bit Server VM (build 25.265-b01, mixed mode)\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Uuwsnj7VsXm3" - }, + "metadata": {}, "source": [ "Let's download our Toxic comments for tarining and testing:" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "Q7me57t41nSe", - "outputId": "e6fbc37e-c127-42fb-b1dc-198cbf653b0b" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -81,7 +41,7 @@ "text": [ " % Total % Received % Xferd 
Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", - "100 1922k 100 1922k 0 0 2000k 0 --:--:-- --:--:-- --:--:-- 1998k\n" + "100 1922k 100 1922k 0 0 1337k 0 0:00:01 0:00:01 --:--:-- 1337k\n" ] } ], @@ -91,16 +51,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 52 - }, - "colab_type": "code", - "id": "Hmvv5Q4jMF5b", - "outputId": "f06e50ba-8992-4856-a3dc-7d5ebd694a4c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -111,17 +63,12 @@ }, { "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, "text/plain": [ - "'2.6.0'" + "'4.3.1'" ] }, - "execution_count": 2, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" } ], @@ -135,40 +82,25 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "pZUlTYncseVF" - }, + "metadata": {}, "source": [ "Let's read our Toxi comments datasets:" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "S9TRr7iAMF5l" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "trainDataset, testDataset = spark.read.parquet(\"/content/e2e_train.snappy.parquet\")\\\n", - " .randomSplit([0.9, 0.1], seed = 12345) " + "trainDataset, testDataset = spark.read.parquet(\"e2e_train.snappy.parquet\") \\\n", + " .randomSplit([0.9, 0.1], seed = 12345)" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 153 - }, - "colab_type": "code", - "id": "nURzgFJ7MF5o", - "outputId": "edab7951-7df8-4d3f-a291-b66ce5405580" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -191,33 +123,22 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "aQa57ITfslQr" - }, + 
"metadata": {}, "source": [ "As you can see, there are lots of new lines in our comments which we can fix them with `DocumentAssembler`" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "id": "iVHvdF481OCG", - "outputId": "7e3e8164-bead-417a-8cff-0d23402a3a5c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "37792\n", - "4269\n" + "37762\n", + "4299\n" ] } ], @@ -228,12 +149,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "0oKvNZaEMF5q" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", @@ -245,16 +162,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "H30A4FgNMF5t", - "outputId": "b32d95a5-50b1-4cc3-e912-66860fde67bb" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -301,12 +210,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "kia7NpRJMF5v" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipelineModel = pipeline.fit(trainDataset)" @@ -315,39 +220,84 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "RjP_MAc5kNDi" - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 216\n", + "-rw-r--r-- 1 root root 456 20. Feb 17:41 ClassifierDLApproach_0375e3a8df00.log\n", + "-rw-r--r-- 1 root root 918 20. Feb 17:38 ClassifierDLApproach_6fdb8a569309.log\n", + "-rw-r--r-- 1 root root 446 20. 
Feb 15:55 ClassifierDLApproach_97ff5c76d735.log\n", + "-rw-r--r-- 1 root root 438 20. Feb 17:38 ClassifierMetrics_09bd6fa2ecf7.log\n", + "-rw-r--r-- 1 root root 317 10. Feb 16:54 ClassifierMetrics_17606bbb7d1f.log\n", + "-rw-r--r-- 1 root root 571 20. Feb 17:45 ClassifierMetrics_176ce729caa6.log\n", + "-rw-r--r-- 1 root root 313 10. Feb 16:54 ClassifierMetrics_1a6c515483ae.log\n", + "-rw-r--r-- 1 root root 441 20. Feb 17:38 ClassifierMetrics_1e0c8ea78e67.log\n", + "-rw-r--r-- 1 root root 323 10. Feb 16:54 ClassifierMetrics_2530315112a8.log\n", + "-rw-r--r-- 1 root root 566 20. Feb 17:45 ClassifierMetrics_26e8744dc78c.log\n", + "-rw-r--r-- 1 root root 565 20. Feb 17:45 ClassifierMetrics_284f041511fb.log\n", + "-rw-r--r-- 1 root root 445 20. Feb 17:38 ClassifierMetrics_2b7b458fc84d.log\n", + "-rw-r--r-- 1 root root 551 20. Feb 17:45 ClassifierMetrics_2fde2811a93c.log\n", + "-rw-r--r-- 1 root root 133 20. Feb 17:52 ClassifierMetrics_387f03f0b7a0.log\n", + "-rw-r--r-- 1 root root 314 10. Feb 16:54 ClassifierMetrics_3ccf43933a23.log\n", + "-rw-r--r-- 1 root root 559 20. Feb 17:45 ClassifierMetrics_49fdfe64394f.log\n", + "-rw-r--r-- 1 root root 449 20. Feb 17:38 ClassifierMetrics_4a2e4a7dac7c.log\n", + "-rw-r--r-- 1 root root 325 10. Feb 16:54 ClassifierMetrics_55c7e364bf2b.log\n", + "-rw-r--r-- 1 root root 128 20. Feb 17:52 ClassifierMetrics_66b22a01b7d3.log\n", + "-rw-r--r-- 1 root root 555 20. Feb 17:45 ClassifierMetrics_71effbac2282.log\n", + "-rw-r--r-- 1 root root 426 20. Feb 17:38 ClassifierMetrics_73fa92fe4be8.log\n", + "-rw-r--r-- 1 root root 433 20. Feb 17:38 ClassifierMetrics_7764aa9b23e3.log\n", + "-rw-r--r-- 1 root root 127 20. Feb 17:52 ClassifierMetrics_7dc198897be3.log\n", + "-rw-r--r-- 1 root root 570 20. Feb 17:45 ClassifierMetrics_80808e6b12d1.log\n", + "-rw-r--r-- 1 root root 445 20. Feb 17:38 ClassifierMetrics_890dcfe0db80.log\n", + "-rw-r--r-- 1 root root 444 20. Feb 17:38 ClassifierMetrics_8ecc3f83e12d.log\n", + "-rw-r--r-- 1 root root 325 10. 
Feb 16:54 ClassifierMetrics_9290b613e8d7.log\n", + "-rw-r--r-- 1 root root 567 20. Feb 17:45 ClassifierMetrics_9ba6210e2c94.log\n", + "-rw-r--r-- 1 root root 129 20. Feb 17:52 ClassifierMetrics_a579e188cf6b.log\n", + "-rw-r--r-- 1 root root 317 10. Feb 16:54 ClassifierMetrics_aa0e2812a3b9.log\n", + "-rw-r--r-- 1 root root 318 10. Feb 16:54 ClassifierMetrics_ad4cb4a650fa.log\n", + "-rw-r--r-- 1 root root 129 20. Feb 17:52 ClassifierMetrics_b901376087b3.log\n", + "-rw-r--r-- 1 root root 564 20. Feb 17:45 ClassifierMetrics_d302c6e17f10.log\n", + "-rw-r--r-- 1 root root 452 20. Feb 17:38 ClassifierMetrics_e0da6952b2c6.log\n", + "-rw-r--r-- 1 root root 567 20. Feb 17:45 ClassifierMetrics_e29d5ee5fe87.log\n", + "-rw-r--r-- 1 root root 312 10. Feb 16:54 ClassifierMetrics_efc7f6345e79.log\n", + "-rw-r--r-- 1 root root 319 10. Feb 16:54 ClassifierMetrics_f571876aaa09.log\n", + "-rw-r--r-- 1 root root 436 20. Feb 17:38 ClassifierMetrics_fdc5fa307baf.log\n", + "-rw-r--r-- 1 root root 922 20. Feb 17:45 MultiClassifierDLApproach_0420b23f4851.log\n", + "-rw-r--r-- 1 root root 792 20. Feb 17:52 MultiClassifierDLApproach_73f999799c2b.log\n", + "-rw-r--r-- 1 root root 320 26. Okt 09:23 NerDL_0f47f69f09e6.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_10e337c8a3ef.log\n", + "-rw-r--r-- 1 root root 320 12. Jan 17:31 NerDL_18e7b1673dab.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_27f18f749174.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_3ae0321ce66a.log\n", + "-rw-r--r-- 1 root root 319 26. Okt 09:13 NerDL_568d747656b8.log\n", + "-rw-r--r-- 1 root root 320 26. Okt 09:03 NerDL_5970e276422f.log\n", + "-rw-r--r-- 1 root root 320 16. Jan 11:10 NerDL_759a68c3769d.log\n", + "-rw-r--r-- 1 root root 320 3. Nov 19:22 NerDL_891f9b941985.log\n", + "-rw-r--r-- 1 root root 320 2. Feb 2022 NerDL_8e8184f259cb.log\n", + "-rw-r--r-- 1 root root 320 27. Okt 13:02 NerDL_add5b34b2ecb.log\n", + "-rw-r--r-- 1 root root 320 21. 
Okt 19:06 NerDL_bc57a96c68c3.log\n", + "-rw-r--r-- 1 root root 320 12. Jan 16:47 NerDL_ff0a43f20378.log\n", + "-rw-r--r-- 1 root root 897 10. Feb 16:54 SentimentDLApproach_98dfd2c1fdee.log\n" + ] + } + ], "source": [ "!ls -l ~/annotator_logs/" ] }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "colab_type": "code", - "id": "kHa_D2KFkL5w", - "outputId": "8d0b7bca-1b44-4800-f676-5da989c293cb" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Training started - epochs: 5 - learning_rate: 0.001 - batch_size: 128 - training_examples: 34013 - classes: 79\n", - "Epoch 0/5 - 18.96s - loss: 0.22942108 - acc: 0.9338577 - val_loss: 0.17501871 - val_acc: 0.9417629 - val_f1: 0.3146024 - val_tpr: 0.19535509 - batches: 266\n", - "Epoch 1/5 - 10.60s - loss: 0.14757492 - acc: 0.953353 - val_loss: 0.12445798 - val_acc: 0.9562459 - val_f1: 0.57075405 - val_tpr: 0.4252112 - batches: 266\n", - "Epoch 2/5 - 10.46s - loss: 0.112007715 - acc: 0.96444803 - val_loss: 0.1024009 - val_acc: 0.9635221 - val_f1: 0.667721 - val_tpr: 0.5356968 - batches: 266\n", - "Epoch 3/5 - 10.66s - loss: 0.09598791 - acc: 0.96988803 - val_loss: 0.09133494 - val_acc: 0.9674665 - val_f1: 0.71459305 - val_tpr: 0.5951355 - batches: 266\n", - "Epoch 4/5 - 10.39s - loss: 0.08701118 - acc: 0.9730473 - val_loss: 0.08419453 - val_acc: 0.96987855 - val_f1: 0.74224013 - val_tpr: 0.63378865 - batches: 266\n" + "cat: /home/root/annotator_logs/MultiClassifierDLApproach_b80de1f04776.log: No such file or directory\n" ] } ], @@ -357,49 +307,31 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "F2x-_A5ZuSIX" - }, + "metadata": {}, "source": [ "Let's save our trained multi-label classifier model to be loaded in our prediction pipeline:" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": {}, - "colab_type": 
"code", - "id": "IWkBcvA_1OCV" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "pipelineModel.stages[-1].write().overwrite().save('/content/tmp_multi_classifierDL_model')" + "pipelineModel.stages[-1].write().overwrite().save('tmp_multi_classifierDL_model')" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ppQ211AW1OCg" - }, + "metadata": {}, "source": [ "## load saved pipeline" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "RglS6Ohj1OCi", - "outputId": "e2a46992-7b86-4516-8d19-8a5a26ce73d2" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -420,7 +352,7 @@ " .setInputCols([\"document\"])\\\n", " .setOutputCol(\"sentence_embeddings\")\n", "\n", - "multiClassifier = MultiClassifierDLModel.load(\"/content/tmp_multi_classifierDL_model\") \\\n", + "multiClassifier = MultiClassifierDLModel.load(\"tmp_multi_classifierDL_model\") \\\n", " .setInputCols([\"sentence_embeddings\"])\\\n", " .setOutputCol(\"category\")\\\n", " .setThreshold(0.5)\n", @@ -435,32 +367,21 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ATJtZgiBwU-_" - }, + "metadata": {}, "source": [ "Let's now use our testing datasets to evaluate our model:" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 71 - }, - "colab_type": "code", - "id": "NDA_F1SD1OCm", - "outputId": "35209a89-b154-405a-fecc-49023218790c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['name[Bibimbap House]', 'name[Wildwood]', 'name[Cotto]', 'name[Clowns]', 'near[Burger King]', 'name[The Dumpling Tree]', 'name[The Vaults]', 'near[Crowne Plaza Hotel]', 'name[The Golden Palace]', 'name[The Rice Boat]', 'customer rating[high]', 
'near[Avalon]', 'name[Alimentum]', 'near[The Bakers]', 'name[The Waterman]', 'near[Ranch]', 'name[The Olive Grove]', 'name[The Eagle]', 'name[The Wrestlers]', 'eatType[restaurant]', 'near[All Bar One]', 'customer rating[low]', 'near[Café Sicilia]', 'near[Yippee Noodle Bar]', 'food[Indian]', 'eatType[pub]', 'name[Green Man]', 'name[Strada]', 'near[Café Adriatic]', 'eatType[coffee shop]', 'name[Loch Fyne]', 'customer rating[5 out of 5]', 'near[Express by Holiday Inn]', 'food[French]', 'name[The Mill]', 'food[Japanese]', 'name[Travellers Rest Beefeater]', 'name[The Plough]', 'name[Cocum]', 'near[The Six Bells]', 'name[The Phoenix]', 'priceRange[cheap]', 'name[Midsummer House]', 'near[Rainbow Vegetarian Café]', 'near[The Rice Boat]', 'customer rating[1 out of 5]', 'customer rating[3 out of 5]', 'name[The Cricketers]', 'area[riverside]', 'priceRange[£20-25]', 'name[Blue Spice]', 'priceRange[moderate]', 'priceRange[less than £20]', 'priceRange[high]', 'name[The Golden Curry]', 'name[Giraffe]', 'customer rating[average]', 'name[Aromi]', 'name[The Twenty Two]', 'food[Fast food]', 'name[Browns Cambridge]', 'near[Café Rouge]', 'familyFriendly[no]', 'area[city centre]', 'food[Chinese]', 'name[Taste of Cambridge]', 'food[Italian]', 'near[Raja Indian Cuisine]', 'name[Zizzi]', 'priceRange[more than £30]', 'name[The Punter]', 'food[English]', 'near[Clare Hall]', 'near[The Portland Arms]', 'name[The Cambridge Blue]', 'near[The Sorrento]', 'near[Café Brazil]', 'familyFriendly[yes]', 'name[Fitzbillies]']\n", + "['name[Bibimbap House]', 'name[Wildwood]', 'name[Cotto]', 'name[Clowns]', 'near[Burger King]', 'name[The Dumpling Tree]', 'name[The Vaults]', 'near[Crowne Plaza Hotel]', 'name[The Golden Palace]', 'name[The Rice Boat]', 'customer rating[high]', 'near[Avalon]', 'name[Alimentum]', 'near[The Bakers]', 'name[The Waterman]', 'near[Ranch]', 'name[The Olive Grove]', 'name[The Eagle]', 'name[The Wrestlers]', 'eatType[restaurant]', 'near[All Bar One]', 'customer rating[low]', 
'near[Café Sicilia]', 'near[Yippee Noodle Bar]', 'food[Indian]', 'eatType[pub]', 'name[Green Man]', 'name[Strada]', 'near[Café Adriatic]', 'eatType[coffee shop]', 'name[Loch Fyne]', 'customer rating[5 out of 5]', 'near[Express by Holiday Inn]', 'food[French]', 'name[The Mill]', 'food[Japanese]', 'name[Travellers Rest Beefeater]', 'name[The Plough]', 'name[Cocum]', 'near[The Six Bells]', 'name[The Phoenix]', 'priceRange[cheap]', 'name[Midsummer House]', 'near[Rainbow Vegetarian Café]', 'near[The Rice Boat]', 'customer rating[1 out of 5]', 'customer rating[3 out of 5]', 'name[The Cricketers]', 'area[riverside]', 'priceRange[£20-25]', 'name[Blue Spice]', 'priceRange[moderate]', 'priceRange[less than £20]', 'priceRange[high]', 'name[The Golden Curry]', 'name[Giraffe]', 'customer rating[average]', 'name[Aromi]', 'name[The Twenty Two]', 'food[Fast food]', 'name[Browns Cambridge]', 'near[Café Rouge]', 'familyFriendly[no]', 'area[city centre]', 'food[Chinese]', 'name[Taste of Cambridge]', 'food[Italian]', 'name[Zizzi]', 'near[Raja Indian Cuisine]', 'priceRange[more than £30]', 'name[The Punter]', 'food[English]', 'near[Clare Hall]', 'near[The Portland Arms]', 'name[The Cambridge Blue]', 'near[The Sorrento]', 'near[Café Brazil]', 'familyFriendly[yes]', 'name[Fitzbillies]']\n", "79\n" ] } @@ -473,12 +394,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "JJYBe_hq1OCo" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "preds = pipeline.fit(testDataset).transform(testDataset)\n" @@ -486,16 +403,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 153 - }, - "colab_type": "code", - "id": "QSdIxWPV1OCq", - "outputId": "ea397faf-08d1-4413-d470-3e0b17b27e74" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -504,8 +413,8 @@ 
"+--------------------+--------------------+--------------------+\n", "| labels| ref| result|\n", "+--------------------+--------------------+--------------------+\n", - "|[name[Strada], ea...|'Strada' is a pub...|[name[Alimentum],...|\n", - "|[name[The Eagle],...|'The Eagle' is lo...|[name[The Eagle],...|\n", + "|[name[Alimentum],...|1 out of 5 stars ...|[name[Alimentum],...|\n", + "|[name[The Punter]...|1 star budget, fa...|[near[Café Sicili...|\n", "+--------------------+--------------------+--------------------+\n", "only showing top 2 rows\n", "\n" @@ -518,16 +427,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "colab_type": "code", - "id": "hHh1vMkN1OCs", - "outputId": "49eaebda-a6de-4564-8372-954b630689cb" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -536,100 +437,100 @@ "Classification report: \n", " precision recall f1-score support\n", "\n", - " 0 0.84 0.86 0.85 795\n", - " 1 0.89 0.82 0.86 1724\n", - " 2 0.72 0.07 0.12 415\n", - " 3 0.68 0.13 0.21 377\n", - " 4 0.68 0.21 0.32 504\n", - " 5 0.72 0.40 0.51 557\n", - " 6 0.65 0.09 0.16 437\n", - " 7 0.74 0.28 0.41 541\n", - " 8 0.99 0.96 0.98 1000\n", - " 9 0.94 0.91 0.92 701\n", - " 10 0.86 0.52 0.65 329\n", - " 11 0.84 0.52 0.64 908\n", - " 12 0.81 0.81 0.81 1784\n", - " 13 0.95 0.91 0.93 294\n", - " 14 0.92 0.56 0.70 410\n", - " 15 0.95 0.77 0.85 566\n", - " 16 0.89 0.76 0.82 581\n", - " 17 0.95 0.89 0.92 471\n", - " 18 0.94 0.81 0.87 589\n", - " 19 0.97 0.84 0.90 650\n", - " 20 0.91 0.78 0.84 178\n", - " 21 1.00 0.22 0.36 104\n", - " 22 0.99 0.91 0.95 76\n", - " 23 0.99 0.85 0.91 110\n", - " 24 0.95 0.88 0.92 179\n", - " 25 1.00 0.73 0.84 73\n", - " 26 0.82 0.39 0.53 102\n", - " 27 0.94 0.73 0.82 123\n", - " 28 0.72 0.32 0.44 174\n", - " 29 0.96 0.83 0.89 86\n", - " 30 0.95 0.86 0.91 95\n", - " 31 0.94 0.79 0.86 144\n", - " 32 0.94 0.84 0.89 80\n", - " 33 0.89 
0.87 0.88 152\n", - " 34 1.00 0.02 0.04 51\n", - " 35 0.95 0.83 0.89 94\n", - " 36 0.97 0.85 0.91 74\n", - " 37 1.00 0.71 0.83 89\n", - " 38 0.99 0.97 0.98 185\n", - " 39 0.98 0.99 0.99 198\n", - " 40 1.00 0.56 0.72 88\n", - " 41 0.93 0.77 0.84 87\n", - " 42 0.98 0.73 0.84 130\n", - " 43 0.97 0.87 0.92 158\n", - " 44 0.78 0.78 0.78 60\n", - " 45 0.97 0.90 0.93 125\n", - " 46 0.86 0.90 0.88 293\n", - " 47 0.89 0.67 0.77 122\n", - " 48 0.96 0.92 0.94 109\n", - " 49 0.93 0.48 0.63 290\n", - " 50 0.98 0.98 0.98 156\n", - " 51 1.00 0.53 0.69 36\n", - " 52 0.88 0.36 0.51 127\n", - " 53 0.78 0.12 0.20 121\n", - " 54 0.87 0.60 0.71 140\n", - " 55 0.92 0.72 0.81 32\n", - " 56 0.98 0.98 0.98 126\n", - " 57 0.94 0.60 0.73 48\n", - " 58 0.97 0.93 0.95 71\n", - " 59 0.97 0.98 0.98 190\n", - " 60 0.96 0.65 0.77 108\n", - " 61 0.99 0.88 0.93 92\n", - " 62 0.98 0.93 0.96 115\n", - " 63 0.99 0.97 0.98 216\n", - " 64 0.99 0.98 0.99 131\n", - " 65 1.00 0.98 0.99 53\n", - " 66 0.98 0.73 0.84 119\n", - " 67 0.95 0.77 0.85 101\n", - " 68 0.99 0.96 0.97 142\n", - " 69 0.88 0.60 0.71 109\n", - " 70 0.90 0.56 0.69 48\n", - " 71 0.91 0.68 0.78 87\n", - " 72 0.96 0.93 0.95 135\n", - " 73 0.75 0.08 0.15 439\n", - " 74 0.67 0.11 0.19 485\n", - " 75 0.62 0.23 0.34 585\n", - " 76 0.85 0.22 0.36 552\n", - " 77 0.57 0.15 0.24 468\n", - " 78 0.68 0.08 0.14 412\n", + " 0 0.88 0.84 0.86 790\n", + " 1 0.86 0.87 0.86 1774\n", + " 2 0.68 0.06 0.11 431\n", + " 3 0.70 0.12 0.20 422\n", + " 4 0.72 0.25 0.37 525\n", + " 5 0.78 0.37 0.50 592\n", + " 6 0.68 0.18 0.29 421\n", + " 7 0.72 0.21 0.32 512\n", + " 8 0.99 0.95 0.97 1043\n", + " 9 0.97 0.88 0.92 660\n", + " 10 0.84 0.52 0.64 306\n", + " 11 0.81 0.62 0.70 932\n", + " 12 0.83 0.80 0.81 1777\n", + " 13 0.95 0.90 0.92 292\n", + " 14 0.94 0.50 0.66 411\n", + " 15 0.93 0.81 0.86 599\n", + " 16 0.90 0.73 0.80 564\n", + " 17 0.98 0.89 0.93 487\n", + " 18 0.93 0.77 0.84 588\n", + " 19 0.96 0.83 0.89 635\n", + " 20 0.90 0.78 0.84 175\n", + " 21 0.83 0.17 0.28 
88\n", + " 22 1.00 0.84 0.91 68\n", + " 23 0.98 0.92 0.95 143\n", + " 24 0.97 0.88 0.92 193\n", + " 25 1.00 0.79 0.88 84\n", + " 26 0.85 0.49 0.62 103\n", + " 27 0.95 0.85 0.90 124\n", + " 28 0.81 0.34 0.47 191\n", + " 29 0.96 0.79 0.87 68\n", + " 30 0.97 0.83 0.89 102\n", + " 31 0.96 0.84 0.89 152\n", + " 32 0.94 0.68 0.79 124\n", + " 33 0.94 0.86 0.90 174\n", + " 34 1.00 0.07 0.12 45\n", + " 35 0.96 0.89 0.93 76\n", + " 36 1.00 0.77 0.87 73\n", + " 37 0.97 0.73 0.83 79\n", + " 38 0.97 0.97 0.97 202\n", + " 39 0.99 0.98 0.98 211\n", + " 40 0.98 0.54 0.70 96\n", + " 41 0.94 0.80 0.87 82\n", + " 42 0.97 0.68 0.80 109\n", + " 43 0.96 0.94 0.95 138\n", + " 44 0.93 0.54 0.68 70\n", + " 45 0.98 0.80 0.88 107\n", + " 46 0.88 0.83 0.85 303\n", + " 47 0.94 0.57 0.71 126\n", + " 48 0.94 0.86 0.90 119\n", + " 49 0.96 0.47 0.63 279\n", + " 50 0.99 0.90 0.94 171\n", + " 51 1.00 0.49 0.65 35\n", + " 52 0.82 0.42 0.56 112\n", + " 53 0.47 0.12 0.19 77\n", + " 54 0.92 0.56 0.70 158\n", + " 55 1.00 0.83 0.91 35\n", + " 56 1.00 0.93 0.96 146\n", + " 57 0.95 0.71 0.81 49\n", + " 58 0.98 0.89 0.94 73\n", + " 59 0.99 0.98 0.98 212\n", + " 60 0.95 0.57 0.71 102\n", + " 61 0.99 0.88 0.93 93\n", + " 62 0.99 0.93 0.96 120\n", + " 63 1.00 0.98 0.99 209\n", + " 64 1.00 0.99 1.00 148\n", + " 65 1.00 0.98 0.99 62\n", + " 66 0.95 0.73 0.82 99\n", + " 67 0.89 0.78 0.83 95\n", + " 68 0.98 0.97 0.97 157\n", + " 69 0.79 0.76 0.77 111\n", + " 70 0.97 0.70 0.82 44\n", + " 71 0.91 0.63 0.75 93\n", + " 72 0.99 0.94 0.97 133\n", + " 73 0.73 0.23 0.35 459\n", + " 74 0.77 0.07 0.13 559\n", + " 75 0.64 0.21 0.32 543\n", + " 76 0.80 0.27 0.40 580\n", + " 77 0.70 0.11 0.19 426\n", + " 78 0.72 0.06 0.12 401\n", "\n", - " micro avg 0.89 0.63 0.74 22906\n", - " macro avg 0.90 0.66 0.73 22906\n", - "weighted avg 0.86 0.63 0.69 22906\n", - " samples avg 0.89 0.63 0.72 22906\n", + " micro avg 0.90 0.64 0.74 23167\n", + " macro avg 0.90 0.66 0.73 23167\n", + "weighted avg 0.87 0.64 0.70 23167\n", + " samples avg 
0.90 0.63 0.72 23167\n", "\n", - "F1 micro averaging: 0.7408601325248804\n", - "ROC: 0.8133453460541078\n" + "F1 micro averaging: 0.7445738686610911\n", + "ROC: 0.815347330332481\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.\n", + "/home/root/.conda/envs/sparknlp/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] } @@ -654,16 +555,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 289 - }, - "colab_type": "code", - "id": "LR7PpSll1OCy", - "outputId": "0738f91d-b878-4fa8-ee2d-d7a9637e18cd" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -672,16 +565,16 @@ "+--------------------+\n", "| metadata|\n", "+--------------------+\n", - "|[[name[Alimentum]...|\n", - "|[[name[Alimentum]...|\n", - "|[[name[Alimentum]...|\n", - "|[[name[Alimentum]...|\n", - "|[[name[Alimentum]...|\n", - "|[[name[Alimentum]...|\n", - "|[[name[Alimentum]...|\n", - "|[[name[Alimentum]...|\n", - "|[[name[Alimentum]...|\n", - "|[[name[Alimentum]...|\n", + "|[{name[Alimentum]...|\n", + "|[{name[Alimentum]...|\n", + "|[{name[Alimentum]...|\n", + "|[{name[Alimentum]...|\n", + "|[{name[Alimentum]...|\n", + "|[{name[Alimentum]...|\n", + "|[{name[Alimentum]...|\n", + "|[{name[Alimentum]...|\n", + "|[{name[Alimentum]...|\n", + "|[{name[Alimentum]...|\n", "+--------------------+\n", "only showing top 10 rows\n", "\n" @@ -694,16 +587,8 @@ }, { "cell_type": 
"code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "colab_type": "code", - "id": "73Gm6Rno1OC0", - "outputId": "9072df2b-73f6-4aae-a221-b95e669d079f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -721,17 +606,6 @@ "source": [ "preds.select(\"category.metadata\").printSchema()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "BJ0aOVjtx8sL" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -756,8 +630,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" }, "name": "MultiClassifierDL_Train_multi_label_toxic_classifier", "notebookId": 1952370652427552, diff --git a/examples/python/training/english/classification/MultiClassifierDL_train_multi_label_toxic_classifier.ipynb b/examples/python/training/english/classification/MultiClassifierDL_train_multi_label_toxic_classifier.ipynb index 00b0e8599ee8e2..02ed05fc4e4438 100644 --- a/examples/python/training/english/classification/MultiClassifierDL_train_multi_label_toxic_classifier.ipynb +++ b/examples/python/training/english/classification/MultiClassifierDL_train_multi_label_toxic_classifier.ipynb @@ -1,66 +1,31 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "aaVmDt1TEXdh" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", - "# Spark NLP\n", - "## Multi-label Text Classification\n", - "### Toxic Comments\n", - "#### By using MultiClassifierDL" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "jSgSzQsusNIQ" - }, - "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/MultiClassifierDL_train_multi_label_toxic_classifier.ipynb)" + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/MultiClassifierDL_train_multi_label_toxic_classifier.ipynb)\n", + "\n", + "\n", + "# Multi-label Text Classification of Toxic Comments using MultiClassifierDL" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "colab_type": "code", - "id": "S2XBx14_1tlw", - "outputId": "7ec141b0-d342-4ce1-f121-cc1e834ed7d8" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "openjdk version \"1.8.0_265\"\n", - "OpenJDK Runtime Environment (build 1.8.0_265-8u265-b01-0ubuntu2~18.04-b01)\n", - "OpenJDK 64-Bit Server VM (build 25.265-b01, mixed mode)\n", - "\u001B[K |████████████████████████████████| 218.4MB 67kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 57.7MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Uuwsnj7VsXm3" - }, + "metadata": {}, "source": [ "Let's download our Toxic comments for tarining and testing:" ] @@ -68,15 +33,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "Q7me57t41nSe", - "outputId": "f9307a55-ed9e-466b-8465-243934345d7a" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -84,7 +41,7 @@ "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", - "100 2702k 100 2702k 0 0 3117k 0 --:--:-- --:--:-- --:--:-- 3113k\n" + "100 2702k 100 2702k 0 0 1720k 0 0:00:01 0:00:01 --:--:-- 1720k\n" ] } ], @@ -95,15 +52,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "Dc8yON7W1qAq", - "outputId": "933c95d8-4a9a-40f3-bb98-84a23bedda7f" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -111,7 +60,7 @@ "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", - "100 289k 100 289k 0 0 185k 0 0:00:01 0:00:01 --:--:-- 185k\n" + "100 289k 100 289k 0 0 254k 0 0:00:01 0:00:01 --:--:-- 254k\n" ] } ], @@ -122,15 +71,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 52 - }, - "colab_type": "code", - "id": "Hmvv5Q4jMF5b", - "outputId": "4ec222ab-e332-4617-940e-9cfbfc27828b" - }, + "metadata": {}, "outputs": [ { 
"name": "stdout", @@ -141,17 +82,12 @@ }, { "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, "text/plain": [ - "'2.6.0'" + "'4.3.1'" ] }, - "execution_count": 4, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" } ], @@ -160,15 +96,12 @@ "\n", "spark=sparknlp.start()\n", "print(\"Spark NLP version\")\n", - "sparknlp.version()\n" + "sparknlp.version()" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "pZUlTYncseVF" - }, + "metadata": {}, "source": [ "Let's read our Toxi comments datasets:" ] @@ -176,29 +109,17 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "S9TRr7iAMF5l" - }, + "metadata": {}, "outputs": [], "source": [ - "trainDataset = spark.read.parquet(\"/content/toxic_train.snappy.parquet\").repartition(120)\n", - "testDataset = spark.read.parquet(\"/content/toxic_test.snappy.parquet\").repartition(10)" + "trainDataset = spark.read.parquet(\"toxic_train.snappy.parquet\").repartition(120)\n", + "testDataset = spark.read.parquet(\"toxic_test.snappy.parquet\").repartition(10)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 170 - }, - "colab_type": "code", - "id": "nURzgFJ7MF5o", - "outputId": "4cf99e20-9d9c-48a3-fe01-48ffcefe4fc6" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -208,8 +129,7 @@ "| id| text| labels|\n", "+----------------+--------------------+-------+\n", "|e63f1cc4b0b9959f|EAT SHIT HORSE FA...|[toxic]|\n", - "|ed58abb40640f983|PN News\n", - "You mean ...|[toxic]|\n", + "|ed58abb40640f983|PN News\\nYou mean...|[toxic]|\n", "+----------------+--------------------+-------+\n", "only showing top 2 rows\n", "\n" @@ -222,10 +142,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "aQa57ITfslQr" - }, + "metadata": {}, "source": 
[ "As you can see, there are lots of new lines in our comments which we can fix them with `DocumentAssembler`" ] @@ -233,15 +150,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "id": "iVHvdF481OCG", - "outputId": "60f3143c-a992-4e07-9c42-672ff1a8b5ed" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -260,11 +169,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "0oKvNZaEMF5q" - }, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", @@ -277,15 +182,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "H30A4FgNMF5t", - "outputId": "de876ba8-cc9f-494b-cc25-f861c2f8716c" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -334,11 +231,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "kia7NpRJMF5v" - }, + "metadata": {}, "outputs": [], "source": [ "pipelineModel = pipeline.fit(trainDataset)" @@ -347,22 +240,73 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "id": "RjP_MAc5kNDi", - "outputId": "43f80e82-f82a-4299-e753-a50dd1d8d2cf" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "total 4\n", - "-rw-r--r-- 1 root root 885 Sep 2 16:56 MultiClassifierDLApproach_d670b2c2d0df.log\n" + "total 240\n", + "-rw-r--r-- 1 root root 456 20. Feb 17:41 ClassifierDLApproach_0375e3a8df00.log\n", + "-rw-r--r-- 1 root root 918 20. Feb 17:38 ClassifierDLApproach_6fdb8a569309.log\n", + "-rw-r--r-- 1 root root 446 20. Feb 15:55 ClassifierDLApproach_97ff5c76d735.log\n", + "-rw-r--r-- 1 root root 438 20. 
Feb 17:38 ClassifierMetrics_09bd6fa2ecf7.log\n", + "-rw-r--r-- 1 root root 317 10. Feb 16:54 ClassifierMetrics_17606bbb7d1f.log\n", + "-rw-r--r-- 1 root root 571 20. Feb 17:45 ClassifierMetrics_176ce729caa6.log\n", + "-rw-r--r-- 1 root root 313 10. Feb 16:54 ClassifierMetrics_1a6c515483ae.log\n", + "-rw-r--r-- 1 root root 441 20. Feb 17:38 ClassifierMetrics_1e0c8ea78e67.log\n", + "-rw-r--r-- 1 root root 323 10. Feb 16:54 ClassifierMetrics_2530315112a8.log\n", + "-rw-r--r-- 1 root root 566 20. Feb 17:45 ClassifierMetrics_26e8744dc78c.log\n", + "-rw-r--r-- 1 root root 565 20. Feb 17:45 ClassifierMetrics_284f041511fb.log\n", + "-rw-r--r-- 1 root root 445 20. Feb 17:38 ClassifierMetrics_2b7b458fc84d.log\n", + "-rw-r--r-- 1 root root 551 20. Feb 17:45 ClassifierMetrics_2fde2811a93c.log\n", + "-rw-r--r-- 1 root root 133 20. Feb 17:52 ClassifierMetrics_387f03f0b7a0.log\n", + "-rw-r--r-- 1 root root 314 10. Feb 16:54 ClassifierMetrics_3ccf43933a23.log\n", + "-rw-r--r-- 1 root root 132 20. Feb 17:59 ClassifierMetrics_41db1fc54c4f.log\n", + "-rw-r--r-- 1 root root 559 20. Feb 17:45 ClassifierMetrics_49fdfe64394f.log\n", + "-rw-r--r-- 1 root root 449 20. Feb 17:38 ClassifierMetrics_4a2e4a7dac7c.log\n", + "-rw-r--r-- 1 root root 126 20. Feb 17:59 ClassifierMetrics_4a623cb68ecc.log\n", + "-rw-r--r-- 1 root root 325 10. Feb 16:54 ClassifierMetrics_55c7e364bf2b.log\n", + "-rw-r--r-- 1 root root 128 20. Feb 17:52 ClassifierMetrics_66b22a01b7d3.log\n", + "-rw-r--r-- 1 root root 129 20. Feb 17:59 ClassifierMetrics_6f4f96da828e.log\n", + "-rw-r--r-- 1 root root 555 20. Feb 17:45 ClassifierMetrics_71effbac2282.log\n", + "-rw-r--r-- 1 root root 129 20. Feb 17:59 ClassifierMetrics_73bcd38f71f7.log\n", + "-rw-r--r-- 1 root root 426 20. Feb 17:38 ClassifierMetrics_73fa92fe4be8.log\n", + "-rw-r--r-- 1 root root 433 20. Feb 17:38 ClassifierMetrics_7764aa9b23e3.log\n", + "-rw-r--r-- 1 root root 127 20. Feb 17:52 ClassifierMetrics_7dc198897be3.log\n", + "-rw-r--r-- 1 root root 570 20. 
Feb 17:45 ClassifierMetrics_80808e6b12d1.log\n", + "-rw-r--r-- 1 root root 445 20. Feb 17:38 ClassifierMetrics_890dcfe0db80.log\n", + "-rw-r--r-- 1 root root 444 20. Feb 17:38 ClassifierMetrics_8ecc3f83e12d.log\n", + "-rw-r--r-- 1 root root 325 10. Feb 16:54 ClassifierMetrics_9290b613e8d7.log\n", + "-rw-r--r-- 1 root root 567 20. Feb 17:45 ClassifierMetrics_9ba6210e2c94.log\n", + "-rw-r--r-- 1 root root 129 20. Feb 17:52 ClassifierMetrics_a579e188cf6b.log\n", + "-rw-r--r-- 1 root root 317 10. Feb 16:54 ClassifierMetrics_aa0e2812a3b9.log\n", + "-rw-r--r-- 1 root root 318 10. Feb 16:54 ClassifierMetrics_ad4cb4a650fa.log\n", + "-rw-r--r-- 1 root root 129 20. Feb 17:52 ClassifierMetrics_b901376087b3.log\n", + "-rw-r--r-- 1 root root 564 20. Feb 17:45 ClassifierMetrics_d302c6e17f10.log\n", + "-rw-r--r-- 1 root root 452 20. Feb 17:38 ClassifierMetrics_e0da6952b2c6.log\n", + "-rw-r--r-- 1 root root 567 20. Feb 17:45 ClassifierMetrics_e29d5ee5fe87.log\n", + "-rw-r--r-- 1 root root 312 10. Feb 16:54 ClassifierMetrics_efc7f6345e79.log\n", + "-rw-r--r-- 1 root root 319 10. Feb 16:54 ClassifierMetrics_f571876aaa09.log\n", + "-rw-r--r-- 1 root root 131 20. Feb 17:59 ClassifierMetrics_fbe1c172154f.log\n", + "-rw-r--r-- 1 root root 436 20. Feb 17:38 ClassifierMetrics_fdc5fa307baf.log\n", + "-rw-r--r-- 1 root root 922 20. Feb 17:45 MultiClassifierDLApproach_0420b23f4851.log\n", + "-rw-r--r-- 1 root root 792 20. Feb 17:52 MultiClassifierDLApproach_73f999799c2b.log\n", + "-rw-r--r-- 1 root root 792 20. Feb 17:59 MultiClassifierDLApproach_e6ae1c4549a9.log\n", + "-rw-r--r-- 1 root root 320 26. Okt 09:23 NerDL_0f47f69f09e6.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_10e337c8a3ef.log\n", + "-rw-r--r-- 1 root root 320 12. Jan 17:31 NerDL_18e7b1673dab.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_27f18f749174.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_3ae0321ce66a.log\n", + "-rw-r--r-- 1 root root 319 26. 
Okt 09:13 NerDL_568d747656b8.log\n", + "-rw-r--r-- 1 root root 320 26. Okt 09:03 NerDL_5970e276422f.log\n", + "-rw-r--r-- 1 root root 320 16. Jan 11:10 NerDL_759a68c3769d.log\n", + "-rw-r--r-- 1 root root 320 3. Nov 19:22 NerDL_891f9b941985.log\n", + "-rw-r--r-- 1 root root 320 2. Feb 2022 NerDL_8e8184f259cb.log\n", + "-rw-r--r-- 1 root root 320 27. Okt 13:02 NerDL_add5b34b2ecb.log\n", + "-rw-r--r-- 1 root root 320 21. Okt 19:06 NerDL_bc57a96c68c3.log\n", + "-rw-r--r-- 1 root root 320 12. Jan 16:47 NerDL_ff0a43f20378.log\n", + "-rw-r--r-- 1 root root 897 10. Feb 16:54 SentimentDLApproach_98dfd2c1fdee.log\n" ] } ], @@ -373,26 +317,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 139 - }, - "colab_type": "code", - "id": "kHa_D2KFkL5w", - "outputId": "54f088f1-2e31-4ad8-feb5-260485a326c3" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Training started - epochs: 5 - learning_rate: 0.001 - batch_size: 128 - training_examples: 13158 - classes: 6\n", - "Epoch 0/5 - 15.19s - loss: 0.38046357 - acc: 0.848714 - val_loss: 0.30129096 - val_acc: 0.871466 - val_f1: 0.81246215 - val_tpr: 0.77513814 - batches: 103\n", - "Epoch 1/5 - 5.51s - loss: 0.30138606 - acc: 0.87715614 - val_loss: 0.28858984 - val_acc: 0.8747491 - val_f1: 0.819081 - val_tpr: 0.789548 - batches: 103\n", - "Epoch 2/5 - 5.37s - loss: 0.29324576 - acc: 0.87968993 - val_loss: 0.28451642 - val_acc: 0.8766811 - val_f1: 0.82239383 - val_tpr: 0.79497665 - batches: 103\n", - "Epoch 3/5 - 5.38s - loss: 0.28977352 - acc: 0.88131446 - val_loss: 0.2825411 - val_acc: 0.87826157 - val_f1: 0.8243148 - val_tpr: 0.7951459 - batches: 103\n", - "Epoch 4/5 - 5.38s - loss: 0.2876302 - acc: 0.88208383 - val_loss: 0.28134403 - val_acc: 0.878595 - val_f1: 0.82474065 - val_tpr: 0.79545283 - batches: 103\n" + "cat: /home/root/annotator_logs/MultiClassifierDLApproach_d670b2c2d0df.log: No such file or 
directory\n" ] } ], @@ -402,10 +333,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "F2x-_A5ZuSIX" - }, + "metadata": {}, "source": [ "Let's save our trained multi-label classifier model to be loaded in our prediction pipeline:" ] @@ -413,22 +341,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "IWkBcvA_1OCV" - }, + "metadata": {}, "outputs": [], "source": [ - "pipelineModel.stages[-1].write().overwrite().save('/content/tmp_multi_classifierDL_model')" + "pipelineModel.stages[-1].write().overwrite().save('tmp_multi_classifierDL_model')" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ppQ211AW1OCg" - }, + "metadata": {}, "source": [ "## load saved pipeline" ] @@ -436,15 +357,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "RglS6Ohj1OCi", - "outputId": "4c685ef1-7d32-424e-dd91-e9987435ccbb" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -465,7 +378,7 @@ " .setInputCols([\"document\"])\\\n", " .setOutputCol(\"sentence_embeddings\")\n", "\n", - "multiClassifier = MultiClassifierDLModel.load(\"/content/tmp_multi_classifierDL_model\") \\\n", + "multiClassifier = MultiClassifierDLModel.load(\"tmp_multi_classifierDL_model\") \\\n", " .setInputCols([\"sentence_embeddings\"])\\\n", " .setOutputCol(\"category\")\\\n", " .setThreshold(0.5)\n", @@ -480,10 +393,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ATJtZgiBwU-_" - }, + "metadata": {}, "source": [ "Let's now use our testing datasets to evaluate our model:" ] @@ -491,15 +401,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "colab_type": "code", - "id": "NDA_F1SD1OCm", - "outputId": "47b04e03-9ac3-46b3-88df-12dc887e493c" - 
}, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -517,11 +419,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "JJYBe_hq1OCo" - }, + "metadata": {}, "outputs": [], "source": [ "preds = pipeline.fit(testDataset).transform(testDataset)\n" @@ -530,15 +428,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 187 - }, - "colab_type": "code", - "id": "QSdIxWPV1OCq", - "outputId": "7165c72b-6aa4-4868-8d3c-c57a5acb6a9d" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -547,9 +437,7 @@ "+----------------+--------------------+----------------+\n", "| labels| text| result|\n", "+----------------+--------------------+----------------+\n", - "| [toxic]|Vegan \n", - "\n", - "What in t...| [toxic]|\n", + "| [toxic]|Vegan \\n\\nWhat in...| [toxic]|\n", "|[toxic, obscene]|Fight Club! F**k ...|[toxic, obscene]|\n", "+----------------+--------------------+----------------+\n", "only showing top 2 rows\n", @@ -564,15 +452,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 306 - }, - "colab_type": "code", - "id": "hHh1vMkN1OCs", - "outputId": "50619054-3488-41cb-e8ca-78dcdd19e233" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -620,34 +500,26 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 309 - }, - "colab_type": "code", - "id": "LR7PpSll1OCy", - "outputId": "9f672c9e-a9a5-402e-f65c-af580cb41cc9" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - 
"+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|metadata |\n", - "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[[obscene -> 0.07668711, identity_hate -> 0.08003419, toxic -> 0.8547159, insult -> 0.14573382, severe_toxic -> 0.010274827, threat -> 0.0013722687, sentence -> 0]] |\n", - "|[[obscene -> 0.74973583, identity_hate -> 0.026573237, toxic -> 0.97452515, insult -> 0.4271415, severe_toxic -> 0.07580829, threat -> 0.012425529, sentence -> 0], [obscene -> 0.74973583, 
identity_hate -> 0.026573237, toxic -> 0.97452515, insult -> 0.4271415, severe_toxic -> 0.07580829, threat -> 0.012425529, sentence -> 0]] |\n", - "|[[obscene -> 0.2895946, identity_hate -> 0.017944919, toxic -> 0.88083005, insult -> 0.34860942, severe_toxic -> 0.012507909, threat -> 0.0027540186, sentence -> 0]] |\n", - "|[[obscene -> 0.14852583, identity_hate -> 0.13101593, toxic -> 0.93538547, insult -> 0.36898047, severe_toxic -> 0.020003503, threat -> 0.0014350729, sentence -> 0]] |\n", - "|[[obscene -> 0.2026581, identity_hate -> 0.0071552373, toxic -> 0.9020695, insult -> 0.20001398, severe_toxic -> 0.014318457, threat -> 0.0016921534, sentence -> 0]] |\n", - "|[[obscene -> 0.27696964, identity_hate -> 0.014545166, toxic -> 0.82669973, insult -> 0.26631594, severe_toxic -> 0.041005336, threat -> 0.038255215, sentence -> 0]] |\n", - "|[[obscene -> 0.992353, identity_hate -> 0.5780954, toxic -> 0.9924388, insult -> 0.92266214, severe_toxic -> 0.60503715, threat -> 0.058374558, sentence -> 0], [obscene -> 0.992353, identity_hate -> 0.5780954, toxic -> 0.9924388, insult -> 0.92266214, severe_toxic -> 0.60503715, threat -> 0.058374558, sentence -> 0], [obscene -> 0.992353, identity_hate -> 0.5780954, toxic -> 0.9924388, insult -> 0.92266214, severe_toxic -> 0.60503715, threat -> 0.058374558, sentence -> 0], [obscene -> 0.992353, identity_hate -> 0.5780954, toxic -> 0.9924388, insult -> 0.92266214, severe_toxic -> 0.60503715, threat -> 0.058374558, sentence -> 0], [obscene -> 0.992353, identity_hate -> 0.5780954, toxic -> 0.9924388, insult -> 0.92266214, severe_toxic -> 0.60503715, threat -> 0.058374558, sentence -> 0]]|\n", - "|[[obscene -> 0.11543953, identity_hate -> 0.019395102, toxic -> 0.9097985, insult -> 0.1980844, severe_toxic -> 0.007957691, threat -> 0.007705507, sentence -> 0]] |\n", - "|[[obscene -> 0.835811, identity_hate -> 0.0037145552, toxic -> 0.9678078, insult -> 0.55136216, severe_toxic -> 0.03057244, threat -> 3.7179954E-4, sentence -> 0], 
[obscene -> 0.835811, identity_hate -> 0.0037145552, toxic -> 0.9678078, insult -> 0.55136216, severe_toxic -> 0.03057244, threat -> 3.7179954E-4, sentence -> 0], [obscene -> 0.835811, identity_hate -> 0.0037145552, toxic -> 0.9678078, insult -> 0.55136216, severe_toxic -> 0.03057244, threat -> 3.7179954E-4, sentence -> 0]] |\n", - "|[[obscene -> 0.63264567, identity_hate -> 0.00646477, toxic -> 0.94940895, insult -> 0.5641152, severe_toxic -> 0.032555852, threat -> 0.0070275636, sentence -> 0], [obscene -> 0.63264567, identity_hate -> 0.00646477, toxic -> 0.94940895, insult -> 0.5641152, severe_toxic -> 0.032555852, threat -> 0.0070275636, sentence -> 0], [obscene -> 0.63264567, identity_hate -> 0.00646477, toxic -> 0.94940895, insult -> 0.5641152, severe_toxic -> 0.032555852, threat -> 0.0070275636, sentence -> 0]] |\n", - "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + 
"+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|metadata |\n", + "+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{obscene -> 0.07668713, identity_hate -> 0.08003414, toxic -> 0.8547158, insult -> 0.1457338, severe_toxic -> 0.010274887, threat -> 0.0013722777, sentence -> 0}] |\n", + "|[{obscene -> 0.74973583, identity_hate -> 0.026573211, toxic -> 0.9745251, insult -> 0.4271415, severe_toxic -> 0.07580829, threat -> 0.0124256015, sentence -> 0}, {obscene -> 
0.74973583, identity_hate -> 0.026573211, toxic -> 0.9745251, insult -> 0.4271415, severe_toxic -> 0.07580829, threat -> 0.0124256015, sentence -> 0}] |\n", + "|[{obscene -> 0.2895946, identity_hate -> 0.017944932, toxic -> 0.88083, insult -> 0.34860942, severe_toxic -> 0.012507945, threat -> 0.0027540624, sentence -> 0}] |\n", + "|[{obscene -> 0.14852577, identity_hate -> 0.13101593, toxic -> 0.9353854, insult -> 0.36898047, severe_toxic -> 0.020003498, threat -> 0.001435101, sentence -> 0}] |\n", + "|[{obscene -> 0.20265803, identity_hate -> 0.0071552694, toxic -> 0.9020696, insult -> 0.20001402, severe_toxic -> 0.014318436, threat -> 0.0016921461, sentence -> 0}] |\n", + "|[{obscene -> 0.2769695, identity_hate -> 0.014545143, toxic -> 0.82669973, insult -> 0.26631594, severe_toxic -> 0.041005254, threat -> 0.038255185, sentence -> 0}] |\n", + "|[{obscene -> 0.99235296, identity_hate -> 0.57809556, toxic -> 0.9924389, insult -> 0.9226622, severe_toxic -> 0.6050372, threat -> 0.058374584, sentence -> 0}, {obscene -> 0.99235296, identity_hate -> 0.57809556, toxic -> 0.9924389, insult -> 0.9226622, severe_toxic -> 0.6050372, threat -> 0.058374584, sentence -> 0}, {obscene -> 0.99235296, identity_hate -> 0.57809556, toxic -> 0.9924389, insult -> 0.9226622, severe_toxic -> 0.6050372, threat -> 0.058374584, sentence -> 0}, {obscene -> 0.99235296, identity_hate -> 0.57809556, toxic -> 0.9924389, insult -> 0.9226622, severe_toxic -> 0.6050372, threat -> 0.058374584, sentence -> 0}, {obscene -> 0.99235296, identity_hate -> 0.57809556, toxic -> 0.9924389, insult -> 0.9226622, severe_toxic -> 0.6050372, threat -> 0.058374584, sentence -> 0}]|\n", + "|[{obscene -> 0.115439504, identity_hate -> 0.019395113, toxic -> 0.90979856, insult -> 0.1980845, severe_toxic -> 0.007957667, threat -> 0.00770548, sentence -> 0}] |\n", + "|[{obscene -> 0.835811, identity_hate -> 0.0037145615, toxic -> 0.9678079, insult -> 0.5513622, severe_toxic -> 0.030572414, threat -> 3.7184358E-4, 
sentence -> 0}, {obscene -> 0.835811, identity_hate -> 0.0037145615, toxic -> 0.9678079, insult -> 0.5513622, severe_toxic -> 0.030572414, threat -> 3.7184358E-4, sentence -> 0}, {obscene -> 0.835811, identity_hate -> 0.0037145615, toxic -> 0.9678079, insult -> 0.5513622, severe_toxic -> 0.030572414, threat -> 3.7184358E-4, sentence -> 0}] |\n", + "|[{obscene -> 0.6326457, identity_hate -> 0.006464809, toxic -> 0.94940895, insult -> 0.56411535, severe_toxic -> 0.03255585, threat -> 0.0070275366, sentence -> 0}, {obscene -> 0.6326457, identity_hate -> 0.006464809, toxic -> 0.94940895, insult -> 0.56411535, severe_toxic -> 0.03255585, threat -> 0.0070275366, sentence -> 0}, {obscene -> 0.6326457, identity_hate -> 0.006464809, toxic -> 0.94940895, insult -> 0.56411535, severe_toxic -> 0.03255585, threat -> 0.0070275366, sentence -> 0}] |\n", + "+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "only showing top 10 rows\n", "\n" ] @@ -660,15 +532,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "colab_type": "code", - "id": "73Gm6Rno1OC0", - "outputId": "6d0934e0-2190-47f7-fa60-7be3e7782465" - }, + "metadata": {}, "outputs": [ { 
"name": "stdout", @@ -686,17 +550,6 @@ "source": [ "preds.select(\"category.metadata\").printSchema()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "A59H3EMd1OC7" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -720,8 +573,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" }, "name": "MultiClassifierDL_Train_multi_label_toxic_classifier", "notebookId": 1952370652427552, diff --git a/examples/python/training/english/classification/SentimentDL_Train_and_Evaluate.ipynb b/examples/python/training/english/classification/SentimentDL_Train_and_Evaluate.ipynb index 55ba22d3264d77..8bd9d8a0674e90 100644 --- a/examples/python/training/english/classification/SentimentDL_Train_and_Evaluate.ipynb +++ b/examples/python/training/english/classification/SentimentDL_Train_and_Evaluate.ipynb @@ -1,39 +1,20 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "ph3bDypIEXdd" - }, - "source": [ - "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aaVmDt1TEXdh" - }, - "source": [ - "# Spark NLP\n", - "### Multi-class Sentiment Classification\n", - "#### By using SentimentDL" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jmo3o-b3MF5W" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/SentimentDL_Train_and_Evaluate.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/SentimentDL_Train_and_Evaluate.ipynb)\n", + "\n", + "# Multi-class Sentiment Classification using SentimentDL" ] }, { "cell_type": "markdown", - "metadata": { - "id": "h4fQwZ46x4fu" - }, + "metadata": {}, "source": [ "Only run this block if you are inside Google Colab otherwise skip it" ] @@ -44,36 +25,32 @@ "metadata": {}, "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "id": "-pDX9mP2yqwE" - }, + "metadata": {}, "source": [ "In this notebook we are going to check the training logs on the fly. Thus, we start a session with `real_time_output=True`" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Hmvv5Q4jMF5b", - "outputId": "90c8b6f7-3898-4aa0-8802-22cc01b33a43" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 4.1.0\n", - "Apache Spark version 3.2.1\n" + "23/02/20 18:01:18 WARN Utils: Your hostname, duc-manjaro resolves to a loopback address: 127.0.1.1; using 192.168.0.34 instead (on interface enp3s0)\n", + "23/02/20 18:01:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", + "Spark NLP version: 4.3.1\n", + "Apache Spark version 3.3.0\n", + ":: loading settings :: url = jar:file:/home/root/.conda/envs/sparknlp/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n", + "23/02/20 18:01:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" ] } ], @@ -88,38 +65,31 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "xKQcm8R6MF5e" - }, + "metadata": {}, "source": [ "Let's download IMDB movie reviews dataset for training our multi-class sentiment classifier" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "W0FkrTb4MF5f", - "outputId": "1be5081f-fcc9-4355-e839-a4657ff3d600" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2022-09-23 19:33:40-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_train.csv\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.230.64\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.230.64|:443... connected.\n", + "--2023-02-20 18:01:23-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_train.csv\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.141.0, 52.216.18.187, 52.217.165.168, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.141.0|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 33497180 (32M) [text/csv]\n", "Saving to: ‘aclimdb_train.csv’\n", "\n", - "aclimdb_train.csv 100%[===================>] 31.95M 20.5MB/s in 1.6s \n", + "aclimdb_train.csv 100%[===================>] 31,95M 11,0MB/s in 2,9s \n", "\n", - "2022-09-23 19:33:42 (20.5 MB/s) - ‘aclimdb_train.csv’ saved [33497180/33497180]\n", + "2023-02-20 18:01:27 (11,0 MB/s) - ‘aclimdb_train.csv’ saved [33497180/33497180]\n", "\n" ] } @@ -130,29 +100,24 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "QDIQgMv6tuqu", - "outputId": "ebaa30c0-bdcb-416b-c23c-2fd87b69a9f5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2022-09-23 19:33:42-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_test.csv\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.81.83\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.81.83|:443... connected.\n", + "--2023-02-20 18:01:27-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_test.csv\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.172.80, 52.216.233.189, 52.216.171.29, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.172.80|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 32715164 (31M) [text/csv]\n", "Saving to: ‘aclimdb_test.csv’\n", "\n", - "aclimdb_test.csv 100%[===================>] 31.20M 21.3MB/s in 1.5s \n", + "aclimdb_test.csv 100%[===================>] 31,20M 15,8MB/s in 2,0s \n", "\n", - "2022-09-23 19:33:44 (21.3 MB/s) - ‘aclimdb_test.csv’ saved [32715164/32715164]\n", + "2023-02-20 18:01:30 (15,8 MB/s) - ‘aclimdb_test.csv’ saved [32715164/32715164]\n", "\n" ] } @@ -163,14 +128,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "QYolNmBtMF5h", - "outputId": "c2bd0784-9c7b-4ea8-904f-8b365b97c6c5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -190,19 +149,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "zWFUDI6jMF5k" - }, + "metadata": {}, "source": [ "The content is inside `text` column and the sentiment is inside `label` column" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "S9TRr7iAMF5l" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "trainDataset = spark.read \\\n", @@ -212,14 +167,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nURzgFJ7MF5o", - "outputId": "5c6e6122-64e5-4dea-cb09-562af61562d3" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -260,14 +209,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5NIHJuVKx4gk", - "outputId": "051fad87-ea23-4977-9a24-b3892b51f424" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -275,7 +218,7 @@ "25000" ] }, - "execution_count": 11, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -286,9 +229,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "UoJH3kA7RJpD" - }, + "metadata": {}, 
"source": [ "# Evaluation \n", "\n", @@ -297,10 +238,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "5HkV5BAiWPAo" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "imdb_test_dataset = spark.read \\\n", @@ -310,10 +249,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "0oKvNZaEMF5q" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", @@ -325,14 +262,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hg0GqUsIzUBP", - "outputId": "a9098f37-2e46-48fc-cbeb-1e177f79c8a4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -342,7 +273,7 @@ "Approximate size to download 923.7 MB\n", "[ | ]tfhub_use download started this may take some time.\n", "Approximate size to download 923.7 MB\n", - "[ \\ ]Download done! Loading the resource.\n", + "[ / ]Download done! Loading the resource.\n", "[OK!]\n" ] } @@ -363,14 +294,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "K3gciy-ezYFh", - "outputId": "5e27f1ee-2389-4143-b52c-a5de043579e4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -393,19 +318,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "823lZCT4zost" - }, + "metadata": {}, "source": [ "Now, that out test dataset has the required embeddings, we save it as parquet and use it while training our SentimentDL model." 
] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "SVM6Bdw1zwXF" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "test_dataset.write.parquet(\"./aclimdb_test.parquet\")" @@ -413,19 +334,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "DAUYQlXv0NpJ" - }, + "metadata": {}, "source": [ "Now let's train it and use a validation and the test dataset above for evaluation" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "H30A4FgNMF5t" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# the classes/labels/categories are in category column\n", @@ -449,105 +366,99 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kia7NpRJMF5v", - "outputId": "4ed55eed-8bac-4cac-d551-d8b90fd71199" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 20000\n", - "Epoch 1/5 - 4.03s - loss: 152.93646 - acc: 0.8290765 - batches: 313\n", + "Epoch 1/5 - 1.42s - loss: 147.28938 - acc: 0.82757413 - batches: 313\n", "Quality on validation dataset (20.0%), validation examples = 5000\n", - "time to finish evaluation: 0.34s\n", + "time to finish evaluation: 0.06s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "1 2198\t 477\t 280\t 0.8216822\t 0.8870056\t 0.8530953\n", - "0 2045\t 280\t 477\t 0.8795699\t 0.8108644\t 0.8438209\n", - "tp: 4243 fp: 757 fn: 757 labels: 2\n", - "Macro-average\t prec: 0.85062605, rec: 0.848935, f1: 0.84977967\n", - "Micro-average\t prec: 0.8486, recall: 0.8486, f1: 0.8486\n", + "1 1968\t 263\t 519\t 0.88211566\t 0.79131484\t 0.8342518\n", + "0 2250\t 519\t 263\t 0.8125677\t 0.8953442\t 0.85195\n", + "tp: 4218 fp: 782 fn: 782 labels: 2\n", + "Macro-average\t prec: 0.84734166, rec: 0.84332955, f1: 0.8453309\n", + 
"Micro-average\t prec: 0.8436, recall: 0.8436, f1: 0.8436\n", "Quality on test dataset: \n", - "time to finish evaluation: 1.18s\n", + "time to finish evaluation: 0.24s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "1 10943\t 2350\t 1557\t 0.82321525\t 0.87544\t 0.84852475\n", - "0 10150\t 1557\t 2350\t 0.86700267\t 0.812\t 0.8386004\n", - "tp: 21093 fp: 3907 fn: 3907 labels: 2\n", - "Macro-average\t prec: 0.845109, rec: 0.84371996, f1: 0.84441394\n", - "Micro-average\t prec: 0.84372, recall: 0.84372, f1: 0.84372\n", - "Epoch 2/5 - 4.63s - loss: 144.18388 - acc: 0.8535156 - batches: 313\n", + "1 9922\t 1415\t 2578\t 0.87518746\t 0.79376\t 0.83248734\n", + "0 11085\t 2578\t 1415\t 0.81131524\t 0.8868\t 0.8473799\n", + "tp: 21007 fp: 3993 fn: 3993 labels: 2\n", + "Macro-average\t prec: 0.84325135, rec: 0.84028, f1: 0.8417631\n", + "Micro-average\t prec: 0.84028, recall: 0.84028, f1: 0.84028\n", + "Epoch 2/5 - 1.38s - loss: 134.46562 - acc: 0.8528145 - batches: 313\n", "Quality on validation dataset (20.0%), validation examples = 5000\n", - "time to finish evaluation: 0.30s\n", + "time to finish evaluation: 0.05s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "1 2139\t 402\t 339\t 0.84179455\t 0.86319613\t 0.852361\n", - "0 2120\t 339\t 402\t 0.8621391\t 0.8406027\t 0.85123473\n", - "tp: 4259 fp: 741 fn: 741 labels: 2\n", - "Macro-average\t prec: 0.85196686, rec: 0.8518994, f1: 0.8519331\n", - "Micro-average\t prec: 0.8518, recall: 0.8518, f1: 0.8518\n", + "1 2084\t 334\t 403\t 0.86186934\t 0.8379574\t 0.84974515\n", + "0 2179\t 403\t 334\t 0.84391946\t 0.8670911\t 0.8553484\n", + "tp: 4263 fp: 737 fn: 737 labels: 2\n", + "Macro-average\t prec: 0.8528944, rec: 0.8525243, f1: 0.8527093\n", + "Micro-average\t prec: 0.8526, recall: 0.8526, f1: 0.8526\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.95s\n", + "time to finish evaluation: 0.22s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "1 10709\t 1976\t 1791\t 0.84422547\t 0.85672\t 0.85042685\n", 
- "0 10524\t 1791\t 1976\t 0.8545676\t 0.84192\t 0.8481966\n", - "tp: 21233 fp: 3767 fn: 3767 labels: 2\n", - "Macro-average\t prec: 0.8493965, rec: 0.84932, f1: 0.84935826\n", - "Micro-average\t prec: 0.84932, recall: 0.84932, f1: 0.84932\n", - "Epoch 3/5 - 3.45s - loss: 141.53355 - acc: 0.8609776 - batches: 313\n", + "1 10519\t 1829\t 1981\t 0.8518788\t 0.84152\t 0.8466677\n", + "0 10671\t 1981\t 1829\t 0.84342396\t 0.85368\t 0.848521\n", + "tp: 21190 fp: 3810 fn: 3810 labels: 2\n", + "Macro-average\t prec: 0.84765136, rec: 0.8476, f1: 0.8476257\n", + "Micro-average\t prec: 0.8476, recall: 0.8476, f1: 0.8476\n", + "Epoch 3/5 - 1.96s - loss: 131.90747 - acc: 0.86177886 - batches: 313\n", "Quality on validation dataset (20.0%), validation examples = 5000\n", - "time to finish evaluation: 0.29s\n", + "time to finish evaluation: 0.07s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "1 2134\t 393\t 344\t 0.8444796\t 0.8611784\t 0.85274726\n", - "0 2129\t 344\t 393\t 0.8608977\t 0.8441713\t 0.8524524\n", - "tp: 4263 fp: 737 fn: 737 labels: 2\n", - "Macro-average\t prec: 0.85268867, rec: 0.85267484, f1: 0.85268176\n", - "Micro-average\t prec: 0.8526, recall: 0.8526, f1: 0.8526\n", + "1 2136\t 377\t 351\t 0.8499801\t 0.8588661\t 0.8544\n", + "0 2136\t 351\t 377\t 0.8588661\t 0.8499801\t 0.8544\n", + "tp: 4272 fp: 728 fn: 728 labels: 2\n", + "Macro-average\t prec: 0.8544231, rec: 0.8544231, f1: 0.8544231\n", + "Micro-average\t prec: 0.8544, recall: 0.8544, f1: 0.8544\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.86s\n", + "time to finish evaluation: 0.33s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "1 10683\t 1872\t 1817\t 0.85089606\t 0.85464\t 0.8527639\n", - "0 10628\t 1817\t 1872\t 0.8539976\t 0.85024\t 0.8521146\n", - "tp: 21311 fp: 3689 fn: 3689 labels: 2\n", - "Macro-average\t prec: 0.8524468, rec: 0.85244, f1: 0.85244346\n", - "Micro-average\t prec: 0.85244, recall: 0.85244, f1: 0.85244\n", - "Epoch 4/5 - 3.46s - loss: 139.95955 - acc: 
0.8659856 - batches: 313\n", + "1 10759\t 2033\t 1741\t 0.84107256\t 0.86072\t 0.85078293\n", + "0 10467\t 1741\t 2033\t 0.8573886\t 0.83736\t 0.84725595\n", + "tp: 21226 fp: 3774 fn: 3774 labels: 2\n", + "Macro-average\t prec: 0.8492306, rec: 0.84904003, f1: 0.84913534\n", + "Micro-average\t prec: 0.84904, recall: 0.84904, f1: 0.84904\n", + "Epoch 4/5 - 1.73s - loss: 130.34096 - acc: 0.86708736 - batches: 313\n", "Quality on validation dataset (20.0%), validation examples = 5000\n", - "time to finish evaluation: 0.25s\n", + "time to finish evaluation: 0.06s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "1 2151\t 399\t 327\t 0.8435294\t 0.8680387\t 0.8556086\n", - "0 2123\t 327\t 399\t 0.8665306\t 0.8417922\t 0.8539823\n", - "tp: 4274 fp: 726 fn: 726 labels: 2\n", - "Macro-average\t prec: 0.85503, rec: 0.8549155, f1: 0.85497284\n", - "Micro-average\t prec: 0.8548, recall: 0.8548, f1: 0.8548\n", + "1 2167\t 409\t 320\t 0.8412267\t 0.8713309\t 0.8560142\n", + "0 2104\t 320\t 409\t 0.8679868\t 0.8372463\t 0.85233945\n", + "tp: 4271 fp: 729 fn: 729 labels: 2\n", + "Macro-average\t prec: 0.85460675, rec: 0.8542886, f1: 0.85444766\n", + "Micro-average\t prec: 0.8542, recall: 0.8542, f1: 0.8542\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.86s\n", + "time to finish evaluation: 0.25s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "1 10728\t 1927\t 1772\t 0.8477282\t 0.85824\t 0.85295177\n", - "0 10573\t 1772\t 1927\t 0.8564601\t 0.84584\t 0.8511169\n", - "tp: 21301 fp: 3699 fn: 3699 labels: 2\n", - "Macro-average\t prec: 0.8520942, rec: 0.85204, f1: 0.85206705\n", - "Micro-average\t prec: 0.85204, recall: 0.85204, f1: 0.85204005\n", - "Epoch 5/5 - 3.57s - loss: 138.94417 - acc: 0.87184495 - batches: 313\n", + "1 10896\t 2135\t 1604\t 0.83615994\t 0.87168\t 0.85355055\n", + "0 10365\t 1604\t 2135\t 0.8659871\t 0.8292\t 0.84719443\n", + "tp: 21261 fp: 3739 fn: 3739 labels: 2\n", + "Macro-average\t prec: 0.8510735, rec: 0.85044, f1: 0.85075665\n", + 
"Micro-average\t prec: 0.85044, recall: 0.85044, f1: 0.85044\n", + "Epoch 5/5 - 1.58s - loss: 129.25305 - acc: 0.8711438 - batches: 313\n", "Quality on validation dataset (20.0%), validation examples = 5000\n", - "time to finish evaluation: 0.27s\n", + "time to finish evaluation: 0.05s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "1 2160\t 400\t 318\t 0.84375\t 0.8716707\t 0.85748315\n", - "0 2122\t 318\t 400\t 0.8696721\t 0.84139574\t 0.85530025\n", - "tp: 4282 fp: 718 fn: 718 labels: 2\n", - "Macro-average\t prec: 0.85671103, rec: 0.8565332, f1: 0.85662216\n", - "Micro-average\t prec: 0.8564, recall: 0.8564, f1: 0.8564\n", + "1 2172\t 410\t 315\t 0.84120834\t 0.8733414\t 0.8569737\n", + "0 2103\t 315\t 410\t 0.8697271\t 0.8368484\t 0.852971\n", + "tp: 4275 fp: 725 fn: 725 labels: 2\n", + "Macro-average\t prec: 0.8554677, rec: 0.8550949, f1: 0.85528123\n", + "Micro-average\t prec: 0.855, recall: 0.855, f1: 0.855\n", "Quality on test dataset: \n", - "time to finish evaluation: 0.86s\n", + "time to finish evaluation: 0.31s\n", "label tp\t fp\t fn\t prec\t rec\t f1\n", - "1 10769\t 1970\t 1731\t 0.84535676\t 0.86152\t 0.85336185\n", - "0 10530\t 1731\t 1970\t 0.8588207\t 0.8424\t 0.8505311\n", - "tp: 21299 fp: 3701 fn: 3701 labels: 2\n", - "Macro-average\t prec: 0.8520887, rec: 0.85196, f1: 0.8520244\n", - "Micro-average\t prec: 0.85196, recall: 0.85196, f1: 0.85196\n" + "1 10935\t 2176\t 1565\t 0.8340325\t 0.8748\t 0.85392994\n", + "0 10324\t 1565\t 2176\t 0.8683657\t 0.82592\t 0.8466112\n", + "tp: 21259 fp: 3741 fn: 3741 labels: 2\n", + "Macro-average\t prec: 0.8511991, rec: 0.85036004, f1: 0.8507794\n", + "Micro-average\t prec: 0.85036, recall: 0.85036, f1: 0.85036\n" ] } ], @@ -557,9 +468,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "zMLuwQSCB05e" - }, + "metadata": {}, "source": [ "# How to use already trained SentimentDL pipeline or its model\n", "\n", @@ -570,19 +479,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "4I7COUCPCPe3" - 
}, + "metadata": {}, "source": [ "## Save and load pre-trained SentimentDL pipeline" ] }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "id": "QTDQ3riLD-zW" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Google Colab is free so it comes with a little memory. \n", @@ -595,45 +500,32 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "TI9JR8AoLbW3" - }, + "metadata": {}, "source": [ "# Save and load pre-trained SentimentDL model" ] }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "id": "3r3_q4CJLkZR" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# hdfs:/ if you are saving it on distributed file systems in Hadoop\n", - "pipelineModel.stages[-1].write().overwrite().save('./tmp_sentimentdl_model')\n", - "\n" + "pipelineModel.stages[-1].write().overwrite().save('./tmp_sentimentdl_model')" ] }, { "cell_type": "markdown", - "metadata": { - "id": "3JaclNFsQJ-X" - }, + "metadata": {}, "source": [ "Let's use our pre-trained SentimentDLModel in a pipeline: " ] }, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "NTJ53PbYQI-f", - "outputId": "e235cc7e-5be2-49f0-a1dd-ec362804930e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -670,19 +562,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "VOrjIsKXHea8" - }, + "metadata": {}, "source": [ "Now let's load it back so we can have prediction all together with everything in that pipeline:" ] }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "id": "ccy54HeERCZ1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from pyspark.sql.types import StringType\n", @@ -695,10 +583,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "id": "3BsNAWS4VRkd" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "prediction = 
pipeline.fit(dfTest).transform(dfTest)" @@ -706,14 +592,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nz32PDjEVUTk", - "outputId": "6700b1cc-08f2-41b0-e334-8505ca630bd8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -726,12 +606,12 @@ "|[negative]|\n", "+----------+\n", "\n", - "+------------------------------------------------------------------+\n", - "|metadata |\n", - "+------------------------------------------------------------------+\n", - "|[{sentence -> 0, positive -> 1.0, negative -> 2.8792261E-8}] |\n", - "|[{sentence -> 0, positive -> 1.8572706E-5, negative -> 0.9999814}]|\n", - "+------------------------------------------------------------------+\n", + "+-------------------------------------------------------------------+\n", + "|metadata |\n", + "+-------------------------------------------------------------------+\n", + "|[{sentence -> 0, positive -> 1.0, negative -> 2.8575936E-8}] |\n", + "|[{sentence -> 0, positive -> 1.2174318E-5, negative -> 0.99998784}]|\n", + "+-------------------------------------------------------------------+\n", "\n" ] } @@ -763,8 +643,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" }, "nteract": { "version": "0.21.0" diff --git a/examples/python/training/english/classification/SentimentDL_train_multiclass_sentiment_classifier.ipynb b/examples/python/training/english/classification/SentimentDL_train_multiclass_sentiment_classifier.ipynb index 9ba2148d7bf98f..1a3d57865e082d 100644 --- a/examples/python/training/english/classification/SentimentDL_train_multiclass_sentiment_classifier.ipynb +++ b/examples/python/training/english/classification/SentimentDL_train_multiclass_sentiment_classifier.ipynb @@ -1,99 +1,45 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - 
"metadata": { - "colab_type": "text", - "id": "ph3bDypIEXdd" - }, + "metadata": {}, "source": [ - "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "aaVmDt1TEXdh" - }, - "source": [ - "# Spark NLP\n", - "### Multi-class Sentiment Classification\n", - "#### By using SentimentDL" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "jmo3o-b3MF5W" - }, - "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/SentimentDL_train_multiclass_sentiment_classifier.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/SentimentDL_train_multiclass_sentiment_classifier.ipynb)\n", + "\n", + "# Multi-class Sentiment Classification using SentimentDL" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "h4fQwZ46x4fu" - }, + "metadata": {}, "source": [ "Only run this block if you are inside Google Colab otherwise skip it" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "id": "MzishpT-MF5X", - "outputId": "3a2d6929-41dc-476c-c3fa-6d7afab8164a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "openjdk version \"1.8.0_252\"\n", - "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", - "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 56kB/s \n", - "\u001B[K 
|████████████████████████████████| 204kB 50.1MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 122kB 2.8MB/s \n", - "\u001B[?25h" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "id": "Hmvv5Q4jMF5b", - "outputId": "5fd4b51d-6248-49fc-de80-43f7c5baab4d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 2.5.1\n", - "Apache Spark version 2.4.4\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version 3.3.0\n" ] } ], @@ -103,46 +49,36 @@ "spark = sparknlp.start()\n", "\n", "print(\"Spark NLP version: \", sparknlp.version())\n", - "print(\"Apache Spark version\", spark.version)\n" + "print(\"Apache Spark version\", spark.version)" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "xKQcm8R6MF5e" - }, + "metadata": {}, "source": [ "Let's download IMDB movie reviews dataset for training our multi-class sentiment classifier" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 224 - }, - "colab_type": "code", - "id": "W0FkrTb4MF5f", - "outputId": "1ff0c760-8b45-41a9-d1b9-4d4654d8ff76" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2020-06-01 15:21:56-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_train.csv\n", - "Resolving s3.amazonaws.com 
(s3.amazonaws.com)... 52.216.236.101\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.236.101|:443... connected.\n", + "--2023-02-20 18:03:45-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_train.csv\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.79.54, 52.216.35.184, 52.217.205.136, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.79.54|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 33497180 (32M) [text/csv]\n", "Saving to: ‘aclimdb_train.csv’\n", "\n", - "aclimdb_train.csv 100%[===================>] 31.95M 112MB/s in 0.3s \n", + "aclimdb_train.csv 100%[===================>] 31,95M 13,1MB/s in 2,4s \n", "\n", - "2020-06-01 15:21:57 (112 MB/s) - ‘aclimdb_train.csv’ saved [33497180/33497180]\n", + "2023-02-20 18:03:48 (13,1 MB/s) - ‘aclimdb_train.csv’ saved [33497180/33497180]\n", "\n" ] } @@ -153,34 +89,24 @@ }, { "cell_type": "code", - "execution_count": 99, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 224 - }, - "colab_type": "code", - "id": "QDIQgMv6tuqu", - "outputId": "6e986f7d-9304-4ba9-e61e-2b728442ad81" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2020-06-01 17:32:56-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_test.csv\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.37\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.37|:443... connected.\n", + "--2023-02-20 18:03:49-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_test.csv\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 
52.216.44.40, 54.231.171.192, 52.217.32.158, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.44.40|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 32715164 (31M) [text/csv]\n", "Saving to: ‘aclimdb_test.csv’\n", "\n", - "\r\n", - "aclimdb_test.csv 0%[ ] 0 --.-KB/s \r\n", - "aclimdb_test.csv 53%[=========> ] 16.83M 84.0MB/s \r\n", - "aclimdb_test.csv 100%[===================>] 31.20M 111MB/s in 0.3s \n", + "aclimdb_test.csv 100%[===================>] 31,20M 14,6MB/s in 2,1s \n", "\n", - "2020-06-01 17:32:56 (111 MB/s) - ‘aclimdb_test.csv’ saved [32715164/32715164]\n", + "2023-02-20 18:03:52 (14,6 MB/s) - ‘aclimdb_test.csv’ saved [32715164/32715164]\n", "\n" ] } @@ -191,16 +117,8 @@ }, { "cell_type": "code", - "execution_count": 100, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 122 - }, - "colab_type": "code", - "id": "QYolNmBtMF5h", - "outputId": "919b361d-6767-45ea-bfbb-bf7b0a7beb17" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -220,10 +138,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "zWFUDI6jMF5k" - }, + "metadata": {}, "source": [ "The content is inside `text` column and the sentiment is inside `label` column" ] @@ -231,11 +146,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "S9TRr7iAMF5l" - }, + "metadata": {}, "outputs": [], "source": [ "trainDataset = spark.read \\\n", @@ -245,16 +156,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 459 - }, - "colab_type": "code", - "id": "nURzgFJ7MF5o", - "outputId": "946800cf-033c-4285-dfd6-922b7bef778e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -295,16 +198,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/", - "height": 34 - }, - "colab_type": "code", - "id": "5NIHJuVKx4gk", - "outputId": "9a7f2d8b-3ddf-420a-cbc0-78d84cd533d4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -312,10 +207,8 @@ "25000" ] }, - "execution_count": 11, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "output_type": "execute_result" } ], @@ -326,11 +219,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "0oKvNZaEMF5q" - }, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", @@ -342,16 +231,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "H30A4FgNMF5t", - "outputId": "2a8d2ea2-aac4-4258-f94c-7d8eb8086beb" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -392,11 +273,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "kia7NpRJMF5v" - }, + "metadata": {}, "outputs": [], "source": [ "pipelineModel = pipeline.fit(trainDataset)" @@ -404,23 +281,86 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "id": "TOLU30ilMF5y", - "outputId": "2f2ae24b-9d00-45fb-c33c-f59ec3af2cf5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "total 4\n", - "-rw-r--r-- 1 root root 523 Jun 1 15:30 SentimentDLApproach_2ea7dc3149c2.log\n" + "total 288\n", + "-rw-r--r-- 1 root root 456 20. Feb 17:41 ClassifierDLApproach_0375e3a8df00.log\n", + "-rw-r--r-- 1 root root 918 20. Feb 17:38 ClassifierDLApproach_6fdb8a569309.log\n", + "-rw-r--r-- 1 root root 446 20. 
Feb 15:55 ClassifierDLApproach_97ff5c76d735.log\n", + "-rw-r--r-- 1 root root 438 20. Feb 17:38 ClassifierMetrics_09bd6fa2ecf7.log\n", + "-rw-r--r-- 1 root root 317 20. Feb 18:02 ClassifierMetrics_0b655035ca3d.log\n", + "-rw-r--r-- 1 root root 317 10. Feb 16:54 ClassifierMetrics_17606bbb7d1f.log\n", + "-rw-r--r-- 1 root root 571 20. Feb 17:45 ClassifierMetrics_176ce729caa6.log\n", + "-rw-r--r-- 1 root root 327 20. Feb 18:02 ClassifierMetrics_19e043d5e316.log\n", + "-rw-r--r-- 1 root root 313 10. Feb 16:54 ClassifierMetrics_1a6c515483ae.log\n", + "-rw-r--r-- 1 root root 441 20. Feb 17:38 ClassifierMetrics_1e0c8ea78e67.log\n", + "-rw-r--r-- 1 root root 323 10. Feb 16:54 ClassifierMetrics_2530315112a8.log\n", + "-rw-r--r-- 1 root root 566 20. Feb 17:45 ClassifierMetrics_26e8744dc78c.log\n", + "-rw-r--r-- 1 root root 565 20. Feb 17:45 ClassifierMetrics_284f041511fb.log\n", + "-rw-r--r-- 1 root root 445 20. Feb 17:38 ClassifierMetrics_2b7b458fc84d.log\n", + "-rw-r--r-- 1 root root 551 20. Feb 17:45 ClassifierMetrics_2fde2811a93c.log\n", + "-rw-r--r-- 1 root root 323 20. Feb 18:02 ClassifierMetrics_3567e301a9ec.log\n", + "-rw-r--r-- 1 root root 133 20. Feb 17:52 ClassifierMetrics_387f03f0b7a0.log\n", + "-rw-r--r-- 1 root root 314 10. Feb 16:54 ClassifierMetrics_3ccf43933a23.log\n", + "-rw-r--r-- 1 root root 132 20. Feb 17:59 ClassifierMetrics_41db1fc54c4f.log\n", + "-rw-r--r-- 1 root root 322 20. Feb 18:02 ClassifierMetrics_44e8954b5f5d.log\n", + "-rw-r--r-- 1 root root 559 20. Feb 17:45 ClassifierMetrics_49fdfe64394f.log\n", + "-rw-r--r-- 1 root root 449 20. Feb 17:38 ClassifierMetrics_4a2e4a7dac7c.log\n", + "-rw-r--r-- 1 root root 126 20. Feb 17:59 ClassifierMetrics_4a623cb68ecc.log\n", + "-rw-r--r-- 1 root root 323 20. Feb 18:02 ClassifierMetrics_536a85621ba7.log\n", + "-rw-r--r-- 1 root root 325 10. Feb 16:54 ClassifierMetrics_55c7e364bf2b.log\n", + "-rw-r--r-- 1 root root 128 20. Feb 17:52 ClassifierMetrics_66b22a01b7d3.log\n", + "-rw-r--r-- 1 root root 316 20. 
Feb 18:02 ClassifierMetrics_6a4855e04b2f.log\n", + "-rw-r--r-- 1 root root 129 20. Feb 17:59 ClassifierMetrics_6f4f96da828e.log\n", + "-rw-r--r-- 1 root root 555 20. Feb 17:45 ClassifierMetrics_71effbac2282.log\n", + "-rw-r--r-- 1 root root 129 20. Feb 17:59 ClassifierMetrics_73bcd38f71f7.log\n", + "-rw-r--r-- 1 root root 426 20. Feb 17:38 ClassifierMetrics_73fa92fe4be8.log\n", + "-rw-r--r-- 1 root root 433 20. Feb 17:38 ClassifierMetrics_7764aa9b23e3.log\n", + "-rw-r--r-- 1 root root 127 20. Feb 17:52 ClassifierMetrics_7dc198897be3.log\n", + "-rw-r--r-- 1 root root 570 20. Feb 17:45 ClassifierMetrics_80808e6b12d1.log\n", + "-rw-r--r-- 1 root root 445 20. Feb 17:38 ClassifierMetrics_890dcfe0db80.log\n", + "-rw-r--r-- 1 root root 444 20. Feb 17:38 ClassifierMetrics_8ecc3f83e12d.log\n", + "-rw-r--r-- 1 root root 325 10. Feb 16:54 ClassifierMetrics_9290b613e8d7.log\n", + "-rw-r--r-- 1 root root 567 20. Feb 17:45 ClassifierMetrics_9ba6210e2c94.log\n", + "-rw-r--r-- 1 root root 308 20. Feb 18:02 ClassifierMetrics_9e50a6c01b6a.log\n", + "-rw-r--r-- 1 root root 129 20. Feb 17:52 ClassifierMetrics_a579e188cf6b.log\n", + "-rw-r--r-- 1 root root 317 10. Feb 16:54 ClassifierMetrics_aa0e2812a3b9.log\n", + "-rw-r--r-- 1 root root 318 10. Feb 16:54 ClassifierMetrics_ad4cb4a650fa.log\n", + "-rw-r--r-- 1 root root 129 20. Feb 17:52 ClassifierMetrics_b901376087b3.log\n", + "-rw-r--r-- 1 root root 564 20. Feb 17:45 ClassifierMetrics_d302c6e17f10.log\n", + "-rw-r--r-- 1 root root 317 20. Feb 18:02 ClassifierMetrics_e0199b46eaa2.log\n", + "-rw-r--r-- 1 root root 452 20. Feb 17:38 ClassifierMetrics_e0da6952b2c6.log\n", + "-rw-r--r-- 1 root root 317 20. Feb 18:02 ClassifierMetrics_e0dd3cc9595e.log\n", + "-rw-r--r-- 1 root root 567 20. Feb 17:45 ClassifierMetrics_e29d5ee5fe87.log\n", + "-rw-r--r-- 1 root root 312 20. Feb 18:02 ClassifierMetrics_e2fa7c36f711.log\n", + "-rw-r--r-- 1 root root 312 10. Feb 16:54 ClassifierMetrics_efc7f6345e79.log\n", + "-rw-r--r-- 1 root root 319 10. 
Feb 16:54 ClassifierMetrics_f571876aaa09.log\n", + "-rw-r--r-- 1 root root 131 20. Feb 17:59 ClassifierMetrics_fbe1c172154f.log\n", + "-rw-r--r-- 1 root root 436 20. Feb 17:38 ClassifierMetrics_fdc5fa307baf.log\n", + "-rw-r--r-- 1 root root 922 20. Feb 17:45 MultiClassifierDLApproach_0420b23f4851.log\n", + "-rw-r--r-- 1 root root 792 20. Feb 17:52 MultiClassifierDLApproach_73f999799c2b.log\n", + "-rw-r--r-- 1 root root 792 20. Feb 17:59 MultiClassifierDLApproach_e6ae1c4549a9.log\n", + "-rw-r--r-- 1 root root 320 26. Okt 09:23 NerDL_0f47f69f09e6.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_10e337c8a3ef.log\n", + "-rw-r--r-- 1 root root 320 12. Jan 17:31 NerDL_18e7b1673dab.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_27f18f749174.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_3ae0321ce66a.log\n", + "-rw-r--r-- 1 root root 319 26. Okt 09:13 NerDL_568d747656b8.log\n", + "-rw-r--r-- 1 root root 320 26. Okt 09:03 NerDL_5970e276422f.log\n", + "-rw-r--r-- 1 root root 320 16. Jan 11:10 NerDL_759a68c3769d.log\n", + "-rw-r--r-- 1 root root 320 3. Nov 19:22 NerDL_891f9b941985.log\n", + "-rw-r--r-- 1 root root 320 2. Feb 2022 NerDL_8e8184f259cb.log\n", + "-rw-r--r-- 1 root root 320 27. Okt 13:02 NerDL_add5b34b2ecb.log\n", + "-rw-r--r-- 1 root root 320 21. Okt 19:06 NerDL_bc57a96c68c3.log\n", + "-rw-r--r-- 1 root root 320 12. Jan 16:47 NerDL_ff0a43f20378.log\n", + "-rw-r--r-- 1 root root 437 20. Feb 18:04 SentimentDLApproach_1955fb8515af.log\n", + "-rw-r--r-- 1 root root 899 20. Feb 18:02 SentimentDLApproach_1e4403144e6c.log\n", + "-rw-r--r-- 1 root root 897 10. 
Feb 16:54 SentimentDLApproach_98dfd2c1fdee.log\n" ] } ], @@ -430,27 +370,14 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "colab_type": "code", - "id": "c6TAuRqBNs4_", - "outputId": "adc8a02b-3c3c-4bcf-8aa4-adcafc4eb4e4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Training started - total epochs: 5 - learning rate: 0.005 - batch size: 64 - training examples: 25000\n", - "Epoch 0/5 - 7.261388839%.2fs - loss: 184.75143 - accuracy: 0.82907856 - batches: 391\n", - "Epoch 1/5 - 7.019650974%.2fs - loss: 174.16364 - accuracy: 0.85578525 - batches: 391\n", - "Epoch 2/5 - 6.98391997%.2fs - loss: 171.41266 - accuracy: 0.8602084 - batches: 391\n", - "Epoch 3/5 - 7.030380175%.2fs - loss: 170.09117 - accuracy: 0.86528045 - batches: 391\n", - "Epoch 4/5 - 7.01538049%.2fs - loss: 168.41052 - accuracy: 0.8704247 - batches: 391\n" + "cat: /home/root/annotator_logs/SentimentDLApproach_2ea7dc3149c2.log: No such file or directory\n" ] } ], @@ -460,10 +387,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "zMLuwQSCB05e" - }, + "metadata": {}, "source": [ "# How to use already trained SentimentDL pipeline or its model\n", "\n", @@ -474,10 +398,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "4I7COUCPCPe3" - }, + "metadata": {}, "source": [ "## Save and load pre-trained SentimentDL pipeline" ] @@ -485,11 +406,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "QTDQ3riLD-zW" - }, + "metadata": {}, "outputs": [], "source": [ "# Google Colab is free so it comes with a little memory. 
\n", @@ -502,10 +419,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "TI9JR8AoLbW3" - }, + "metadata": {}, "source": [ "# Save and load pre-trained SentimentDL model" ] @@ -513,11 +427,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "3r3_q4CJLkZR" - }, + "metadata": {}, "outputs": [], "source": [ "# hdfs:/ if you are saving it on distributed file systems in Hadoop\n", @@ -527,26 +437,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "3JaclNFsQJ-X" - }, + "metadata": {}, "source": [ "Let's use our pre-trained SentimentDLModel in a pipeline: " ] }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "id": "NTJ53PbYQI-f", - "outputId": "d372fbec-2c90-4101-856b-601ff539bc33" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -583,10 +482,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "VOrjIsKXHea8" - }, + "metadata": {}, "source": [ "Now let's load it back so we can have prediction all together with everything in that pipeline:" ] @@ -594,11 +490,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "ccy54HeERCZ1" - }, + "metadata": {}, "outputs": [], "source": [ "from pyspark.sql.types import StringType\n", @@ -612,11 +504,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "3BsNAWS4VRkd" - }, + "metadata": {}, "outputs": [], "source": [ "prediction = pipeline.fit(dfTest).transform(dfTest)" @@ -624,16 +512,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 255 - }, - "colab_type": "code", - "id": "nz32PDjEVUTk", - "outputId": 
"8c939612-eb7e-4c7e-a3e6-69df9bfcc467" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -646,12 +526,12 @@ "|[negative]|\n", "+----------+\n", "\n", - "+-----------------------------------------------------------------+\n", - "|metadata |\n", - "+-----------------------------------------------------------------+\n", - "|[[sentence -> 0, positive -> 1.0, negative -> 9.762569E-10]] |\n", - "|[[sentence -> 0, positive -> 4.49094E-5, negative -> 0.99995506]]|\n", - "+-----------------------------------------------------------------+\n", + "+------------------------------------------------------------------+\n", + "|metadata |\n", + "+------------------------------------------------------------------+\n", + "|[{sentence -> 0, positive -> 1.0, negative -> 1.7301151E-10}] |\n", + "|[{sentence -> 0, positive -> 7.5793296E-6, negative -> 0.9999924}]|\n", + "+------------------------------------------------------------------+\n", "\n" ] } @@ -664,10 +544,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "UoJH3kA7RJpD" - }, + "metadata": {}, "source": [ "# Evaluation \n", "\n", @@ -677,11 +554,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "5HkV5BAiWPAo" - }, + "metadata": {}, "outputs": [], "source": [ "testDataset = spark.read \\\n", @@ -692,11 +565,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "_aVPZXgst0-V" - }, + "metadata": {}, "outputs": [], "source": [ "preds = pipelineModel.transform(testDataset)" @@ -704,16 +573,8 @@ }, { "cell_type": "code", - "execution_count": 79, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 969 - }, - "colab_type": "code", - "id": "-H9UAWO_t-b9", - "outputId": "937b7588-a5c0-4688-87d1-176e7152e3b8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -770,7 +631,7 @@ 
"|negative|Back in the cold and creepy early 90's,a show c...|[negative]|\n", "|negative|If you're in the mood for a really bad porno wi...|[negative]|\n", "|negative|The bearings of western-style Feminism on the v...|[positive]|\n", - "|positive|In the changing world of CG and what-not of car...|[positive]|\n", + "|positive|In the changing world of CG and what-not of car...|[negative]|\n", "|positive|I wonder why I haven't heard of this movie befo...|[positive]|\n", "+--------+--------------------------------------------------+----------+\n", "only showing top 50 rows\n", @@ -785,11 +646,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "8-JF5_Y9uPFj" - }, + "metadata": {}, "outputs": [], "source": [ "preds_df = preds.select('label','text',\"class.result\").toPandas()" @@ -798,11 +655,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "CS2q_OajuZyO" - }, + "metadata": {}, "outputs": [], "source": [ "# The result is an array since in Spark NLP you can have multiple sentences.\n", @@ -814,10 +667,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "uf_s83c0sT91" - }, + "metadata": {}, "source": [ "The `SentimentDL` has the ability to accept a threshold to set a label on any result that is less than that number. 
\n", "\n", @@ -829,11 +679,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "-hmSFmRiqiZO" - }, + "metadata": {}, "outputs": [], "source": [ "preds_df = preds_df[preds_df['result'] != 'neutral']\n" @@ -842,11 +688,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "naAHGWV5ugNX" - }, + "metadata": {}, "outputs": [], "source": [ "# We are going to use sklearn to evalute the results on test dataset\n", @@ -855,26 +697,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "o2BiHF_sR3Cz" - }, + "metadata": {}, "source": [ "Let's use `classification_report` from `sklearn` to evaluate the final scores. (keep in mind due to limited resources on a free Google Colab we only used 5 Epochs :)" ] }, { "cell_type": "code", - "execution_count": 98, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 170 - }, - "colab_type": "code", - "id": "kLeO9u1bunPB", - "outputId": "71f9b831-f638-46b0-fad3-5e0a3b75835b" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -882,12 +713,12 @@ "text": [ " precision recall f1-score support\n", "\n", - " negative 0.87 0.86 0.86 12449\n", - " positive 0.85 0.87 0.86 12041\n", + " negative 0.88 0.85 0.86 12787\n", + " positive 0.84 0.87 0.86 11819\n", "\n", - " accuracy 0.86 24490\n", - " macro avg 0.86 0.86 0.86 24490\n", - "weighted avg 0.86 0.86 0.86 24490\n", + " accuracy 0.86 24606\n", + " macro avg 0.86 0.86 0.86 24606\n", + "weighted avg 0.86 0.86 0.86 24606\n", "\n" ] } @@ -895,17 +726,6 @@ "source": [ "print (classification_report(preds_df['result'], preds_df['label']))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "3Zlwshvwx4hu" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -929,8 +749,7 @@ "mimetype": "text/x-python", "name": 
"python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" }, "nteract": { "version": "0.21.0" diff --git a/examples/python/training/english/crf-ner/ner_dl_crf.ipynb b/examples/python/training/english/crf-ner/ner_dl_crf.ipynb index 919bc2fa92c1a1..75b8e5feb0d369 100644 --- a/examples/python/training/english/crf-ner/ner_dl_crf.ipynb +++ b/examples/python/training/english/crf-ner/ner_dl_crf.ipynb @@ -1,96 +1,50 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "m32x7R0tyHH6" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/crf-ner/ner_dl_crf.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/crf-ner/ner_dl_crf.ipynb)\n", "\n", - "## 0. Colab Setup" + "# CRF Named Entity Recognition" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "95EqKJCoySwe", - "outputId": "45aa98d4-e6be-49f8-f491-29ba3af04171" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 15:09:40-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 
302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 15:09:40-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 15:09:41-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 15:09:41 (72.2 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 67 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 74.8 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 58.6 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ + "# Only run this Cell when you are using Spark NLP on Google Colab\n", "! 
wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "bxBXISVpyHIA" - }, + "metadata": {}, "source": [ - "## CRF Named Entity Recognition\n", "In the following example, we walk-through a Conditional Random Fields NER model training and prediction.\n", "\n", "This challenging annotator will require the user to provide either a labeled dataset during fit() stage, or use external CoNLL 2003 resources to train. It may optionally use an external word embeddings set and a list of additional entities.\n", "\n", - "The CRF Annotator will also require Part-of-speech tags so we add those in the same Pipeline.\n", - "\n" + "The CRF Annotator will also require Part-of-speech tags so we add those in the same Pipeline." ] }, { "cell_type": "markdown", - "metadata": { - "id": "jePQ1RvIyHIC" - }, + "metadata": {}, "source": [ "#### 1. Call necessary imports and set the resource path to read local data files" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "yljiat0_yHIE" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import os\n", @@ -109,27 +63,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "o9kYUUsYyHIP" - }, + "metadata": {}, "source": [ "#### 2. Download training dataset if not already there" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "IKJQpW57yHIR", - "outputId": "17a986a0-d32d-4a91-b294-3b9f274fa0e6" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "File Not found will downloading it!\n" ] @@ -151,30 +97,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "c7ixJew3yHIc" - }, + "metadata": {}, "source": [ "#### 3. 
Load SparkSession if not already there" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0ENw-DRoyHIe", - "outputId": "1bdf3d19-e8e8-43b3-c826-4fb4074cb5e6" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "Spark NLP version: 4.2.6\n", - "Apache Spark version: 3.2.3\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -189,19 +127,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "FmJej2gGyHIp" - }, + "metadata": {}, "source": [ "#### 4. Create annotator components in the right order, with their training Params. Finisher will output only NER. Put all in pipeline." ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "6uKWI52qyHIs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "nerTagger = NerCrfApproach()\\\n", @@ -219,27 +153,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PVRUX_hpyHIz" - }, + "metadata": {}, "source": [ "#### 6. Load a dataset for prediction. Training is not relevant from this dataset." ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "edIbRJiNyHI1", - "outputId": "37d8cf0a-a119-4a82-c185-fa2f52bcc8fa" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "glove_100d download started this may take some time.\n", "Approximate size to download 145.3 MB\n", @@ -272,31 +198,23 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "bUPzqnGqyHI-" - }, + "metadata": {}, "source": [ "#### 7. Training the model. Training doesn't really do anything from the dataset itself." 
] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "7bCbmaLDyHI_", - "outputId": "5321a525-1efe-4a82-d75b-9854f8f60564" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Start fitting\n", "Fitting has ended\n", - "5.912823915481567\n" + "2.631434202194214\n" ] } ], @@ -311,32 +229,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "W7vA5MiOyHJH" - }, + "metadata": {}, "source": [ "#### 8. Save NerCrfModel into disk after training" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "n_LY10D9yHJJ" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "ner_model.write().overwrite().save(\"./pip_wo_embedd/\")" ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "H6qtW2x5yHJQ" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -359,8 +264,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/dl-ner/mfa_ner_graphs_s3.ipynb b/examples/python/training/english/dl-ner/mfa_ner_graphs_s3.ipynb index 7e2bca34b21d86..d488360ea6922b 100644 --- a/examples/python/training/english/dl-ner/mfa_ner_graphs_s3.ipynb +++ b/examples/python/training/english/dl-ner/mfa_ner_graphs_s3.ipynb @@ -1,36 +1,36 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/mfa_ner_graphs_s3.ipynb)\n", + "\n", + "# Configuring MFA for S3 access" + ] + }, { 
"cell_type": "code", "execution_count": null, - "metadata": { - "id": "4SVtLznZXe6K" - }, + "metadata": {}, "outputs": [], "source": [ - "# Install pyspark\n", - "! pip install --ignore-installed pyspark\n", - "\n", - "# Install Spark NLP\n", - "! pip install --ignore-installed spark-nlp" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "HL7dLz15XTGr", - "outputId": "27f959d3-bb48-483d-cea5-550b89bc883b" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version 3.2.2\n" + "Spark NLP version 4.3.1\n" ] } ], @@ -47,7 +47,41 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "

SparkSession - in-memory

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v3.3.0
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
Spark NLP
\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "spark = sparknlp.start()\n", "spark" @@ -55,9 +89,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "FNFI0FavqLV9" - }, + "metadata": {}, "source": [ "To configure MFA we just need to define the requires values in spark properties as show below. Look an example to get temporal credentials [here](https://github.com/JohnSnowLabs/spark-nlp/blob/master/scripts/aws_tmp_credentials.sh) " ] @@ -65,9 +97,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "A5kl8WaWO1zD" - }, + "metadata": {}, "outputs": [], "source": [ "spark.conf.set(\"spark.jsl.settings.aws.credentials.access_key_id\", \"MY_ACCESS_KEY_ID\")\n", @@ -79,13 +109,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "DXVydy4LXbLY", - "outputId": "328e7393-16de-457f-c3c8-24e06b9ef23a" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -101,7 +125,7 @@ } ], "source": [ - "1from sparknlp.training import CoNLL\n", + "from sparknlp.training import CoNLL\n", "\n", "training_data = CoNLL().readDataset(spark, 'sample_data/test_ner_dataset.txt')\n", "training_data.show(3)" @@ -110,13 +134,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "7H7PAVXj-KIn", - "outputId": "2c861d69-d5cc-4d1d-a74f-2c6bf509129e" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -136,11 +154,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "rquF22gI-OPY" - }, + "metadata": {}, "outputs": [], "source": [ + "# External Graph folder on S3\n", "graphFolder = \"s3://my.bucket.com/my/s3/path\"\n", "\n", "ner_tagger = NerDLApproach() \\\n", @@ -159,13 +176,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/" - }, - "id": "wYxnTYlsKgBX", - "outputId": "91e1ae49-da92-4246-ad77-7fb986afe3b0" - }, + "metadata": {}, "outputs": [ { "data": { @@ -173,7 +184,7 @@ "NerDLModel_18c6a5b33e9a" ] }, - "execution_count": 10, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -203,8 +214,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/dl-ner/ner_albert.ipynb b/examples/python/training/english/dl-ner/ner_albert.ipynb index ec6ab5f09e33ee..300eceae186902 100644 --- a/examples/python/training/english/dl-ner/ner_albert.ipynb +++ b/examples/python/training/english/dl-ner/ner_albert.ipynb @@ -1,102 +1,46 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "uZhJcUl06r8w" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/dl-ner/ner_albert.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_albert.ipynb)\n", "\n", - "## 0. 
Colab Setup" + "# How to train a NER classifier with Albert embeddings based on Char CNNs - BiLSTM - CRF" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "22mElNLo6rUI", - "outputId": "4c094532-a49a-4453-a7b7-a901e634020c" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 11:33:23-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 11:33:23-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 11:33:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 11:33:25 (40.8 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 49 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 45.7 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 53.9 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "A_QE6hqA4WHh" - }, - "source": [ - "# How to train a NER classifier with Albert embeddings based on Char CNNs - BiLSTM - CRF" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wybDus1P4WHk" - }, + "metadata": {}, "source": [ - "## Download the file into the local File System \n", - "### It is a standard conll2003 format training file" + "First, we download the file into the local File System.\n", + "It is a standard conll2003 formatted training file." 
] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EA0QHrLF4WHl", - "outputId": "6ff3cd78-94d3-4d7c-c4ad-2c5a0900cc14" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "File Not found will downloading it!\n" + "File already present.\n" ] } ], @@ -106,24 +50,19 @@ "from pathlib import Path\n", "import urllib.request\n", "\n", - "\n", "download_path = \"./eng.train\"\n", "\n", - "\n", "if not Path(download_path).is_file():\n", - " print(\"File Not found will downloading it!\")\n", + " print(\"Local file not found. Attempting download!\")\n", " url = \"https://github.com/patverga/torch-ner-nlp-from-scratch/raw/master/data/conll2003/eng.train\"\n", " urllib.request.urlretrieve(url, download_path)\n", "else:\n", - " printalbert(\"File already present.\")\n", - " \n" + " print(\"File already present.\")\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "uYZhNUVH4WHs" - }, + "metadata": {}, "source": [ "# Read CoNLL Dataset into Spark dataframe and automagically generate features for futures tasks\n", "The readDataset method of the CoNLL class handily adds all the features required in the next steps" @@ -131,18 +70,12 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "lQExmc684WHu", - "outputId": "6da2bb18-c211-4a21-c103-642c8c128ffc" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", "| text| document| sentence| token| pos| label|\n", @@ -184,9 +117,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "JF9dJWoW4WH6" - }, + "metadata": {}, "source": [ "# Define the NER 
Pipeline \n", "\n", @@ -197,18 +128,12 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Z0xFttkH4WH7", - "outputId": "13ebdfff-4ae4-4334-f684-871bc199d09a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "albert_base_uncased download started this may take some time.\n", "Approximate size to download 42.7 MB\n", @@ -224,12 +149,12 @@ "from sparknlp.common import *\n", "from sparknlp.base import *\n", "\n", - "# Define the pretrained Albert model. \n", + "# Define the pretrained Albert model.\n", "albert = AlbertEmbeddings.pretrained().setInputCols(\"sentence\", \"token\")\\\n", " .setOutputCol(\"albert\")\\\n", "\n", "\n", - "# Define the Char CNN - BiLSTM - CRF model. We will feed it the Albert tokens \n", + "# Define the Char CNN - BiLSTM - CRF model. We will feed it the Albert tokens\n", "nerTagger = NerDLApproach()\\\n", " .setInputCols([\"sentence\", \"token\", \"albert\"])\\\n", " .setLabelColumn(\"label\")\\\n", @@ -248,27 +173,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "YpcIr8b_4WIB" - }, + "metadata": {}, "source": [ "# Fit the Pipeline and get results" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hDKsFDRy4WIC", - "outputId": "4f98bced-6e5b-4b9e-a30c-a3e28c32d46e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", "| text| document| sentence| token| pos| label| albert| ner|\n", @@ -306,27 +223,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "HFSKuv-x4WIH" - }, + "metadata": {}, "source": [ 
"### Checkout only result columns" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ObW2xBPn4WII", - "outputId": "390e7c45-9138-4098-f49c-49a852c55949" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|text |ner |\n", @@ -343,9 +252,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "TEDXS6R9KtXm" - }, + "metadata": {}, "source": [ "## Alternative Albert models \n", "\n", @@ -358,18 +265,12 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_XsIHHpAKp2-", - "outputId": "80b92cd5-590f-4e1e-bce9-2d4c6889c586" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "albert_xxlarge_uncased download started this may take some time.\n", "Approximate size to download 795 MB\n", @@ -439,18 +340,12 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "i9HkYsNFLTHD", - "outputId": "5ff2a22e-a5aa-4d0b-d4cd-8138ab5e2cee" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + 
"output_type": "stream", "text": [ "+------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|text |ner |\n", @@ -465,15 +360,6 @@ "\n", "ner_df.select(*['text', 'ner']).limit(1).show(truncate=False)" ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "TYCBAgL9LjL1" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -495,8 +381,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" }, "name": "NER-Tutorial", "notebookId": 3359671281044291 diff --git a/examples/python/training/english/dl-ner/ner_bert.ipynb b/examples/python/training/english/dl-ner/ner_bert.ipynb index dd5fa9037d2cbd..0f92d814e2bcbc 100644 --- a/examples/python/training/english/dl-ner/ner_bert.ipynb +++ b/examples/python/training/english/dl-ner/ner_bert.ipynb @@ -1,80 +1,32 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "MI3at4LA4TO4" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/dl-ner/ner_bert.ipynb)\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_bert.ipynb)\n", "\n", - "## 0. Colab Setup" + "# Deep Learning NER with Bert Embeddings" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 14853, - "status": "ok", - "timestamp": 1589704571189, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "CQkc9O5V6vJ5", - "outputId": "b3698e71-5966-42e8-82bd-c433dfaa666f" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "openjdk version \"1.8.0_252\"\n", - "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", - "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "import os\n", - "\n", - "# Install java\n", - "! apt-get update -qq\n", - "! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null\n", - "\n", - "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n", - "os.environ[\"PATH\"] = os.environ[\"JAVA_HOME\"] + \"/bin:\" + os.environ[\"PATH\"]\n", - "! java -version\n", - "\n", - "# Install pyspark\n", - "! pip install --ignore-installed pyspark==2.4.4\n", - "\n", - "# Install Spark NLP\n", - "! pip install --ignore-installed spark-nlp" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "OnbkiY634TO7" - }, + "metadata": {}, "source": [ - "## Deep Learning NER\n", - "\n", "In the following example, we walk-through a LSTM NER model training and prediction. 
This annotator is implemented on top of TensorFlow.\n", "\n", "This annotator will take a series of word embedding vectors, training CoNLL dataset, plus a validation dataset. We include our own predefined Tensorflow Graphs, but it will train all layers during fit() stage.\n", @@ -84,60 +36,49 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "P5NoZwVw4TO8" - }, + "metadata": {}, "source": [ "#### 1. Call necessary imports and set the resource folder path." ] }, { "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "HVMuFdHz4TO-" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", - "sys.path.append('../../')\n", "\n", "from pyspark.sql import SparkSession\n", "from pyspark.ml import Pipeline\n", "\n", "from sparknlp.annotator import *\n", "from sparknlp.common import *\n", - "from sparknlp.base import *\n", - "\n", - "import time\n", - "import zipfile\n", - "#Setting location of resource Directory\n", - "resource_path= \"../../../src/test/resources/\"" + "from sparknlp.base import *\n" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Dnkqe7Db4TPG" - }, + "metadata": {}, "source": [ "#### 2. 
Download CoNLL 2003 data if not present" ] }, { "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "DtNyZXDc4TPH" - }, - "outputs": [], + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading eng.testa\n", + "Downloading eng.testb\n" + ] + } + ], "source": [ "# Download CoNLL 2003 Dataset\n", "import os\n", @@ -148,7 +89,7 @@ "file_testa= \"eng.testa\"\n", "file_testb= \"eng.testb\"\n", "# https://github.com/patverga/torch-ner-nlp-from-scratch/tree/master/data/conll2003\n", - "if not Path(file_train).is_file(): \n", + "if not Path(file_train).is_file():\n", " print(\"Downloading \"+file_train)\n", " urllib.request.urlretrieve(url+file_train, file_train)\n", "if not Path(file_testa).is_file():\n", @@ -162,49 +103,27 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "_vmA0JH44TPP" - }, + "metadata": {}, "source": [ "#### 3. 
Create the spark session" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 22643, - "status": "ok", - "timestamp": 1589704579011, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "O3wvVq-14TPQ", - "outputId": "a3282669-5d17-41e7-dd4b-edf0ad9c27b0" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 2.5.0\n", - "Apache Spark version: 2.4.4\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], "source": [ - "import sparknlp \n", + "import sparknlp\n", "\n", "spark = sparknlp.start()\n", "\n", @@ -214,37 +133,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "fxv7jokO4TPY" - }, + "metadata": {}, "source": [ "#### 4. 
Load dataset and cache into memory" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 459 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 39584, - "status": "ok", - "timestamp": 1589704595967, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "xeuwKgWB4TPZ", - "outputId": "816df8b8-f98c-4d1d-d4f6-33946e405bd0" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -253,26 +150,26 @@ "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", "| text| document| sentence| token| pos| label|\n", "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", - "|EU rejects German...|[[document, 0, 47...|[[document, 0, 47...|[[token, 0, 1, EU...|[[pos, 0, 1, NNP,...|[[named_entity, 0...|\n", - "| Peter Blackburn|[[document, 0, 14...|[[document, 0, 14...|[[token, 0, 4, Pe...|[[pos, 0, 4, NNP,...|[[named_entity, 0...|\n", - "| BRUSSELS 1996-08-22|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 7, BR...|[[pos, 0, 7, NNP,...|[[named_entity, 0...|\n", - "|The European Comm...|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 2, Th...|[[pos, 0, 2, DT, ...|[[named_entity, 0...|\n", - "|Germany 's repres...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 6, Ge...|[[pos, 0, 6, NNP,...|[[named_entity, 0...|\n", - "|\" We do n't suppo...|[[document, 0, 16...|[[document, 0, 16...|[[token, 0, 0, \",...|[[pos, 0, 0, \", [...|[[named_entity, 0...|\n", - "|He said further s...|[[document, 0, 13...|[[document, 0, 13...|[[token, 0, 1, He...|[[pos, 0, 1, PRP,...|[[named_entity, 0...|\n", - "|He said a proposa...|[[document, 0, 22...|[[document, 0, 22...|[[token, 0, 1, He...|[[pos, 0, 1, PRP,...|[[named_entity, 
0...|\n", - "|Fischler proposed...|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 7, Fi...|[[pos, 0, 7, JJR,...|[[named_entity, 0...|\n", - "|But Fischler agre...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 2, Bu...|[[pos, 0, 2, CC, ...|[[named_entity, 0...|\n", - "|Spanish Farm Mini...|[[document, 0, 16...|[[document, 0, 16...|[[token, 0, 6, Sp...|[[pos, 0, 6, NNP,...|[[named_entity, 0...|\n", - "| .|[[document, 0, 0,...|[[document, 0, 0,...|[[token, 0, 0, .,...|[[pos, 0, 0, ., [...|[[named_entity, 0...|\n", - "|Only France and B...|[[document, 0, 52...|[[document, 0, 52...|[[token, 0, 3, On...|[[pos, 0, 3, RB, ...|[[named_entity, 0...|\n", - "|The EU 's scienti...|[[document, 0, 17...|[[document, 0, 17...|[[token, 0, 2, Th...|[[pos, 0, 2, DT, ...|[[named_entity, 0...|\n", - "|Sheep have long b...|[[document, 0, 17...|[[document, 0, 17...|[[token, 0, 4, Sh...|[[pos, 0, 4, NNP,...|[[named_entity, 0...|\n", - "|British farmers d...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 6, Br...|[[pos, 0, 6, JJ, ...|[[named_entity, 0...|\n", - "|\" What we have to...|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 0, \",...|[[pos, 0, 0, \", [...|[[named_entity, 0...|\n", - "|Bonn has led effo...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 3, Bo...|[[pos, 0, 3, NNP,...|[[named_entity, 0...|\n", - "|Germany imported ...|[[document, 0, 84...|[[document, 0, 84...|[[token, 0, 6, Ge...|[[pos, 0, 6, NNP,...|[[named_entity, 0...|\n", - "|It brought in 4,2...|[[document, 0, 82...|[[document, 0, 82...|[[token, 0, 1, It...|[[pos, 0, 1, PRP,...|[[named_entity, 0...|\n", + "|EU rejects German...|[{document, 0, 47...|[{document, 0, 47...|[{token, 0, 1, EU...|[{pos, 0, 1, NNP,...|[{named_entity, 0...|\n", + "| Peter Blackburn|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 4, Pe...|[{pos, 0, 4, NNP,...|[{named_entity, 0...|\n", + "| BRUSSELS 1996-08-22|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 7, BR...|[{pos, 0, 7, 
NNP,...|[{named_entity, 0...|\n", + "|The European Comm...|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|\n", + "|Germany 's repres...|[{document, 0, 21...|[{document, 0, 21...|[{token, 0, 6, Ge...|[{pos, 0, 6, NNP,...|[{named_entity, 0...|\n", + "|\" We do n't suppo...|[{document, 0, 16...|[{document, 0, 16...|[{token, 0, 0, \",...|[{pos, 0, 0, \", {...|[{named_entity, 0...|\n", + "|He said further s...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 1, He...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|\n", + "|He said a proposa...|[{document, 0, 22...|[{document, 0, 22...|[{token, 0, 1, He...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|\n", + "|Fischler proposed...|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 7, Fi...|[{pos, 0, 7, JJR,...|[{named_entity, 0...|\n", + "|But Fischler agre...|[{document, 0, 21...|[{document, 0, 21...|[{token, 0, 2, Bu...|[{pos, 0, 2, CC, ...|[{named_entity, 0...|\n", + "|Spanish Farm Mini...|[{document, 0, 16...|[{document, 0, 16...|[{token, 0, 6, Sp...|[{pos, 0, 6, NNP,...|[{named_entity, 0...|\n", + "| .|[{document, 0, 0,...|[{document, 0, 0,...|[{token, 0, 0, .,...|[{pos, 0, 0, ., {...|[{named_entity, 0...|\n", + "|Only France and B...|[{document, 0, 52...|[{document, 0, 52...|[{token, 0, 3, On...|[{pos, 0, 3, RB, ...|[{named_entity, 0...|\n", + "|The EU 's scienti...|[{document, 0, 17...|[{document, 0, 17...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|\n", + "|Sheep have long b...|[{document, 0, 17...|[{document, 0, 17...|[{token, 0, 4, Sh...|[{pos, 0, 4, NNP,...|[{named_entity, 0...|\n", + "|British farmers d...|[{document, 0, 21...|[{document, 0, 21...|[{token, 0, 6, Br...|[{pos, 0, 6, JJ, ...|[{named_entity, 0...|\n", + "|\" What we have to...|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 0, \",...|[{pos, 0, 0, \", {...|[{named_entity, 0...|\n", + "|Bonn has led effo...|[{document, 0, 21...|[{document, 0, 21...|[{token, 0, 3, 
Bo...|[{pos, 0, 3, NNP,...|[{named_entity, 0...|\n", + "|Germany imported ...|[{document, 0, 84...|[{document, 0, 84...|[{token, 0, 6, Ge...|[{pos, 0, 6, NNP,...|[{named_entity, 0...|\n", + "|It brought in 4,2...|[{document, 0, 82...|[{document, 0, 82...|[{token, 0, 1, It...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|\n", "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", "only showing top 20 rows\n", "\n" @@ -287,44 +184,22 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "4BO6oz8i4TPh" - }, + "metadata": {}, "source": [ "#### 5. Create annotator components with appropriate params and in the right order. The finisher will output only NER. Put everything in Pipeline" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 56765, - "status": "ok", - "timestamp": 1589704613167, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "nxArxJq_4TPj", - "outputId": "2e716ef5-c8c9-48cb-9e02-47b959fe7a60" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "bert_base_cased download started this may take some time.\n", - "Approximate size to download 389.2 MB\n", + "small_bert_L2_768 download started this may take some time.\n", + "Approximate size to download 139.6 MB\n", "[OK!]\n" ] } @@ -338,27 +213,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 459 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 57266, - "status": "ok", - "timestamp": 1589704613688, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - 
"user_tz": -120 - }, - "id": "NMEx77d3bVpp", - "outputId": "170fb930-7b88-4c51-fafd-1d892ab02508" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -367,26 +223,26 @@ "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", "| text| document| sentence| token| pos| label|\n", "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", - "|EU rejects German...|[[document, 0, 47...|[[document, 0, 47...|[[token, 0, 1, EU...|[[pos, 0, 1, NNP,...|[[named_entity, 0...|\n", - "| Peter Blackburn|[[document, 0, 14...|[[document, 0, 14...|[[token, 0, 4, Pe...|[[pos, 0, 4, NNP,...|[[named_entity, 0...|\n", - "| BRUSSELS 1996-08-22|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 7, BR...|[[pos, 0, 7, NNP,...|[[named_entity, 0...|\n", - "|The European Comm...|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 2, Th...|[[pos, 0, 2, DT, ...|[[named_entity, 0...|\n", - "|Germany 's repres...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 6, Ge...|[[pos, 0, 6, NNP,...|[[named_entity, 0...|\n", - "|\" We do n't suppo...|[[document, 0, 16...|[[document, 0, 16...|[[token, 0, 0, \",...|[[pos, 0, 0, \", [...|[[named_entity, 0...|\n", - "|He said further s...|[[document, 0, 13...|[[document, 0, 13...|[[token, 0, 1, He...|[[pos, 0, 1, PRP,...|[[named_entity, 0...|\n", - "|He said a proposa...|[[document, 0, 22...|[[document, 0, 22...|[[token, 0, 1, He...|[[pos, 0, 1, PRP,...|[[named_entity, 0...|\n", - "|Fischler proposed...|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 7, Fi...|[[pos, 0, 7, JJR,...|[[named_entity, 0...|\n", - "|But Fischler agre...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 2, Bu...|[[pos, 0, 2, CC, ...|[[named_entity, 0...|\n", - "|Spanish Farm Mini...|[[document, 0, 16...|[[document, 0, 16...|[[token, 0, 6, Sp...|[[pos, 0, 6, NNP,...|[[named_entity, 
0...|\n", - "| .|[[document, 0, 0,...|[[document, 0, 0,...|[[token, 0, 0, .,...|[[pos, 0, 0, ., [...|[[named_entity, 0...|\n", - "|Only France and B...|[[document, 0, 52...|[[document, 0, 52...|[[token, 0, 3, On...|[[pos, 0, 3, RB, ...|[[named_entity, 0...|\n", - "|The EU 's scienti...|[[document, 0, 17...|[[document, 0, 17...|[[token, 0, 2, Th...|[[pos, 0, 2, DT, ...|[[named_entity, 0...|\n", - "|Sheep have long b...|[[document, 0, 17...|[[document, 0, 17...|[[token, 0, 4, Sh...|[[pos, 0, 4, NNP,...|[[named_entity, 0...|\n", - "|British farmers d...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 6, Br...|[[pos, 0, 6, JJ, ...|[[named_entity, 0...|\n", - "|\" What we have to...|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 0, \",...|[[pos, 0, 0, \", [...|[[named_entity, 0...|\n", - "|Bonn has led effo...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 3, Bo...|[[pos, 0, 3, NNP,...|[[named_entity, 0...|\n", - "|Germany imported ...|[[document, 0, 84...|[[document, 0, 84...|[[token, 0, 6, Ge...|[[pos, 0, 6, NNP,...|[[named_entity, 0...|\n", - "|It brought in 4,2...|[[document, 0, 82...|[[document, 0, 82...|[[token, 0, 1, It...|[[pos, 0, 1, PRP,...|[[named_entity, 0...|\n", + "|EU rejects German...|[{document, 0, 47...|[{document, 0, 47...|[{token, 0, 1, EU...|[{pos, 0, 1, NNP,...|[{named_entity, 0...|\n", + "| Peter Blackburn|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 4, Pe...|[{pos, 0, 4, NNP,...|[{named_entity, 0...|\n", + "| BRUSSELS 1996-08-22|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 7, BR...|[{pos, 0, 7, NNP,...|[{named_entity, 0...|\n", + "|The European Comm...|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|\n", + "|Germany 's repres...|[{document, 0, 21...|[{document, 0, 21...|[{token, 0, 6, Ge...|[{pos, 0, 6, NNP,...|[{named_entity, 0...|\n", + "|\" We do n't suppo...|[{document, 0, 16...|[{document, 0, 16...|[{token, 0, 0, \",...|[{pos, 0, 0, \", 
{...|[{named_entity, 0...|\n", + "|He said further s...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 1, He...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|\n", + "|He said a proposa...|[{document, 0, 22...|[{document, 0, 22...|[{token, 0, 1, He...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|\n", + "|Fischler proposed...|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 7, Fi...|[{pos, 0, 7, JJR,...|[{named_entity, 0...|\n", + "|But Fischler agre...|[{document, 0, 21...|[{document, 0, 21...|[{token, 0, 2, Bu...|[{pos, 0, 2, CC, ...|[{named_entity, 0...|\n", + "|Spanish Farm Mini...|[{document, 0, 16...|[{document, 0, 16...|[{token, 0, 6, Sp...|[{pos, 0, 6, NNP,...|[{named_entity, 0...|\n", + "| .|[{document, 0, 0,...|[{document, 0, 0,...|[{token, 0, 0, .,...|[{pos, 0, 0, ., {...|[{named_entity, 0...|\n", + "|Only France and B...|[{document, 0, 52...|[{document, 0, 52...|[{token, 0, 3, On...|[{pos, 0, 3, RB, ...|[{named_entity, 0...|\n", + "|The EU 's scienti...|[{document, 0, 17...|[{document, 0, 17...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|\n", + "|Sheep have long b...|[{document, 0, 17...|[{document, 0, 17...|[{token, 0, 4, Sh...|[{pos, 0, 4, NNP,...|[{named_entity, 0...|\n", + "|British farmers d...|[{document, 0, 21...|[{document, 0, 21...|[{token, 0, 6, Br...|[{pos, 0, 6, JJ, ...|[{named_entity, 0...|\n", + "|\" What we have to...|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 0, \",...|[{pos, 0, 0, \", {...|[{named_entity, 0...|\n", + "|Bonn has led effo...|[{document, 0, 21...|[{document, 0, 21...|[{token, 0, 3, Bo...|[{pos, 0, 3, NNP,...|[{named_entity, 0...|\n", + "|Germany imported ...|[{document, 0, 84...|[{document, 0, 84...|[{token, 0, 6, Ge...|[{pos, 0, 6, NNP,...|[{named_entity, 0...|\n", + "|It brought in 4,2...|[{document, 0, 82...|[{document, 0, 82...|[{token, 0, 1, It...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|\n", 
"+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", "only showing top 20 rows\n", "\n" @@ -399,27 +255,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 510 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 64843, - "status": "ok", - "timestamp": 1589704621280, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "cP9nXTCl4TPq", - "outputId": "80f3bdff-b46e-445c-adca-2602ce68c27c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -429,31 +266,31 @@ "+--------------------+--------------------+\n", "| token| bert|\n", "+--------------------+--------------------+\n", - "|[[token, 0, 1, EU...|[[word_embeddings...|\n", - "|[[token, 0, 4, Pe...|[[word_embeddings...|\n", - "|[[token, 0, 7, BR...|[[word_embeddings...|\n", - "|[[token, 0, 2, Th...|[[word_embeddings...|\n", - "|[[token, 0, 6, Ge...|[[word_embeddings...|\n", - "|[[token, 0, 0, \",...|[[word_embeddings...|\n", - "|[[token, 0, 1, He...|[[word_embeddings...|\n", - "|[[token, 0, 1, He...|[[word_embeddings...|\n", - "|[[token, 0, 7, Fi...|[[word_embeddings...|\n", - "|[[token, 0, 2, Bu...|[[word_embeddings...|\n", - "|[[token, 0, 6, Sp...|[[word_embeddings...|\n", - "|[[token, 0, 0, .,...|[[word_embeddings...|\n", - "|[[token, 0, 3, On...|[[word_embeddings...|\n", - "|[[token, 0, 2, Th...|[[word_embeddings...|\n", - "|[[token, 0, 4, Sh...|[[word_embeddings...|\n", - "|[[token, 0, 6, Br...|[[word_embeddings...|\n", - "|[[token, 0, 0, \",...|[[word_embeddings...|\n", - "|[[token, 0, 3, Bo...|[[word_embeddings...|\n", - "|[[token, 0, 6, Ge...|[[word_embeddings...|\n", - "|[[token, 0, 1, It...|[[word_embeddings...|\n", + "|[{token, 0, 1, EU...|[{word_embeddings...|\n", + "|[{token, 0, 4, 
Pe...|[{word_embeddings...|\n", + "|[{token, 0, 7, BR...|[{word_embeddings...|\n", + "|[{token, 0, 2, Th...|[{word_embeddings...|\n", + "|[{token, 0, 6, Ge...|[{word_embeddings...|\n", + "|[{token, 0, 0, \",...|[{word_embeddings...|\n", + "|[{token, 0, 1, He...|[{word_embeddings...|\n", + "|[{token, 0, 1, He...|[{word_embeddings...|\n", + "|[{token, 0, 7, Fi...|[{word_embeddings...|\n", + "|[{token, 0, 2, Bu...|[{word_embeddings...|\n", + "|[{token, 0, 6, Sp...|[{word_embeddings...|\n", + "|[{token, 0, 0, .,...|[{word_embeddings...|\n", + "|[{token, 0, 3, On...|[{word_embeddings...|\n", + "|[{token, 0, 2, Th...|[{word_embeddings...|\n", + "|[{token, 0, 4, Sh...|[{word_embeddings...|\n", + "|[{token, 0, 6, Br...|[{word_embeddings...|\n", + "|[{token, 0, 0, \",...|[{word_embeddings...|\n", + "|[{token, 0, 3, Bo...|[{word_embeddings...|\n", + "|[{token, 0, 6, Ge...|[{word_embeddings...|\n", + "|[{token, 0, 1, It...|[{word_embeddings...|\n", "+--------------------+--------------------+\n", "only showing top 20 rows\n", "\n", - "CPU times: user 11.6 ms, sys: 1.54 ms, total: 13.1 ms\n", - "Wall time: 7.53 s\n" + "CPU times: user 16.6 ms, sys: 0 ns, total: 16.6 ms\n", + "Wall time: 1min 1s\n" ] } ], @@ -465,12 +302,12 @@ "# WARNING: Setting benchmark to true is slow and might crash your system and is not recommended on standardCollab notebooks-- High end hardware and/or GPU required\n", "## dataframe.cache() does not solve this. 
Results must be serialized to disk for maximum efficiency\n", "### You might need to restart your driver after this step finishes\n", - "benchmark = False \n", + "benchmark = False\n", "\n", "\n", "with_bert_path = \"./with_bert.parquet\"\n", "if benchmark == True :\n", - " if not Path(with_bert_path).is_dir(): \n", + " if not Path(with_bert_path).is_dir():\n", " bert.transform(training_data).write.parquet(\"./with_bert.parquet\")\n", " training_with_bert = spark.read.parquet(\"./with_bert.parquet\").cache()\n", "else : training_with_bert = bert.transform(training_data)\n", @@ -482,12 +319,8 @@ }, { "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "ewZNMRkX4TPz" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "nerTagger = NerDLApproach()\\\n", @@ -511,37 +344,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Jmrxa0zb4TP5" - }, + "metadata": {}, "source": [ "#### 6. Train the pipeline. 
(This will take some time)" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 102 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 72071, - "status": "ok", - "timestamp": 1589704628541, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "M1EsnzJD4TP6", - "outputId": "9f39ed1e-f436-4f3a-abf3-96449e68ebb3" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -549,9 +360,9 @@ "text": [ "Start fitting\n", "Fitting is ended\n", - "7.180534839630127\n", - "CPU times: user 21.5 ms, sys: 6.81 ms, total: 28.3 ms\n", - "Wall time: 7.18 s\n" + "4.826304197311401\n", + "CPU times: user 13.5 ms, sys: 0 ns, total: 13.5 ms\n", + "Wall time: 4.83 s\n" ] } ], @@ -561,29 +372,22 @@ "start = time.time()\n", "print(\"Start fitting\")\n", "#We have to limit the rows in Collab, otherwise we will encounter exceptions because of RAM limitations\n", - "model = pipeline.fit(training_with_bert.limit(25)) \n", + "model = pipeline.fit(training_with_bert.limit(25))\n", "print(\"Fitting is ended\")\n", "print (time.time() - start)" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "N13yqmUu4TQA" - }, + "metadata": {}, "source": [ "#### 7. 
Lets predict with the model" ] }, { "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "sc9NJ1EV4TQB" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "document = DocumentAssembler()\\\n", @@ -611,27 +415,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 73124, - "status": "ok", - "timestamp": 1589704629618, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "07EWw0mG4TQR", - "outputId": "d87f8139-8958-4a9e-ce95-e5dc44a629d8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -653,12 +438,8 @@ }, { "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "50yCGM6F4TQZ" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "prediction_model = prediction_pipeline.fit(prediction_data)" @@ -666,27 +447,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 357 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 74477, - "status": "ok", - "timestamp": 1589704630997, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "iHk2VbE_4TQf", - "outputId": "f13adccb-1770-4ca6-c789-fd175133b274" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -710,8 +472,8 @@ "('in', 'O')\n", "('Armonk', 'O')\n", "('.', 'O')\n", - "CPU times: user 56.3 ms, sys: 7.62 ms, total: 63.9 ms\n", - "Wall time: 1.19 s\n" + "CPU times: user 28.1 ms, sys: 3.93 ms, total: 32.1 ms\n", + "Wall time: 678 ms\n" ] } ], @@ -726,27 +488,8 @@ }, { "cell_type": "code", 
- "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 153 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 75321, - "status": "ok", - "timestamp": 1589704631851, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "XwNEGQts4TQl", - "outputId": "69632669-e907-4555-cda5-c046565bb61c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -755,11 +498,11 @@ "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+\n", "| text| document| sentence| token| bert| ner|ner_span|\n", "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+\n", - "|Germany is a nice...|[[document, 0, 22...|[[document, 0, 22...|[[token, 0, 6, Ge...|[[word_embeddings...|[[named_entity, 0...| []|\n", + "|Germany is a nice...|[{document, 0, 22...|[{document, 0, 22...|[{token, 0, 6, Ge...|[{word_embeddings...|[{named_entity, 0...| []|\n", "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+\n", "\n", - "CPU times: user 27.2 ms, sys: 6.09 ms, total: 33.3 ms\n", - "Wall time: 883 ms\n" + "CPU times: user 11.2 ms, sys: 1.61 ms, total: 12.8 ms\n", + "Wall time: 819 ms\n" ] } ], @@ -773,22 +516,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "hxYs7v3F4TQq" - }, + "metadata": {}, "source": [ "#### 8. 
Save both pipeline and single model once trained, on disk" ] }, { "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "nDk4xWbT4TQr" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "prediction_model.write().overwrite().save(\"./ner_dl_model\")" @@ -796,22 +532,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "xzvHfHBr4TQx" - }, + "metadata": {}, "source": [ "#### 9. Load both again, deserialize from disk" ] }, { "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "ARYxI8594TQz" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import PipelineModel, Pipeline\n", @@ -821,27 +550,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 153 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 254641, - "status": "ok", - "timestamp": 1589704811204, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "xfVgn3ZI4TQ4", - "outputId": "0848f1aa-10e6-4caa-c3e7-d7527d23da94" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -853,8 +563,8 @@ "('good', 'O')\n", "('person', 'O')\n", "('.', 'O')\n", - "CPU times: user 55.9 ms, sys: 12.4 ms, total: 68.3 ms\n", - "Wall time: 723 ms\n" + "CPU times: user 31.2 ms, sys: 1.59 ms, total: 32.8 ms\n", + "Wall time: 479 ms\n" ] } ], @@ -868,38 +578,19 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 254630, - "status": "ok", - "timestamp": 1589704811206, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - 
}, - "user_tz": -120 - }, - "id": "UpDPutD_4TQ-", - "outputId": "f4c423cf-8534-4506-c5ea-9ac1df2f46d7" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DocumentAssembler_7a6bc03a0a25\n", - "SentenceDetector_8130627c0d5f\n", - "REGEX_TOKENIZER_cf7c9407b892\n", - "BERT_EMBEDDINGS_abf30dcdf344\n", - "PipelineModel_e7f7bc4a5dcc\n", - "[NerDLModel_ba63241e33e5, NerConverter_422eed39d1e4]\n" + "DocumentAssembler_40d21f31b5d3\n", + "SentenceDetector_e1b0e714c446\n", + "REGEX_TOKENIZER_a0a1816c8b3c\n", + "BERT_EMBEDDINGS_e3d4eaf62b32\n", + "PipelineModel_7b435b373a60\n", + "[NerDLModel_264694148c20, NerConverter_0d470ddc9080]\n" ] } ], @@ -908,17 +599,6 @@ " print(stage)\n", "print(loaded_prediction_model.stages[-1].stages)" ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "aH191rNe4TRC" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -942,8 +622,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/dl-ner/ner_dl.ipynb b/examples/python/training/english/dl-ner/ner_dl.ipynb index 035def4dd6c50c..4bf4ac72d8f96e 100644 --- a/examples/python/training/english/dl-ner/ner_dl.ipynb +++ b/examples/python/training/english/dl-ner/ner_dl.ipynb @@ -1,75 +1,32 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "d86L_FUK4U0O" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/dl-ner/ner_dl.ipynb)\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_dl.ipynb)\n", "\n", - "## 0. Colab Setup" + "# NER with Deep Learning" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "dz7v8B5i6uu5", - "outputId": "9bfe3e4a-0e8a-458f-e2ec-c8b70386ace5" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 11:33:31-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 11:33:31-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 11:33:32-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 11:33:32 (48.9 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 45 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 53.0 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 47.3 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "-oTcQcU74U0Q" - }, + "metadata": {}, "source": [ - "## Deep Learning NER\n", - "\n", "In the following example, we walk-through a LSTM NER model training and prediction. This annotator is implemented on top of TensorFlow.\n", "\n", "This annotator will take a series of word embedding vectors, training CoNLL dataset, plus a validation dataset. We include our own predefined Tensorflow Graphs, but it will train all layers during fit() stage.\n", @@ -79,19 +36,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "FKf4cQ0s4U0R" - }, + "metadata": {}, "source": [ "#### 1. Call necessary imports and set the resource folder path." ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "Wejw_DrU4U0S" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import os\n", @@ -110,32 +63,16 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "JcH7A7yG4U0X" - }, + "metadata": {}, "source": [ "#### 2. 
Download CoNLL 2003 data if not present" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "HwJXvsuR4U0Y", - "outputId": "cc558594-73f9-46f3-831d-74ac900ffbe0" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Downloading eng.train\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Download CoNLL 2003 Dataset\n", "import os\n", @@ -146,37 +83,29 @@ "file_testa= \"eng.testa\"\n", "file_testb= \"eng.testb\"\n", "# https://github.com/patverga/torch-ner-nlp-from-scratch/tree/master/data/conll2003\n", - "if not Path(file_train).is_file(): \n", + "if not Path(file_train).is_file():\n", " print(\"Downloading \"+file_train)\n", " urllib.request.urlretrieve(url+file_train, file_train)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "5Voa04Sj4U0d" - }, + "metadata": {}, "source": [ "#### 4. Create the spark session" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kdIbj0Mo4U0e", - "outputId": "33ae3dc9-9c82-4f6d-8f16-69ed04fba897" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "Spark NLP version: 4.2.6\n", - "Apache Spark version: 3.2.3\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -191,27 +120,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "YhRg5VSh4U0j" - }, + "metadata": {}, "source": [ "#### 6. 
Load parquet dataset and cache into memory" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zaxPfBBJ4U0k", - "outputId": "ab2560ab-3ea5-477a-af1b-8366594ddc2d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "glove_100d download started this may take some time.\n", "Approximate size to download 145.3 MB\n", @@ -252,19 +173,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "qYnYyImW4U0p" - }, + "metadata": {}, "source": [ "#### 5. Create annotator components with appropriate params and in the right order. The finisher will output only NER. Put everything in Pipeline" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "3638abOy4U0p" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "nerTagger = NerDLApproach()\\\n", @@ -279,31 +196,23 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "IHrjgNUq4U0t" - }, + "metadata": {}, "source": [ "#### 7. Train the NerDLModel. (This will take some time)" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8gwIpiU74U0u", - "outputId": "0b16ebc6-59ac-495f-e49e-b34ae9b26df7" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Start fitting\n", "Fitting is ended\n", - "349.63567996025085\n" + "75.75428247451782\n" ] } ], @@ -317,27 +226,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "S86y-YiZ4U0z" - }, + "metadata": {}, "source": [ "#### 8. 
Lets predict with the model" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ywx7fsIj4U0z", - "outputId": "5b4c2b58-f080-4162-d92c-63888f54622a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "glove_100d download started this may take some time.\n", "Approximate size to download 145.3 MB\n", @@ -374,18 +275,12 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OZgAI4wF4U04", - "outputId": "85ca3cec-f3ce-4196-e2b4-d9ae79c3e6ef" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+\n", "| text|\n", @@ -403,18 +298,12 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Tw_r0Ris4U08", - "outputId": "6ce29e7b-a108-41de-9adc-8bd4efb18064" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", "| text| document| sentence| token| embeddings| ner|\n", @@ -432,17 +321,10 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GZRKCjrt4U0_", - "outputId": "2d7f568b-67f5-4eb1-d27a-e5103b92c5e8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[('International', 'I-ORG'),\n", @@ -465,8 +347,9 @@ " ('.', 'O')]" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 11 + "output_type": "execute_result" } ], "source": [ @@ 
-498,8 +381,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/dl-ner/ner_elmo.ipynb b/examples/python/training/english/dl-ner/ner_elmo.ipynb index 4de4882e2eff5d..813f44290ba6eb 100644 --- a/examples/python/training/english/dl-ner/ner_elmo.ipynb +++ b/examples/python/training/english/dl-ner/ner_elmo.ipynb @@ -1,102 +1,46 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "uZhJcUl06r8w" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/dl-ner/ner_elmo.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_elmo.ipynb)\n", "\n", - "## 0. Colab Setup" + "# How to train a NER classifier with ELMO embeddings based on Char CNNs - BiLSTM - CRF" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "22mElNLo6rUI", - "outputId": "59a5d505-8442-4a21-8576-b0020f515a1f" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 11:34:24-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 
302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 11:34:24-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 11:34:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 0%[ ] 0 --.-KB/s Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 11:34:24 (64.7 MB/s) - written to stdout [1191/1191]\n", - "\n", - "\u001B[K |████████████████████████████████| 281.5 MB 48 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 64.9 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 58.0 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "A_QE6hqA4WHh" - }, + "metadata": {}, "source": [ - "# How to train a NER classifier with ELMO embeddings based on Char CNNs - BiLSTM - CRF" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wybDus1P4WHk" - }, - "source": [ - "## Download the file into the local File System \n", - "### It is a standard conll2003 format training file" + "First, we download the file into the local File System.\n", + "It is a standard conll2003 formatted training file." ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EA0QHrLF4WHl", - "outputId": "46b41d60-6f3d-4078-fbe0-ede4e85e4819" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "File Not found will downloading it!\n" + "File already present.\n" ] } ], @@ -106,24 +50,19 @@ "from pathlib import Path\n", "import urllib.request\n", "\n", - "\n", "download_path = \"./eng.train\"\n", "\n", - "\n", "if not Path(download_path).is_file():\n", - " print(\"File Not found will downloading it!\")\n", + " print(\"Local file not found. 
Attempting download!\")\n", " url = \"https://github.com/patverga/torch-ner-nlp-from-scratch/raw/master/data/conll2003/eng.train\"\n", " urllib.request.urlretrieve(url, download_path)\n", "else:\n", - " print(\"File already present.\")\n", - " \n" + " print(\"File already present.\")" ] }, { "cell_type": "markdown", - "metadata": { - "id": "uYZhNUVH4WHs" - }, + "metadata": {}, "source": [ "# Read CoNLL Dataset into Spark dataframe and automagically generate features for futures tasks\n", "The readDataset method of the CoNLL class handily adds all the features required in the next steps" @@ -131,18 +70,12 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "lQExmc684WHu", - "outputId": "6d8fa534-09dd-4480-a4da-8e10f73b57c3" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", "| text| document| sentence| token| pos| label|\n", @@ -184,9 +117,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "JF9dJWoW4WH6" - }, + "metadata": {}, "source": [ "# Define the NER Pipeline \n", "\n", @@ -197,18 +128,12 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Z0xFttkH4WH7", - "outputId": "a58b0808-a544-4ec2-d51d-678166b1e551" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "elmo download started this may take some time.\n", "Approximate size to download 334.1 MB\n", @@ -224,14 +149,14 @@ "from sparknlp.common import *\n", "from sparknlp.base import *\n", "\n", - "# Define the pretrained Elmo model. 
\n", + "# Define the pretrained Elmo model.\n", "# We need to set lstm_outputs2 pooling layer, because the elmo layer is not yet compatible with NerDL\n", "elmo = ElmoEmbeddings.pretrained().setPoolingLayer(\"lstm_outputs2\") \\\n", " .setInputCols(\"sentence\", \"token\")\\\n", " .setOutputCol(\"elmo\")\\\n", "\n", "\n", - "# Defien the Char CNN - BiLSTM - CRF model. We will feed it the Elmo tokens \n", + "# Define the Char CNN - BiLSTM - CRF model. We will feed it the Elmo tokens\n", "nerTagger = NerDLApproach()\\\n", " .setInputCols([\"sentence\", \"token\", \"elmo\"])\\\n", " .setLabelColumn(\"label\")\\\n", @@ -250,27 +175,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "YpcIr8b_4WIB" - }, + "metadata": {}, "source": [ "# Fit the Pipeline and get results" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hDKsFDRy4WIC", - "outputId": "ff273bcc-d9b4-45c6-82f1-cc6fa4216596" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", "| text| document| sentence| token| pos| label| elmo| ner|\n", @@ -299,27 +216,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "HFSKuv-x4WIH" - }, + "metadata": {}, "source": [ "### Checkout only result columns" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ObW2xBPn4WII", - "outputId": "0b09fd17-e91c-4552-c7e1-416107918984" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [
"+------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|text |ner |\n", @@ -333,15 +242,6 @@ "source": [ "ner_df.select(*['text', 'ner']).limit(1).show(truncate=False)" ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "CAGIS-vS4WIO" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -363,8 +263,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" }, "name": "NER-Tutorial", "notebookId": 3359671281044291 diff --git a/examples/python/training/english/dl-ner/ner_graph_builder.ipynb b/examples/python/training/english/dl-ner/ner_graph_builder.ipynb index 93b8bf3a499701..6fa748615a0a1d 100644 --- a/examples/python/training/english/dl-ner/ner_graph_builder.ipynb +++ b/examples/python/training/english/dl-ner/ner_graph_builder.ipynb @@ -1,18 +1,31 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "### Annotator to build a Graph for NER" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_graph_builder.ipynb)\n", + "\n", + "# Building Graphs for NER" ] }, { "cell_type": "code", "execution_count": 
null, - "metadata": { - "id": "ZDHwES6rTGHd" - }, + "metadata": {}, + "outputs": [], + "source": [ + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import os\n", @@ -52,30 +65,8 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Tfv686d8XXF4", - "outputId": "1d6fff57-c0c3-4d39-eddf-3be620ad7a04" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting tensorflow-addons\n", - " Downloading tensorflow_addons-0.17.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n", - "\u001B[K |████████████████████████████████| 1.1 MB 4.3 MB/s \n", - "\u001B[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from tensorflow-addons) (21.3)\n", - "Requirement already satisfied: typeguard>=2.7 in /usr/local/lib/python3.7/dist-packages (from tensorflow-addons) (2.7.1)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->tensorflow-addons) (3.0.9)\n", - "Installing collected packages: tensorflow-addons\n", - "Successfully installed tensorflow-addons-0.17.1\n" - ] - } - ], + "metadata": {}, + "outputs": [], "source": [ "pip install tensorflow-addons" ] @@ -100,20 +91,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hkLNhaUOXthX", - "outputId": "caa03c8c-443f-470c-e4a4-e61aedf8ab88", - "scrolled": true - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Apache Spark version: 3.2.0\n" + "Apache Spark version: 3.3.0\n" ] } ], 
@@ -125,7 +109,7 @@ " .config(\"spark.serializer\", \"org.apache.spark.serializer.KryoSerializer\") \\\n", " .config(\"spark.kryoserializer.buffer.max\", \"2000M\") \\\n", " .config(\"spark.driver.maxResultSize\", \"0\") \\\n", - " .config(\"spark.jars.packages\", \"com.johnsnowlabs.nlp:spark-nlp_2.12:4.1.0\") \\\n", + " .config(\"spark.jars.packages\", \"com.johnsnowlabs.nlp:spark-nlp_2.12:4.3.1\") \\\n", " .config(\"spark.jsl.settings.aws.credentials.access_key_id\", \"MY_ACCESS_KEY_ID\") \\\n", " .config(\"spark.jsl.settings.aws.credentials.secret_access_key\", \"MY_SECRET_ACCESS_KEY\") \\\n", " .config(\"spark:spark.jsl.settings.aws.credentials.session_token\", \"MY_SESSION_TOKEN\") \\\n", @@ -144,9 +128,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "iernkDqnS-pE" - }, + "metadata": {}, "source": [ "We use a variable to define the location that we will set to generate the graph. This example uses S3, but we can define a local, HDFS or DBFS path." ] @@ -154,19 +136,16 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "gqlL6Q5QS7ov" - }, + "metadata": {}, "outputs": [], "source": [ - "graph_folder = \"s3://my_bucket/my_path/ner_graphs\"" + "# graph_folder = \"s3://my_bucket/my_path/ner_graphs\"\n", + "graph_folder = \"ner_graphs\"" ] }, { "cell_type": "markdown", - "metadata": { - "id": "yfxK9do4THUg" - }, + "metadata": {}, "source": [ "### Prepare NER test data" ] @@ -174,13 +153,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5UmKEfSLTJZK", - "outputId": "e16b9eaf-896e-45b1-87c5-36ef0ea7502f" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -217,9 +190,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "WdKW3pzCURcW" - }, + "metadata": {}, "source": [ "We define `TFNerDLGraphBuilder` to generate the graph and store it in the selected folder" ] @@ -227,9 +198,7 @@ { "cell_type": "code", "execution_count": null, - 
"metadata": { - "id": "AE0AU1-dUTFD" - }, + "metadata": {}, "outputs": [], "source": [ "graph_builder = TFNerDLGraphBuilder()\\\n", @@ -242,9 +211,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "tzlNfjseUVId" - }, + "metadata": {}, "source": [ "Then, we use `NerApproach`and let it use the graph generated by the builder" ] @@ -252,9 +219,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "eCdKfBnAUUTT" - }, + "metadata": {}, "outputs": [], "source": [ "ner_dl = NerDLApproach() \\\n", @@ -275,9 +240,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Juz3SWgtUYF4" - }, + "metadata": {}, "source": [ "Put pipeline together" ] @@ -285,23 +248,19 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "_AUxaelHUZj0" - }, + "metadata": {}, "outputs": [], "source": [ "ner_pipeline = sparknlp.base.Pipeline().setStages([\n", - " embeddings, \n", - " graph_builder, \n", - " ner_dl \n", + " embeddings,\n", + " graph_builder,\n", + " ner_dl\n", "])" ] }, { "cell_type": "markdown", - "metadata": { - "id": "bGKNDnohUbOK" - }, + "metadata": {}, "source": [ "Fit data" ] @@ -335,8 +294,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/dl-ner/ner_logs.ipynb b/examples/python/training/english/dl-ner/ner_logs.ipynb index 12f312d72ecf3e..8182b74503594b 100644 --- a/examples/python/training/english/dl-ner/ner_logs.ipynb +++ b/examples/python/training/english/dl-ner/ner_logs.ipynb @@ -1,18 +1,32 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "### Exporting Logs in NER training" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_logs.ipynb)\n", + "\n", + "\n", + "# Exporting Logs in NER training" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "r1hRF4ZW3j_K" - }, + "metadata": {}, + "outputs": [], + "source": [ + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -24,9 +38,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NnscekQY6zT9" - }, + "metadata": {}, "source": [ "To use S3 to store training logs, we have two options:\n", "- Defining S3 path information as well as AWS credentials while starting spark\n", @@ -36,13 +48,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "QenaYYxA3mBR", - "outputId": "6a9d5c0a-ce80-4b19-eeea-0d17ff4a94f6" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -80,9 +86,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "oiofOUwd7QTw" - }, + "metadata": {}, "source": [ "### Training NER DL" ] @@ -90,13 +94,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "eJ8AHeSr7VLM", - "outputId": "6f920dc4-50a4-4b2a-e640-65d92f3cbdc7" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -119,13 +117,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5Ok_p0tM7iwk", - "outputId": "44c8418b-f5ef-49e6-af8f-70cfb02baf5f" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -152,9 +144,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id":
"GM4JoXsA7naY" - }, + "metadata": {}, "outputs": [], "source": [ "ner_tagger = NerDLApproach() \\\n", @@ -167,20 +157,14 @@ " .setVerbose(2) \\\n", " .setDropout(0.8) \\\n", " .setBatchSize(18) \\\n", - " .setEnableOutputLogs(True)\n", + " .setEnableOutputLogs(True) \\\n", " .setOutputLogsPath(\"s3://my_bucket/my_path/ner_logs\")" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "l9AUW20Q8Aah", - "outputId": "22338e79-86c9-4023-ee90-5287fe3daeb8" - }, + "metadata": {}, "outputs": [ { "data": { @@ -188,7 +172,7 @@ "NerDLModel_4cc29d1aa9e3" ] }, - "execution_count": 10, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -242,8 +226,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/dl-ner/ner_xlnet.ipynb b/examples/python/training/english/dl-ner/ner_xlnet.ipynb index 88eebc9a6ec0d6..df183f54b96b35 100644 --- a/examples/python/training/english/dl-ner/ner_xlnet.ipynb +++ b/examples/python/training/english/dl-ner/ner_xlnet.ipynb @@ -2,31 +2,23 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "uZhJcUl06r8w" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/dl-ner/ner_xlnet.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_xlnet.ipynb)\n", "\n", "## 0.
Colab Setup" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "22mElNLo6rUI", - "outputId": "3be40661-9ca2-4b32-deae-6b8b53707d41" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "--2022-12-23 11:36:39-- http://setup.johnsnowlabs.com/colab.sh\n", "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", @@ -54,24 +46,20 @@ } ], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A_QE6hqA4WHh" - }, + "metadata": {}, "source": [ "# How to train a NER classifier with Xlnet embeddings based on Char CNNs - BiLSTM - CRF" ] }, { "cell_type": "markdown", - "metadata": { - "id": "wybDus1P4WHk" - }, + "metadata": {}, "source": [ "## Download the file into the local File System \n", "### It is a standard conll2003 format training file" @@ -79,18 +67,12 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EA0QHrLF4WHl", - "outputId": "d26c536f-3757-4b6c-a150-d511f60efc48" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "File already present.\n" ] @@ -117,9 +99,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "uYZhNUVH4WHs" - }, + "metadata": {}, "source": [ "# Read CoNLL Dataset into Spark dataframe and automagically generate features for futures tasks\n", "The readDataset method of the CoNLL class handily adds all the features required in the next steps" @@ -127,18 +107,12 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/" - }, - "id": "lQExmc684WHu", - "outputId": "fa606ffb-7cea-4a02-ee69-dde713a31945" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", "| text| document| sentence| token| pos| label|\n", @@ -180,9 +154,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "JF9dJWoW4WH6" - }, + "metadata": {}, "source": [ "# Define the NER Pipeline \n", "\n", @@ -193,18 +165,12 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Z0xFttkH4WH7", - "outputId": "531577c6-0b9f-4497-d895-5ea25ee1f570" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "xlnet_base_cased download started this may take some time.\n", "Approximate size to download 417.5 MB\n", @@ -244,27 +210,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "YpcIr8b_4WIB" - }, + "metadata": {}, "source": [ "# Fit the Pipeline and get results" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hDKsFDRy4WIC", - "outputId": "64aa76e3-eb75-496b-c871-29798a25b5fb" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n", "| text| document| sentence| token| pos| label| xlnet| ner|\n", @@ -291,27 +249,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "HFSKuv-x4WIH" - }, + "metadata": {}, "source": [ "### Checkout only result columns" ] }, { 
"cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ObW2xBPn4WII", - "outputId": "59d2dc76-85f6-45d2-eaab-a69089dc4bd3" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|text |ner |\n", @@ -328,9 +278,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "JJAr8usHLsmw" - }, + "metadata": {}, "source": [ "## Alternative Albert models \n", "\n", @@ -339,18 +287,12 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "CAGIS-vS4WIO", - "outputId": "a32af11d-6577-46c5-e382-c651eec98d1e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "albert_embeddings_albert_base_v1 download started this may take some time.\n", "Approximate size to download 42.8 MB\n", @@ -417,15 +359,6 @@ "ner_df = pipeline.fit(training_data.limit(10)).transform(training_data.limit(50))\n", "ner_df.show()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6xX4uVtcMVKF" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -447,8 +380,7 @@ "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" }, "name": "NER-Tutorial", "notebookId": 3359671281044291 diff --git a/examples/python/training/english/doc2vec/Train_Doc2Vec_and_Text_Classification.ipynb b/examples/python/training/english/doc2vec/Train_Doc2Vec_and_Text_Classification.ipynb index 44229855a8d2a4..6bb0289f731af3 100644 --- a/examples/python/training/english/doc2vec/Train_Doc2Vec_and_Text_Classification.ipynb +++ b/examples/python/training/english/doc2vec/Train_Doc2Vec_and_Text_Classification.ipynb @@ -1,23 +1,30 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/doc2vec/Train_Doc2Vec_and_Text_Classification.ipynb)\n", + "\n", + "# Document Embeddings with Doc2Vec" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "77mVF2ES4S01" - }, + "metadata": {}, "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "VCiyzqtH4VCC" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -27,40 +34,36 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "JSE7xgQc4gTg", - "outputId": "4a6296be-f211-48b9-816e-55cab2e37426" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2021-11-21 09:52:29-- 
https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_train.csv\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.92.54\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.92.54|:443... connected.\n", + "--2023-02-20 15:54:06-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_train.csv\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.231.104, 52.217.198.144, 52.216.212.120, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.231.104|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 33497180 (32M) [text/csv]\n", "Saving to: ‘aclimdb_train.csv’\n", "\n", - "aclimdb_train.csv 100%[===================>] 31.95M 81.6MB/s in 0.4s \n", + "aclimdb_train.csv 100%[===================>] 31,95M 14,6MB/s in 2,2s \n", "\n", - "2021-11-21 09:52:29 (81.6 MB/s) - ‘aclimdb_train.csv’ saved [33497180/33497180]\n", + "2023-02-20 15:54:09 (14,6 MB/s) - ‘aclimdb_train.csv’ saved [33497180/33497180]\n", "\n", - "--2021-11-21 09:52:30-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_test.csv\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.92.54\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.92.54|:443... connected.\n", + "--2023-02-20 15:54:09-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_test.csv\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.67.14, 3.5.19.152, 52.217.93.246, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.67.14|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 32715164 (31M) [text/csv]\n", "Saving to: ‘aclimdb_test.csv’\n", "\n", - "aclimdb_test.csv 100%[===================>] 31.20M 46.9MB/s in 0.7s \n", + "aclimdb_test.csv 100%[===================>] 31,20M 15,5MB/s in 2,0s \n", "\n", - "2021-11-21 09:52:30 (46.9 MB/s) - ‘aclimdb_test.csv’ saved [32715164/32715164]\n", + "2023-02-20 15:54:12 (15,5 MB/s) - ‘aclimdb_test.csv’ saved [32715164/32715164]\n", "\n" ] } @@ -72,14 +75,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "VOSCO4hg4jp9", - "outputId": "9a4ef71b-772a-4242-b947-1b6f09468ebb" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -128,10 +125,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "M_6wrm1X4nQP" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", @@ -143,14 +138,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "YFLQsOby4rPg", - "outputId": "10d4508e-9562-4ee0-cfa2-42ed37a3d0a9" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -179,7 +168,7 @@ "stops = StopWordsCleaner.pretrained()\\\n", " .setInputCols(\"normalized\")\\\n", " .setOutputCol(\"cleanedToken\")\n", - " \n", + "\n", "doc2Vec = Doc2VecApproach()\\\n", " .setInputCols(\"cleanedToken\")\\\n", " .setOutputCol(\"sentence_embeddings\")\\\n", @@ -212,10 +201,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "ZT4dQu328okt" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipelineModel = pipeline.fit(trainDataset)" @@ -223,21 +210,39 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_vTRFsKV92Yz", - "outputId": "54af004f-47dd-4038-b0b1-c1dd2c09228b" - 
}, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "total 4\n", - "-rw-r--r-- 1 root root 452 Nov 21 09:58 ClassifierDLApproach_b126569e5e91.log\n" + "total 100\n", + "-rw-r--r-- 1 root root 446 20. Feb 15:55 ClassifierDLApproach_97ff5c76d735.log\n", + "-rw-r--r-- 1 root root 317 10. Feb 16:54 ClassifierMetrics_17606bbb7d1f.log\n", + "-rw-r--r-- 1 root root 313 10. Feb 16:54 ClassifierMetrics_1a6c515483ae.log\n", + "-rw-r--r-- 1 root root 323 10. Feb 16:54 ClassifierMetrics_2530315112a8.log\n", + "-rw-r--r-- 1 root root 314 10. Feb 16:54 ClassifierMetrics_3ccf43933a23.log\n", + "-rw-r--r-- 1 root root 325 10. Feb 16:54 ClassifierMetrics_55c7e364bf2b.log\n", + "-rw-r--r-- 1 root root 325 10. Feb 16:54 ClassifierMetrics_9290b613e8d7.log\n", + "-rw-r--r-- 1 root root 317 10. Feb 16:54 ClassifierMetrics_aa0e2812a3b9.log\n", + "-rw-r--r-- 1 root root 318 10. Feb 16:54 ClassifierMetrics_ad4cb4a650fa.log\n", + "-rw-r--r-- 1 root root 312 10. Feb 16:54 ClassifierMetrics_efc7f6345e79.log\n", + "-rw-r--r-- 1 root root 319 10. Feb 16:54 ClassifierMetrics_f571876aaa09.log\n", + "-rw-r--r-- 1 root root 320 26. Okt 09:23 NerDL_0f47f69f09e6.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_10e337c8a3ef.log\n", + "-rw-r--r-- 1 root root 320 12. Jan 17:31 NerDL_18e7b1673dab.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_27f18f749174.log\n", + "-rw-r--r-- 1 root root 320 2. Aug 2022 NerDL_3ae0321ce66a.log\n", + "-rw-r--r-- 1 root root 319 26. Okt 09:13 NerDL_568d747656b8.log\n", + "-rw-r--r-- 1 root root 320 26. Okt 09:03 NerDL_5970e276422f.log\n", + "-rw-r--r-- 1 root root 320 16. Jan 11:10 NerDL_759a68c3769d.log\n", + "-rw-r--r-- 1 root root 320 3. Nov 19:22 NerDL_891f9b941985.log\n", + "-rw-r--r-- 1 root root 320 2. Feb 2022 NerDL_8e8184f259cb.log\n", + "-rw-r--r-- 1 root root 320 27. Okt 13:02 NerDL_add5b34b2ecb.log\n", + "-rw-r--r-- 1 root root 320 21. 
Okt 19:06 NerDL_bc57a96c68c3.log\n", + "-rw-r--r-- 1 root root 320 12. Jan 16:47 NerDL_ff0a43f20378.log\n", + "-rw-r--r-- 1 root root 897 10. Feb 16:54 SentimentDLApproach_98dfd2c1fdee.log\n" ] } ], @@ -247,25 +252,19 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qzYCO5j3EkAu", - "outputId": "2f225170-73f1-41d9-de9a-2353e3d8610a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 25000 - classes: 2\n", - "Epoch 0/5 - 6.51s - loss: 184.16612 - acc: 0.8153926 - batches: 391\n", - "Epoch 1/5 - 5.91s - loss: 178.30418 - acc: 0.8358334 - batches: 391\n", - "Epoch 2/5 - 5.65s - loss: 179.25107 - acc: 0.84036857 - batches: 391\n", - "Epoch 3/5 - 6.31s - loss: 178.86932 - acc: 0.84237176 - batches: 391\n", - "Epoch 4/5 - 5.80s - loss: 178.13194 - acc: 0.84489584 - batches: 391\n" + "Epoch 0/5 - 2.27s - loss: 194.4157 - acc: 0.814335 - batches: 391\n", + "Epoch 1/5 - 1.74s - loss: 186.7701 - acc: 0.8377324 - batches: 391\n", + "Epoch 2/5 - 1.75s - loss: 184.50777 - acc: 0.8419792 - batches: 391\n", + "Epoch 3/5 - 1.79s - loss: 182.49121 - acc: 0.8430609 - batches: 391\n", + "Epoch 4/5 - 1.69s - loss: 180.77087 - acc: 0.8451843 - batches: 391\n" ] } ], @@ -275,10 +274,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "NZJuax-nFHTQ" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "prediction = pipelineModel.transform(testDataset)" @@ -286,14 +283,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "yyjerNbgFZWg", - "outputId": "fc7b650b-5e35-4f90-f40b-2deadfd0e049" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -301,8 +292,8 @@ "text": [ " precision 
recall f1-score support\n", "\n", - " negative 0.87 0.80 0.84 13575\n", - " positive 0.79 0.86 0.82 11425\n", + " negative 0.86 0.82 0.84 13143\n", + " positive 0.81 0.85 0.83 11857\n", "\n", " accuracy 0.83 25000\n", " macro avg 0.83 0.83 0.83 25000\n", @@ -321,9 +312,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ZT6UH3NJ5heL" - }, + "metadata": {}, "source": [ "## Save and Restore\n", "### Pipeline Model\n", @@ -333,27 +322,21 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "rmfetBzV5nUn", - "outputId": "181a1c0f-8ea1-4aa3-8c78-a4989ddb2920" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[DocumentAssembler_2f9c0247af19,\n", - " REGEX_TOKENIZER_1f492672ab16,\n", - " NORMALIZER_5f6019207ea3,\n", + "[DocumentAssembler_eb3006c82ed9,\n", + " REGEX_TOKENIZER_62be3e2cd631,\n", + " NORMALIZER_8c22ec321476,\n", " STOPWORDS_CLEANER_3e62acb2648b,\n", - " Doc2VecModel_7921b49ae1a0,\n", - " ClassifierDLModel_4fb2630de611]" + " Doc2VecModel_8e5707c8288a,\n", + " ClassifierDLModel_9e82f8b9ca8b]" ] }, - "execution_count": 14, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -367,10 +350,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "L1zq3lyO8cOq" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipelineModel.write().overwrite().save(\"./imdb_classifier_doc2vec_pipeline\")" @@ -378,27 +359,21 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "I5ZAJIbx8p20", - "outputId": "bfc48c93-9915-44ee-d63e-f9c2f65cad0f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[DocumentAssembler_2f9c0247af19,\n", - " REGEX_TOKENIZER_1f492672ab16,\n", - " NORMALIZER_5f6019207ea3,\n", + "[DocumentAssembler_eb3006c82ed9,\n", + " 
REGEX_TOKENIZER_62be3e2cd631,\n", + " NORMALIZER_8c22ec321476,\n", " STOPWORDS_CLEANER_3e62acb2648b,\n", - " Doc2VecModel_7921b49ae1a0,\n", - " ClassifierDLModel_4fb2630de611]" + " Doc2VecModel_8e5707c8288a,\n", + " ClassifierDLModel_9e82f8b9ca8b]" ] }, - "execution_count": 19, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -412,33 +387,27 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "m68VFQuG9Dzf", - "outputId": "328a4d6d-409e-4084-dee4-b685928cd9c2" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'class': ['positive'],\n", + "{'document': ['This movie was really good!'],\n", " 'cleanedToken': ['movie', 'good'],\n", - " 'document': ['This movie was really good!'],\n", " 'normalized': ['this', 'movie', 'was', 'really', 'good'],\n", " 'sentence_embeddings': ['movie good'],\n", - " 'token': ['This', 'movie', 'was', 'really', 'good', '!']}" + " 'token': ['This', 'movie', 'was', 'really', 'good', '!'],\n", + " 'class': ['positive']}" ] }, - "execution_count": 22, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# you can use it with Spark NLP LightPipeline \n", + "# you can use it with Spark NLP LightPipeline\n", "lp_loadedPipeline = LightPipeline(loadedPipelineModel)\n", "\n", "lp_loadedPipeline.annotate(\"This movie was really good!\")" @@ -446,14 +415,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "fOff6Sfr9VP6", - "outputId": "99923eae-b173-4937-a820-3146e285bba4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -486,9 +449,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "tnufdTmL5oyQ" - }, + "metadata": {}, "source": [ "### Annotator Models\n", "Now let's say you would like to only save the trained annotators 
inside your pipeline so you can load them inside another custom Pipeline" @@ -496,27 +457,21 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_dzzYJYQ5pJa", - "outputId": "83da0eae-3160-4b5f-983b-3101ff277ca3" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[DocumentAssembler_2f9c0247af19,\n", - " REGEX_TOKENIZER_1f492672ab16,\n", - " NORMALIZER_5f6019207ea3,\n", + "[DocumentAssembler_eb3006c82ed9,\n", + " REGEX_TOKENIZER_62be3e2cd631,\n", + " NORMALIZER_8c22ec321476,\n", " STOPWORDS_CLEANER_3e62acb2648b,\n", - " Doc2VecModel_7921b49ae1a0,\n", - " ClassifierDLModel_4fb2630de611]" + " Doc2VecModel_8e5707c8288a,\n", + " ClassifierDLModel_9e82f8b9ca8b]" ] }, - "execution_count": 29, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -528,21 +483,15 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "a0cEyPk298cd", - "outputId": "518b3aa8-070d-4cf8-e275-11eaa246dbb2" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "ClassifierDLModel_4fb2630de611\n", - "Doc2VecModel_7921b49ae1a0\n" + "ClassifierDLModel_9e82f8b9ca8b\n", + "Doc2VecModel_8e5707c8288a\n" ] } ], @@ -553,10 +502,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": { - "id": "jM16Elha-Mj3" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# let's save our ClassifierDL - let's mention it was trained by doc2vec_aclImdb as well\n", @@ -565,10 +512,8 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": { - "id": "AkFvbdQA-X1T" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# and here is our trained Doc2VecModel\n", @@ -584,10 +529,19 @@ }, "kernelspec": { "display_name": "Python 3", + "language": "python", 
"name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/entity-ruler/EntityRuler.ipynb b/examples/python/training/english/entity-ruler/EntityRuler.ipynb index 53a9f5dc26100b..c5c8478c8d7e68 100644 --- a/examples/python/training/english/entity-ruler/EntityRuler.ipynb +++ b/examples/python/training/english/entity-ruler/EntityRuler.ipynb @@ -1,28 +1,15 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 58012, - "status": "ok", - "timestamp": 1661544638962, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "6KvNW4MU5rrF", - "outputId": "4f640fd8-41e8-4f35-c6d6-ed98ab926127" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/entity-ruler/EntityRuler.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/entity-ruler/EntityRuler.ipynb)\n", + "\n", + "# Training EntityRuler" ] }, { @@ -31,67 +18,47 @@ "metadata": {}, "outputs": [], "source": [ + "# Only run this Cell when you are using Spark NLP on Google Colab\n", "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "executionInfo": { - "elapsed": 354, - "status": "ok", - "timestamp": 1661544665640, - 
"user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "XLNO3Z9r6HgR" - }, - "outputs": [], + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4.3.1\n", + "3.3.0\n" + ] + } + ], "source": [ "import sparknlp\n", "from sparknlp.base import *\n", "from sparknlp.annotator import *\n", - "from pyspark.sql import SparkSession" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "spark = sparknlp.start()" + "from pyspark.sql import SparkSession\n", + "\n", + "spark = sparknlp.start()\n", + "\n", + "print(sparknlp.version())\n", + "print(spark.version)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "YXbad43CyOKF" - }, + "metadata": {}, "source": [ "This notebook uses the default configuration (useStorage=true). This parameter tells the annotator to serialize patterns file data with RocksDB storage when saving the model." ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "executionInfo": { - "elapsed": 3769, - "status": "ok", - "timestamp": 1661544683809, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "_eB72Yzg8_Jx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "data = spark.createDataFrame([[\"Lord Eddard Stark was the head of House Stark. 
John Snow lives in Winterfell.\"]]).toDF(\"text\")" @@ -99,24 +66,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 3605, - "status": "ok", - "timestamp": 1661544687408, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "mi7ENdn0MTvt", - "outputId": "911339e2-f1fc-41cc-e1c6-d348a2fae1a9" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -137,9 +88,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "69UzeVcXCcNc" - }, + "metadata": {}, "source": [ "# Keywords Patterns" ] @@ -153,20 +102,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "executionInfo": { - "elapsed": 37, - "status": "ok", - "timestamp": 1661544687409, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "-qPpbCxYIyHy" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import json\n", @@ -192,33 +129,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Rdrsm2rfrACF" - }, + "metadata": {}, "source": [ "We are going to use a JSON file with the following format:" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 36, - "status": "ok", - "timestamp": 1661544687410, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "FbP7UtSrLnQ3", - "outputId": "a34bf1ea-25a7-4d0c-cb1b-b3efb9c08a27" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -234,29 +153,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "dgSHiRHc8eM2" - }, + "metadata": {}, "source": [ "When working with keywords, we DON'T need a pipeline with Tokenizer" ] }, { "cell_type": "code", - 
"execution_count": 11, - "metadata": { - "executionInfo": { - "elapsed": 321, - "status": "ok", - "timestamp": 1661544687703, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "tRyju8D-6XJ1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", @@ -271,20 +176,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "executionInfo": { - "elapsed": 2430, - "status": "ok", - "timestamp": 1661544690131, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "FhKPEMb09w6a" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline(stages=[document_assembler, sentence_detector, entity_ruler])\n", @@ -293,24 +186,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 1857, - "status": "ok", - "timestamp": 1661544691984, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "D7mjcA2E_ehu", - "outputId": "23b74299-77e6-443b-a6f9-9581b6f95fb6" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -331,20 +208,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "executionInfo": { - "elapsed": 15, - "status": "ok", - "timestamp": 1661544691985, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "XIEbSbpPjzvJ" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "light_pipeline = LightPipeline(pipeline_model)" @@ -352,24 +217,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - 
"executionInfo": { - "elapsed": 12, - "status": "ok", - "timestamp": 1661544691985, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "9bJw1H9lj4NS", - "outputId": "d77eb1bf-d16f-4ecf-a774-71116992c857" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -377,7 +226,7 @@ "dict_keys(['document', 'sentence', 'entity'])" ] }, - "execution_count": 15, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -389,33 +238,17 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11, - "status": "ok", - "timestamp": 1661544691986, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "tmZcr-jnljP7", - "outputId": "5ee7baf5-4d7b-4ef5-a41e-46084550adb8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Annotation(chunk, 7, 15, John Snow, {'entity': 'PERSON', 'sentence': '0'}),\n", - " Annotation(chunk, 57, 64, Jon Snow, {'entity': 'PERSON', 'sentence': '0'})]" + "[Annotation(chunk, 7, 15, John Snow, {'entity': 'PERSON', 'sentence': '0'}, []),\n", + " Annotation(chunk, 57, 64, Jon Snow, {'entity': 'PERSON', 'sentence': '0'}, [])]" ] }, - "execution_count": 16, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -426,29 +259,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "9LAxooiQNYVv" - }, + "metadata": {}, "source": [ "We can define an id field to identify entities and it supports JSON Lines format as the example below." 
] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "executionInfo": { - "elapsed": 328, - "status": "ok", - "timestamp": 1661544692307, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "V8_KVQvdBDm8" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "keywords = [\n", @@ -477,24 +296,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1661544692308, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "OWakfKMlB3Th", - "outputId": "b877fc3d-4a07-48d9-d243-581afcb48b48" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -512,20 +315,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "executionInfo": { - "elapsed": 5, - "status": "ok", - "timestamp": 1661544692308, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "-_4a1QYaNPfr" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "entity_ruler = EntityRulerApproach() \\\n", @@ -537,24 +328,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 1505, - "status": "ok", - "timestamp": 1661544693809, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "Cd0sNKNeOcUg", - "outputId": "7835c801-c821-444d-92ee-c60959a00ed4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -577,29 +352,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "lDZ21hp3rOHV" - }, + "metadata": {}, "source": [ "For the CSV file we use the following configuration:\n" ] 
}, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "executionInfo": { - "elapsed": 4, - "status": "ok", - "timestamp": 1661544693810, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "_MLFqq-ICy56" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "with open('./keywords.csv', 'w') as csvfile:\n", @@ -611,24 +372,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 319, - "status": "ok", - "timestamp": 1661544694126, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "Bz4129WyDNwd", - "outputId": "fa00dc46-a624-4b99-f817-248e4e646c28" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -647,20 +392,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "executionInfo": { - "elapsed": 3, - "status": "ok", - "timestamp": 1661544694126, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "0HLcNfrdoAmP" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "entity_ruler_csv = EntityRulerApproach() \\\n", @@ -672,20 +405,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "executionInfo": { - "elapsed": 623, - "status": "ok", - "timestamp": 1661544694747, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "NYTuwztwoHIK" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipeline_csv = Pipeline(stages=[document_assembler, sentence_detector, entity_ruler_csv])\n", @@ -694,24 +415,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - 
"elapsed": 927, - "status": "ok", - "timestamp": 1661544695359, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "qEN-zRNQoLu5", - "outputId": "bda4f882-b34c-47ea-9c0e-1aefcbbfb5f7" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -732,38 +437,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "FmLiqAYhn5DT" - }, + "metadata": {}, "source": [ "# Regex Patterns" ] }, { "cell_type": "markdown", - "metadata": { - "id": "V4h5Ulxyn-rE" - }, + "metadata": {}, "source": [ "Starting with Spark NLP 4.2.0 regex patterns must be defined at a more granular level, with each label. For example we can have the JSON file below" ] }, { "cell_type": "code", - "execution_count": 26, - "metadata": { - "executionInfo": { - "elapsed": 5, - "status": "ok", - "timestamp": 1661544695360, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "1QQvXA4Zqelm" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "data = spark.createDataFrame([[\"The address is 123456 in Winterfell\"]]).toDF(\"text\")" @@ -771,20 +460,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "executionInfo": { - "elapsed": 4, - "status": "ok", - "timestamp": 1661544695360, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "oZZWlpFknvn1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "patterns_string = \"\"\"\n", @@ -810,24 +487,8 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 302, - "status": "ok", - "timestamp": 1661544695659, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "xnp0zMqpogVU", - "outputId": 
"37018d09-d0ea-4533-9447-9a02fbdc6fca" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -843,29 +504,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "9RcZIelWorQ9" - }, + "metadata": {}, "source": [ "When defining a regex pattern, we need to define Tokenizer annotator in the pipeline" ] }, { "cell_type": "code", - "execution_count": 29, - "metadata": { - "executionInfo": { - "elapsed": 5, - "status": "ok", - "timestamp": 1661544695659, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "rCUYCM56oq-e" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenizer = Tokenizer().setInputCols(\"sentence\").setOutputCol(\"token\")" @@ -873,20 +520,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "executionInfo": { - "elapsed": 5, - "status": "ok", - "timestamp": 1661544695660, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "TJvrwk18pGqk" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "regex_entity_ruler = EntityRulerApproach() \\\n", @@ -898,20 +533,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": { - "executionInfo": { - "elapsed": 713, - "status": "ok", - "timestamp": 1661544696368, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "7v8TbKbo0Izg" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "regex_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, regex_entity_ruler])\n", @@ -920,24 +543,8 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 926, - "status": "ok", - "timestamp": 1661544697291, - "user": { - "displayName": "Danilo Burbano", - "userId": 
"08593331088765378019" - }, - "user_tz": 300 - }, - "id": "pl63WAaq0TKa", - "outputId": "e8c9987a-4462-4b81-89a8-ad69d604c62e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -983,8 +590,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/entity-ruler/EntityRuler_Alphabet.ipynb b/examples/python/training/english/entity-ruler/EntityRuler_Alphabet.ipynb index aa346e36d54ee1..a0f65fac523593 100644 --- a/examples/python/training/english/entity-ruler/EntityRuler_Alphabet.ipynb +++ b/examples/python/training/english/entity-ruler/EntityRuler_Alphabet.ipynb @@ -1,67 +1,31 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 60512, - "status": "ok", - "timestamp": 1661605164779, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "MoG6TxHvBTS_", - "outputId": "f395e09e-be37-4515-d32e-af3447fbbe28" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/entity-ruler/EntityRuler_Alphabet.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/entity-ruler/EntityRuler_Alphabet.ipynb)\n", + "\n", + "# Defining EntityRuler with an Alphabet" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "executionInfo": { - "elapsed": 6254, - "status": "ok", - "timestamp": 1661605188633, - 
"user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "zhXe9MYMBcYs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ + "# Only run this Cell when you are using Spark NLP on Google Colab\n", "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "executionInfo": { - "elapsed": 319, - "status": "ok", - "timestamp": 1661605188942, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "A3THWvj7GO12" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -81,33 +45,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Oe4Ih3IuBg0V" - }, + "metadata": {}, "source": [ - "Since Spark NLP version 4.3.0, we reduce significantly the latency of Entity Ruler by implementing Aho-Corasick algorithm. This requires defining an alphabet for some cases. For English documents, you won't need to define it because under the hood Entity Ruler annotator uses an English alphabet by default. However, for special use cases we will need to proceed like the example below:" + "Since Spark NLP version 4.3.1, we reduce significantly the latency of Entity Ruler by implementing Aho-Corasick algorithm. This requires defining an alphabet for some cases. For English documents, you won't need to define it because under the hood Entity Ruler annotator uses an English alphabet by default. 
However, for special use cases we will need to proceed like the example below:" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 7534, - "status": "ok", - "timestamp": 1661605211009, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "VnowuheCB5KQ", - "outputId": "09dcaa4f-ab90-41ca-b92d-04790ed0e277" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -129,29 +75,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "lSz72yQ4CPgI" - }, + "metadata": {}, "source": [ "The text above has an special character, an accent in vowel u (ú)" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "executionInfo": { - "elapsed": 38, - "status": "ok", - "timestamp": 1661605211010, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "fJqB0X5ZCnJ0" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import json\n", @@ -170,9 +102,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "wnJzZ35_EpI4" - }, + "metadata": {}, "source": [ "In addition, a pattern in `locations.json` file has also hyphen punctuation mark (-).\n", "So, we need to define our custom alphabet to use Entity Ruler for Tolkien's books. Here, we will define just the 2 special characters for our text." 
@@ -180,20 +110,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "executionInfo": { - "elapsed": 36, - "status": "ok", - "timestamp": 1661605211011, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "N6vAOMFGE5Et" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "alphabet = \"abcdefghijklmnopqrstuvwxyz\"\n", @@ -207,24 +125,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 290, - "status": "ok", - "timestamp": 1661605211266, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "tXR9y_qyFTgK", - "outputId": "e37de04c-43a0-4ff7-ca9b-45f8c41cb16c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -242,20 +144,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "executionInfo": { - "elapsed": 8, - "status": "ok", - "timestamp": 1661605211267, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "NKW2B_PWFVY3" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", @@ -270,20 +160,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "executionInfo": { - "elapsed": 2090, - "status": "ok", - "timestamp": 1661605213350, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "qfcJeJJcFWqE" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline(stages=[document_assembler, sentence_detector, entity_ruler])\n", @@ -292,24 +170,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 1606, - "status": "ok", - "timestamp": 1661605214949, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "OZVJLMxuFX_M", - "outputId": "776ea9cc-3e42-41a1-ddc2-6270a72a0670" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -330,9 +192,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "AGpnI4Z5FZSk" - }, + "metadata": {}, "source": [ "If you don't define the required alphabet, you will get this error: \n", "\n", @@ -345,42 +205,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "X2UyyMf6HCP5" - }, + "metadata": {}, "source": [ "# Non-English Languages" ] }, { "cell_type": "markdown", - "metadata": { - "id": "-OVPFyufHHLV" - }, + "metadata": {}, "source": [ "EntityRuler has some predefined alphabets for the most common languages: English, Spanish, French, and German. So, if you have documents in Spanish, you just need to set an alphabet like the example below:" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 321, - "status": "ok", - "timestamp": 1661605215261, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "5cWTH7XSG49Z", - "outputId": "24998614-f891-4158-eab1-fbf8a8b4e7d7" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -402,20 +242,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "executionInfo": { - "elapsed": 6, - "status": "ok", - "timestamp": 1661605215262, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "PDYUq3loHqOV" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "entity_ruler = EntityRulerApproach() \\\n", @@ -427,20 +255,8 @@ }, { 
"cell_type": "code", - "execution_count": 15, - "metadata": { - "executionInfo": { - "elapsed": 313, - "status": "ok", - "timestamp": 1661605215570, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "CoalQ1ttH-jN" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline(stages=[document_assembler, sentence_detector, entity_ruler])\n", @@ -449,24 +265,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 617, - "status": "ok", - "timestamp": 1661605216181, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "udZMwNuiIA2n", - "outputId": "5b5ed233-5d62-4fe1-b5d8-4dfdf6f0aea8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -487,9 +287,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_IDY3uG4IMfV" - }, + "metadata": {}, "source": [ "If your language is not a predefined alphabet, you will need to define all the characters of your alphabet, as shown in the first example. \n", "Keep in mind that an alphabet may require not only letters but also numbers, punctuation marks, and symbol characters." 
@@ -516,8 +314,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/entity-ruler/EntityRuler_LightPipeline.ipynb b/examples/python/training/english/entity-ruler/EntityRuler_LightPipeline.ipynb index 593e2dfdd00e74..095b49d2e1c3d9 100644 --- a/examples/python/training/english/entity-ruler/EntityRuler_LightPipeline.ipynb +++ b/examples/python/training/english/entity-ruler/EntityRuler_LightPipeline.ipynb @@ -1,82 +1,46 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 57305, - "status": "ok", - "timestamp": 1661544131455, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "6KvNW4MU5rrF", - "outputId": "d5299652-c828-48d3-e7ee-c10c9f733586" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/entity-ruler/EntityRuler_LightPipeline.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/entity-ruler/EntityRuler_LightPipeline.ipynb)\n", + "\n", + "# EntityRuler in a LightPipeline" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "executionInfo": { - "elapsed": 6337, - "status": "ok", - "timestamp": 1661544177059, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "1953uewL-Jll" - }, + "execution_count": null, + "metadata": {}, "outputs": 
[], "source": [ + "# Only run this Cell when you are using Spark NLP on Google Colab\n", "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "id": "P_GiBDlsja-o" - }, + "metadata": {}, "source": [ "This notebook showcases serialization and LightPipeline for EntityRuler" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "executionInfo": { - "elapsed": 344, - "status": "ok", - "timestamp": 1661544177397, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "XLNO3Z9r6HgR" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", "from sparknlp.base import *\n", "from sparknlp.annotator import *\n", - "from pyspark.sql import SparkSession" + "from pyspark.sql import SparkSession\n", + "\n", + "spark = sparknlp.start()" ] }, { @@ -84,47 +48,14 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "spark = sparknlp.start()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "executionInfo": { - "elapsed": 3492, - "status": "ok", - "timestamp": 1661544192888, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "_eB72Yzg8_Jx" - }, - "outputs": [], "source": [ "data = spark.createDataFrame([[\"\"]]).toDF(\"text\")" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "executionInfo": { - "elapsed": 9, - "status": "ok", - "timestamp": 1661544192889, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "OmOTCKpV84Xs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import json\n", @@ -150,20 +81,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "executionInfo": { - "elapsed": 2713, - "status": "ok", - "timestamp": 1661544195595, - "user": { - 
"displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "tRyju8D-6XJ1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "entity_ruler = EntityRulerApproach() \\\n", @@ -177,20 +96,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "executionInfo": { - "elapsed": 859, - "status": "ok", - "timestamp": 1661544196447, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "OqFTxvxRC5aw" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "entity_ruler_loaded = EntityRulerModel().load(\"tmp_entity_ruler_model\")" @@ -198,20 +105,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "executionInfo": { - "elapsed": 855, - "status": "ok", - "timestamp": 1661544197298, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "jtMK0ZekjSeB" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", @@ -223,24 +118,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 5, - "status": "ok", - "timestamp": 1661544197299, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "d7qy0hxlkX3u", - "outputId": "4f0e696d-7de1-4796-cdfd-28d594815911" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -286,8 +165,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/entity-ruler/EntityRuler_Without_Storage.ipynb 
b/examples/python/training/english/entity-ruler/EntityRuler_Without_Storage.ipynb index cf93a0f6d09ca9..ecb65628fdbf3b 100644 --- a/examples/python/training/english/entity-ruler/EntityRuler_Without_Storage.ipynb +++ b/examples/python/training/english/entity-ruler/EntityRuler_Without_Storage.ipynb @@ -1,91 +1,53 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 59873, - "status": "ok", - "timestamp": 1661544296082, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "6KvNW4MU5rrF", - "outputId": "98cff245-1318-482f-816d-2283c7ca0f86" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/entity-ruler/EntityRuler_Without_Storage.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/entity-ruler/EntityRuler_Without_Storage.ipynb)\n", + "\n", + "# EntityRuler without Storage" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "executionInfo": { - "elapsed": 13788, - "status": "ok", - "timestamp": 1661544344949, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "ZXU_LZZUJI6V" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ + "# Only run this Cell when you are using Spark NLP on Google Colab\n", "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "id": "cy1qmHPFzjyK" - }, + "metadata": {}, "source": [ "This notebook uses useStorage 
parameter as false (default). So the annotator will serialize patterns file data with SparkML parameters when saving the model." ] }, { "cell_type": "markdown", - "metadata": { - "id": "0uWBDs0a1HaF" - }, + "metadata": {}, "source": [ "**We recommend using the default value (setUseStorage=False), as shown in this notebook since the results of our benchmarks reflect that this configuration is faster than setUseStorage=True**" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "executionInfo": { - "elapsed": 489, - "status": "ok", - "timestamp": 1661544345433, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "XLNO3Z9r6HgR" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", "from sparknlp.base import *\n", "from sparknlp.annotator import *\n", - "from pyspark.sql import SparkSession" + "from pyspark.sql import SparkSession\n", + "\n", + "spark = sparknlp.start()" ] }, { @@ -93,51 +55,14 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "spark = sparknlp.start()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "executionInfo": { - "elapsed": 4048, - "status": "ok", - "timestamp": 1661544362278, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "_eB72Yzg8_Jx" - }, - "outputs": [], "source": [ "data = spark.createDataFrame([[\"Lord Eddard Stark was the head of House Stark. 
John Snow lives in Winterfell.\"]]).toDF(\"text\")" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 3265, - "status": "ok", - "timestamp": 1661544365513, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "mi7ENdn0MTvt", - "outputId": "a7801922-557b-4403-9b2f-9524f719bb69" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -158,29 +83,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "69UzeVcXCcNc" - }, + "metadata": {}, "source": [ "# Keywords Patterns" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "executionInfo": { - "elapsed": 36, - "status": "ok", - "timestamp": 1661544365513, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "-qPpbCxYIyHy" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import json\n", @@ -206,33 +117,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Rdrsm2rfrACF" - }, + "metadata": {}, "source": [ "We are going to use a JSON file with the following format:" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 35, - "status": "ok", - "timestamp": 1661544365514, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "FbP7UtSrLnQ3", - "outputId": "fd8d47dd-1f4d-4dc5-c5b4-22675cd93fbd" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -248,29 +141,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "dgSHiRHc8eM2" - }, + "metadata": {}, "source": [ "When working with keywords, we DON'T need a pipeline with Tokenizer anymore." 
] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "executionInfo": { - "elapsed": 27, - "status": "ok", - "timestamp": 1661544365514, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "tRyju8D-6XJ1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", @@ -284,20 +163,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "executionInfo": { - "elapsed": 1803, - "status": "ok", - "timestamp": 1661544367291, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "FhKPEMb09w6a" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline(stages=[document_assembler, sentence_detector, entity_ruler])\n", @@ -306,24 +173,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 2129, - "status": "ok", - "timestamp": 1661544369417, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "D7mjcA2E_ehu", - "outputId": "9cabfe81-5ba2-4586-8f4d-c36cffcd3706" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -344,20 +195,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "executionInfo": { - "elapsed": 23, - "status": "ok", - "timestamp": 1661544369418, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "XIEbSbpPjzvJ" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "light_pipeline = LightPipeline(pipeline_model)" @@ -365,24 +204,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 21, - "status": "ok", - "timestamp": 1661544369418, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "9bJw1H9lj4NS", - "outputId": "b8436081-6cbc-45ea-ab0f-eb40accd5089" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -390,7 +213,7 @@ "dict_keys(['document', 'sentence', 'entity'])" ] }, - "execution_count": 14, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -402,33 +225,17 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 15, - "status": "ok", - "timestamp": 1661544369419, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "tmZcr-jnljP7", - "outputId": "79c29d19-9daa-4cae-9daa-53d1797fe940" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Annotation(chunk, 7, 15, John Snow, {'entity': 'PERSON', 'sentence': '0'}),\n", - " Annotation(chunk, 57, 64, Jon Snow, {'entity': 'PERSON', 'sentence': '0'})]" + "[Annotation(chunk, 7, 15, John Snow, {'entity': 'PERSON', 'sentence': '0'}, []),\n", + " Annotation(chunk, 57, 64, Jon Snow, {'entity': 'PERSON', 'sentence': '0'}, [])]" ] }, - "execution_count": 15, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -439,29 +246,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "9LAxooiQNYVv" - }, + "metadata": {}, "source": [ "We can define an id field to identify entities and it supports JSON Lines format as the example below." 
] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "executionInfo": { - "elapsed": 11, - "status": "ok", - "timestamp": 1661544369420, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "V8_KVQvdBDm8" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "keywords = [\n", @@ -490,24 +283,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 502, - "status": "ok", - "timestamp": 1661544369912, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "OWakfKMlB3Th", - "outputId": "89d7523e-4fb6-4c8a-b041-844d8d5de119" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -525,20 +302,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1661544369913, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "-_4a1QYaNPfr" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "entity_ruler = EntityRulerApproach() \\\n", @@ -549,24 +314,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 897, - "status": "ok", - "timestamp": 1661544370801, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "Cd0sNKNeOcUg", - "outputId": "84c5835f-de67-44da-c10d-93204d66c6ff" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -589,29 +338,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "lDZ21hp3rOHV" - }, + "metadata": {}, "source": [ "For the CSV file we use the following configuration:\n" ] 
}, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "executionInfo": { - "elapsed": 13, - "status": "ok", - "timestamp": 1661544370802, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "_MLFqq-ICy56" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "with open('./keywords.csv', 'w') as csvfile:\n", @@ -623,24 +358,8 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 441, - "status": "ok", - "timestamp": 1661544371232, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "Bz4129WyDNwd", - "outputId": "fe939959-59b8-43cd-cf39-ae2181988c81" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -659,20 +378,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1661544371233, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "0HLcNfrdoAmP" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "entity_ruler_csv = EntityRulerApproach() \\\n", @@ -683,20 +390,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1661544371234, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "NYTuwztwoHIK" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipeline_csv = Pipeline(stages=[document_assembler, sentence_detector, entity_ruler_csv])\n", @@ -705,24 +400,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - 
"elapsed": 845, - "status": "ok", - "timestamp": 1661544372070, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "qEN-zRNQoLu5", - "outputId": "91d0470d-ff37-4dca-b958-32895932722c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -743,38 +422,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "FmLiqAYhn5DT" - }, + "metadata": {}, "source": [ "# Regex Patterns" ] }, { "cell_type": "markdown", - "metadata": { - "id": "V4h5Ulxyn-rE" - }, + "metadata": {}, "source": [ "Starting Spark NLP 4.2.0, regex patterns are defined at a more granular level, with each label. For example, we can have the JSON file below:" ] }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "executionInfo": { - "elapsed": 18, - "status": "ok", - "timestamp": 1661544372071, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "1QQvXA4Zqelm" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "data = spark.createDataFrame([[\"The address is 123456 in Winterfell\"]]).toDF(\"text\")" @@ -782,20 +445,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": { - "executionInfo": { - "elapsed": 18, - "status": "ok", - "timestamp": 1661544372072, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "oZZWlpFknvn1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "patterns_string = \"\"\"\n", @@ -821,24 +472,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 18, - "status": "ok", - "timestamp": 1661544372073, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "xnp0zMqpogVU", - "outputId": 
"6098f1da-d789-4ad0-da1d-3c49180ccc1d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -854,29 +489,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "9RcZIelWorQ9" - }, + "metadata": {}, "source": [ "When defining a regex pattern, we need to define Tokenizer annotator in the pipeline" ] }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "executionInfo": { - "elapsed": 13, - "status": "ok", - "timestamp": 1661544372074, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "rCUYCM56oq-e" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenizer = Tokenizer().setInputCols(\"sentence\").setOutputCol(\"token\")" @@ -884,20 +505,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": { - "executionInfo": { - "elapsed": 414, - "status": "ok", - "timestamp": 1661544372478, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "TJvrwk18pGqk" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "regex_entity_ruler = EntityRulerApproach() \\\n", @@ -908,20 +517,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1661544372479, - "user": { - "displayName": "Danilo Burbano", - "userId": "08593331088765378019" - }, - "user_tz": 300 - }, - "id": "7v8TbKbo0Izg" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "regex_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, regex_entity_ruler])\n", @@ -930,24 +527,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 849, - "status": "ok", - "timestamp": 1661544373319, - "user": { - "displayName": "Danilo Burbano", - "userId": 
"08593331088765378019" - }, - "user_tz": 300 - }, - "id": "pl63WAaq0TKa", - "outputId": "38fc2401-571b-476c-d222-ceb11c3e3d35" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -992,8 +573,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/dictionary-sentiment/sentiment.ipynb b/examples/python/training/english/sentiment-detection/RuleBasedSentiment.ipynb similarity index 65% rename from examples/python/training/english/dictionary-sentiment/sentiment.ipynb rename to examples/python/training/english/sentiment-detection/RuleBasedSentiment.ipynb index 07ca9fe12e6d44..b378e502580d8b 100644 --- a/examples/python/training/english/dictionary-sentiment/sentiment.ipynb +++ b/examples/python/training/english/sentiment-detection/RuleBasedSentiment.ipynb @@ -1,108 +1,59 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "b4BBN50oyiwG" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/dictionary-sentiment/sentiment.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/sentiment-detection/RuleBasedSentiment.ipynb)\n", "\n", - "## 0. 
Colab Setup" + "# Rule-based Sentiment Analysis" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nTH23Yu1yqfD", - "outputId": "775f3049-be2a-4845-f487-66917347a3bf" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 11:26:26-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 11:26:26-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 11:26:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 11:26:26 (60.4 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 53 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 65.4 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 54.6 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "4Ow6rjyOyiwN" - }, + "metadata": {}, "source": [ - "## Rule-based Sentiment Analysis\n", - "\n", "In the following example, we walk-through a simple use case for our straight forward SentimentDetector annotator.\n", "\n", "This annotator will work on top of a list of labeled sentences which can have any of the following features\n", - " \n", + "\n", " positive\n", " negative\n", " revert\n", " increment\n", " decrement\n", "\n", - "Each of these sentences will be used for giving a score to text " + "Each of these sentences will be used for giving a score to text" ] }, { "cell_type": "markdown", - "metadata": { - "id": "K_1aCdWNyiwQ" - }, + "metadata": {}, "source": [ "#### 1. 
Call necessary imports and set the resource path to read local data files" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "jIH8pFdPyiwS" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "#Imports\n", "import sys\n", - "sys.path.append('../../')\n", "\n", "import sparknlp\n", "\n", @@ -116,30 +67,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "58CQiS99yiwh" - }, + "metadata": {}, "source": [ "#### 2. Load SparkSession if not already there" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Ub7u0Z2yyiwj", - "outputId": "34b4f4db-defc-4e52-e17e-2ad17af97e4c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "Spark NLP version: 4.2.6\n", - "Apache Spark version: 3.2.3\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -154,52 +97,48 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "XYXJ8Lrhyiwz", - "outputId": "e2fdbc33-f558-401f-91f9-bb5dd904e40e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "rm: cannot remove '/tmp/sentiment.parquet.zip': No such file or directory\n", - "--2022-12-23 11:28:03-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment.parquet.zip\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.136.198, 52.216.112.141, 52.217.137.208, ...\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.136.198|:443... 
connected.\n", + "--2023-02-20 17:34:18-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment.parquet.zip\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.77.126, 52.216.88.173, 54.231.136.0, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.77.126|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 76127532 (73M) [application/zip]\n", "Saving to: ‘/tmp/sentiment.parquet.zip’\n", "\n", - "sentiment.parquet.z 100%[===================>] 72.60M 57.7MB/s in 1.3s \n", + "sentiment.parquet.z 100%[===================>] 72,60M 22,5MB/s in 4,0s \n", "\n", - "2022-12-23 11:28:05 (57.7 MB/s) - ‘/tmp/sentiment.parquet.zip’ saved [76127532/76127532]\n", + "2023-02-20 17:34:22 (18,3 MB/s) - ‘/tmp/sentiment.parquet.zip’ saved [76127532/76127532]\n", "\n", - "--2022-12-23 11:28:05-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/lemma-corpus-small/lemmas_small.txt\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.136.198, 52.216.112.141, 52.217.137.208, ...\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.136.198|:443... connected.\n", + "--2023-02-20 17:34:23-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/lemma-corpus-small/lemmas_small.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.135.240, 54.231.224.176, 52.217.196.88, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.135.240|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 189437 (185K) [text/plain]\n", "Saving to: ‘/tmp/lemmas_small.txt’\n", "\n", - "lemmas_small.txt 100%[===================>] 185.00K --.-KB/s in 0.09s \n", + "lemmas_small.txt 100%[===================>] 185,00K 541KB/s in 0,3s \n", "\n", - "2022-12-23 11:28:05 (2.08 MB/s) - ‘/tmp/lemmas_small.txt’ saved [189437/189437]\n", + "2023-02-20 17:34:23 (541 KB/s) - ‘/tmp/lemmas_small.txt’ saved [189437/189437]\n", "\n", - "--2022-12-23 11:28:05-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/default-sentiment-dict.txt\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.136.198, 52.216.112.141, 52.217.137.208, ...\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.136.198|:443... connected.\n", + "--2023-02-20 17:34:24-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/default-sentiment-dict.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.138.160, 52.216.107.206, 52.217.172.96, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.138.160|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 289 [text/plain]\n", "Saving to: ‘/tmp/default-sentiment-dict.txt’\n", "\n", - "default-sentiment-d 100%[===================>] 289 --.-KB/s in 0.001s \n", + "default-sentiment-d 100%[===================>] 289 --.-KB/s in 0s \n", "\n", - "2022-12-23 11:28:06 (334 KB/s) - ‘/tmp/default-sentiment-dict.txt’ saved [289/289]\n", + "2023-02-20 17:34:24 (1,84 MB/s) - ‘/tmp/default-sentiment-dict.txt’ saved [289/289]\n", "\n" ] } @@ -209,23 +148,17 @@ "! rm -rf /tmp/sentiment.parquet\n", "! wget -N https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment.parquet.zip -P /tmp\n", "! wget -N https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/lemma-corpus-small/lemmas_small.txt -P /tmp\n", - "! 
wget -N https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/default-sentiment-dict.txt -P /tmp " + "! wget -N https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/default-sentiment-dict.txt -P /tmp" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zu_lzjvXyiw6", - "outputId": "fcf13542-c6b8-4349-ec81-5c44df492ea8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Archive: /tmp/sentiment.parquet.zip\n", " creating: /tmp/sentiment.parquet/\n", @@ -247,18 +180,12 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8ycCJ0Vyyiw_", - "outputId": "f424406a-7eb1-42e4-b0f1-d7701654fe72" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+---------+--------------------+\n", "|itemid|sentiment| text|\n", @@ -300,19 +227,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "HPH7HLK8yixE" - }, + "metadata": {}, "source": [ "#### 3. Create appropriate annotators. We are using Sentence Detection, Tokenizing the sentences, and find the lemmas of those tokens. The Finisher will only output the Sentiment." 
] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "rPDSRAXtyixG" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler() \\\n", @@ -330,12 +253,12 @@ " .setInputCols([\"token\"]) \\\n", " .setOutputCol(\"lemma\") \\\n", " .setDictionary(\"/tmp/lemmas_small.txt\", key_delimiter=\"->\", value_delimiter=\"\\t\")\n", - " \n", + "\n", "sentiment_detector = SentimentDetector() \\\n", " .setInputCols([\"lemma\", \"sentence\"]) \\\n", " .setOutputCol(\"sentiment_score\") \\\n", " .setDictionary(\"/tmp/default-sentiment-dict.txt\", \",\")\n", - " \n", + "\n", "finisher = Finisher() \\\n", " .setInputCols([\"sentiment_score\"]) \\\n", " .setOutputCols([\"sentiment\"])" @@ -343,19 +266,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "3tYe_QijyixO" - }, + "metadata": {}, "source": [ "#### 4. Train the pipeline, which is only being trained from external resources, not from the dataset we pass on. The prediction runs on the target dataset" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "o53EAomsyixQ" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, lemmatizer, sentiment_detector, finisher])\n", @@ -365,27 +284,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MgvlQ7TiyixV" - }, + "metadata": {}, "source": [ "#### 5. 
filter the finisher output, to find the positive sentiment lines" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "FD8jYLEsyixW", - "outputId": "a84b1597-ac5e-48aa-ff07-fe270c996345" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+----------+------------------------------------------------------------------------------------------------------------------------------------+\n", "|itemid|sentiment |text |\n", @@ -409,15 +320,6 @@ "source": [ "result.where(array_contains(result.sentiment, \"positive\")).show(10,False)" ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "j8pjkB7Zyixd" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -439,8 +341,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb b/examples/python/training/english/sentiment-detection/VivekNarayanSentimentApproach.ipynb similarity index 72% rename from examples/python/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb rename to examples/python/training/english/sentiment-detection/VivekNarayanSentimentApproach.ipynb index b8eca27b2ebfe6..0ed6f28ee17fbc 100644 --- a/examples/python/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb +++ b/examples/python/training/english/sentiment-detection/VivekNarayanSentimentApproach.ipynb @@ -1,74 +1,32 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "YI2vj-VJyzM-" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/sentiment-detection/VivekNarayanSentimentApproach.ipynb)\n", "\n", - "## 0. Colab Setup" + "# Vivekn Sentiment Analysis" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wfXHpaBVy8PY", - "outputId": "a66383aa-fffc-4f50-c2b4-2c6311985c86" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 11:57:17-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 11:57:17-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 11:57:17-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 11:57:17 (34.3 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 44 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 49.5 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 50.6 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "N3lJrZweyzNA" - }, + "metadata": {}, "source": [ - "## Vivekn Sentiment Analysis\n", - "\n", "In the following example, we walk-through Sentiment Analysis training and prediction using Spark NLP Annotators.\n", "\n", "The ViveknSentimentApproach annotator will compute [Vivek Narayanan algorithm](https://arxiv.org/pdf/1305.6143.pdf) with either a column in training dataset with rows labelled 'positive' or 'negative' or a folder full of positive text and a folder with negative text. Using n-grams and negation of sequences, this statistical model can achieve high accuracy if trained properly.\n", @@ -80,22 +38,17 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "zWmdcLPGyzNB" - }, + "metadata": {}, "source": [ "#### 1. 
Call necessary imports and set the resource path to read local data files" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "1KcgP4dWyzNC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "#Imports\n", "import time\n", "import sys\n", "import os\n", @@ -112,30 +65,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "JvGfY8_jyzNI" - }, + "metadata": {}, "source": [ "#### 2. Load SparkSession if not already there" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "oycji8wiyzNJ", - "outputId": "3604d242-a35f-4faa-8c5c-f29d603d807e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "Spark NLP version: 4.2.6\n", - "Apache Spark version: 3.2.3\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -148,40 +93,36 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "T4gVI6pwyzNP", - "outputId": "751df905-954c-4707-88cf-8dc7f6fb941a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "--2022-12-23 11:59:02-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/spell/words.txt\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.166.48, 52.217.203.104, 3.5.20.150, ...\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.166.48|:443... connected.\n", + "--2023-02-20 17:32:19-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/spell/words.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 
52.217.166.80, 52.217.174.96, 54.231.131.48, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.166.80|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", - "Length: 4862966 (4.6M) [text/plain]\n", + "Length: 4862966 (4,6M) [text/plain]\n", "Saving to: ‘/tmp/words.txt’\n", "\n", - "words.txt 100%[===================>] 4.64M 30.9MB/s in 0.2s \n", + "words.txt 100%[===================>] 4,64M 5,17MB/s in 0,9s \n", "\n", - "2022-12-23 11:59:03 (30.9 MB/s) - ‘/tmp/words.txt’ saved [4862966/4862966]\n", + "2023-02-20 17:32:21 (5,17 MB/s) - ‘/tmp/words.txt’ saved [4862966/4862966]\n", "\n", - "--2022-12-23 11:59:03-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment.parquet.zip\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.166.48, 52.217.203.104, 3.5.20.150, ...\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.166.48|:443... connected.\n", + "--2023-02-20 17:32:21-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment.parquet.zip\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.94.30, 54.231.132.80, 52.217.36.182, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.94.30|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 76127532 (73M) [application/zip]\n", "Saving to: ‘/tmp/sentiment.parquet.zip’\n", "\n", - "sentiment.parquet.z 100%[===================>] 72.60M 55.2MB/s in 1.3s \n", + "sentiment.parquet.z 100%[===================>] 72,60M 22,6MB/s in 3,9s \n", "\n", - "2022-12-23 11:59:05 (55.2 MB/s) - ‘/tmp/sentiment.parquet.zip’ saved [76127532/76127532]\n", + "2023-02-20 17:32:26 (18,8 MB/s) - ‘/tmp/sentiment.parquet.zip’ saved [76127532/76127532]\n", "\n", "Archive: /tmp/sentiment.parquet.zip\n", " creating: /tmp/sentiment.parquet/\n", @@ -206,27 +147,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "6-8QQ6YMyzNZ" - }, + "metadata": {}, "source": [ " #### 3. Load a spark dataset and put it in memory" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6iVXyeX5yzNa", - "outputId": "233cdf4f-be44-4e38-d115-bd5e56653a29" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+---------+--------------------+---------------+\n", "|itemid|sentiment| text|sentiment_label|\n", @@ -270,19 +203,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "RTiRUnXHyzNi" - }, + "metadata": {}, "source": [ "#### 4. 
Create the document assembler, which will put target text column into Annotation form" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "I7kDWrFZyzNj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "### Define the dataframe\n", @@ -293,18 +222,12 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6Vi5ImpwyzNq", - "outputId": "71448be7-be60-4689-95ad-6ec6c21ecaac" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+---------+--------------------+---------------+--------------------+\n", "|itemid|sentiment| text|sentiment_label| document|\n", @@ -328,19 +251,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "DqFWhtGZyzN0" - }, + "metadata": {}, "source": [ "#### 5. Create Sentence detector to parse sub sentences in every document" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "HK4qRt2tyzN1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "### Sentence detector\n", @@ -351,18 +270,12 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "7pkcAyQnyzN8", - "outputId": "aabb86f1-1515-403b-deb8-f22bc0d130a8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+---------+--------------------+---------------+--------------------+--------------------+\n", "|itemid|sentiment| text|sentiment_label| document| sentence|\n", @@ -386,19 +299,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "JaVLnDbxyzOA" - }, + "metadata": {}, "source": [ "#### 6. 
The tokenizer will match standard tokens" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "vwBEG3y6yzOB" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "### Tokenizer\n", @@ -409,18 +318,12 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "40PP804uyzOE", - "outputId": "84e55609-a666-4f65-c91f-f5cc648f58b0" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+---------+--------------------+---------------+--------------------+--------------------+--------------------+\n", "|itemid|sentiment| text|sentiment_label| document| sentence| token|\n", @@ -444,19 +347,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "3LhoPH8fyzOJ" - }, + "metadata": {}, "source": [ "#### 7. Normalizer will clean out the tokens" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "cDOtkZF7yzOK" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "normalizer = Normalizer() \\\n", @@ -466,19 +365,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "CvMB0iMGyzOP" - }, + "metadata": {}, "source": [ "#### 8. The spell checker will correct normalized tokens, this trains with a dictionary of english words" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "_EziC6v0yzOP" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "### Spell Checker\n", @@ -490,19 +385,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "f0zDsQloyzOT" - }, + "metadata": {}, "source": [ "#### 9. 
Create the ViveknSentimentApproach and set resources to train it" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "jgGbnXcryzOU" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sentiment_detector = ViveknSentimentApproach() \\\n", @@ -514,19 +405,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "8A1uXXmxyzOd" - }, + "metadata": {}, "source": [ "#### 10. The finisher will utilize sentiment analysis output" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "EcJeVOzVyzOe" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "finisher = Finisher() \\\n", @@ -536,29 +423,21 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ccQhdcDXyzOk" - }, + "metadata": {}, "source": [ "##### 11. Fit and predict over data" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "btpI76ViyzOl", - "outputId": "9b8b6718-6268-4f27-ec79-9e7444658d27" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "Time elapsed pipeline process: 22.8889741897583\n" + "Time elapsed pipeline process: 5.950879335403442\n" ] } ], @@ -582,27 +461,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NcYkKyN-yzOq" - }, + "metadata": {}, "source": [ "##### 13. 
Check the result" ] }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wdOIFzD7yzOr", - "outputId": "e068d899-37aa-407d-89ff-ae64c3711de8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+------------------------------------------------------------------------------------------------------------------------------------+---------------+----------------------------------------+\n", "|itemid|text |sentiment_label|finished_sentiment |\n", @@ -624,24 +495,18 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wPvfyTPdyzOw", - "outputId": "72cfc7fd-a27a-45e2-f1bc-512648c4874d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "pyspark.sql.dataframe.DataFrame" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 20 + "output_type": "execute_result" } ], "source": [ @@ -650,18 +515,12 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "h0vCTEo5yzO2", - "outputId": "86a45c72-ff86-4489-9704-601363130a58" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "is so sad for my APL friend............. -> ['negative']\n", "I missed the New Moon trailer... 
-> ['negative']\n", @@ -679,18 +538,12 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "MM47a2PHyzPC", - "outputId": "df713d0a-7d7b-45ab-8e22-ed13242a2bfe" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "omg its already 7:30 :O -> ['positive']\n", "Juuuuuuuuuuuuuuuuussssst Chillin!! -> ['positive']\n", @@ -705,15 +558,6 @@ "for r in sentiment_data.where(array_contains(sentiment_data.finished_sentiment, \"positive\")).take(5):\n", " print(r['text'].strip(),\"->\",r['finished_sentiment'])" ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "id": "9QagrcsKyzPK" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -736,8 +580,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/english/word2vec/Train_Word2Vec_and_Named_Entity_Recognition.ipynb b/examples/python/training/english/word2vec/Train_Word2Vec_and_Named_Entity_Recognition.ipynb index 1e2d10dd8adabf..00b01b40885148 100644 --- a/examples/python/training/english/word2vec/Train_Word2Vec_and_Named_Entity_Recognition.ipynb +++ b/examples/python/training/english/word2vec/Train_Word2Vec_and_Named_Entity_Recognition.ipynb @@ -1,216 +1,32 @@ { "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/word2vec/Train_Word2Vec_and_Named_Entity_Recognition.ipynb)\n", + "\n", + "# Train NER Model with Word2Vec Embeddings" + ] + }, { 
"cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "77mVF2ES4S01" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ + "# Only run this Cell when you are using Spark NLP on Google Colab\n", "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "VCiyzqtH4VCC" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":: loading settings :: url = jar:file:/Users/maziyar/anaconda3/envs/sparknlp/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Ivy Default Cache set to: /Users/maziyar/.ivy2/cache\n", - "The jars for the packages stored in: /Users/maziyar/.ivy2/jars\n", - "com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency\n", - ":: resolving dependencies :: org.apache.spark#spark-submit-parent-074434f8-be75-400c-9b86-3fd89d7cbdf4;1.0\n", - "\tconfs: [default]\n", - "\tfound com.johnsnowlabs.nlp#spark-nlp_2.12;4.2.6 in central\n", - "\tfound com.typesafe#config;1.4.2 in spark-list\n", - "\tfound org.rocksdb#rocksdbjni;6.29.5 in central\n", - "\tfound com.amazonaws#aws-java-sdk-bundle;1.11.828 in central\n", - "\tfound com.github.universal-automata#liblevenshtein;3.0.0 in spark-list\n", - "\tfound com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in spark-list\n", - "\tfound com.google.protobuf#protobuf-java;3.0.0-beta-3 in spark-list\n", - "\tfound com.google.code.gson#gson;2.3 in spark-list\n", - "\tfound it.unimi.dsi#fastutil;7.0.12 in spark-list\n", - "\tfound org.projectlombok#lombok;1.16.8 in spark-list\n", - "\tfound com.google.cloud#google-cloud-storage;2.15.0 in central\n", - 
"\tfound com.google.guava#guava;31.1-jre in central\n", - "\tfound com.google.guava#failureaccess;1.0.1 in local-m2-cache\n", - "\tfound com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in local-m2-cache\n", - "\tfound com.google.j2objc#j2objc-annotations;1.3 in local-m2-cache\n", - "\tfound com.google.http-client#google-http-client;1.42.3 in central\n", - "\tfound io.opencensus#opencensus-contrib-http-util;0.31.1 in central\n", - "\tfound com.google.http-client#google-http-client-jackson2;1.42.3 in central\n", - "\tfound com.google.http-client#google-http-client-gson;1.42.3 in central\n", - "\tfound com.google.api-client#google-api-client;2.0.1 in central\n", - "\tfound com.google.oauth-client#google-oauth-client;1.34.1 in central\n", - "\tfound com.google.http-client#google-http-client-apache-v2;1.42.3 in central\n", - "\tfound com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 in central\n", - "\tfound com.google.code.gson#gson;2.10 in central\n", - "\tfound com.google.cloud#google-cloud-core;2.8.27 in central\n", - "\tfound com.google.auto.value#auto-value-annotations;1.10 in central\n", - "\tfound com.google.cloud#google-cloud-core-http;2.8.27 in central\n", - "\tfound com.google.http-client#google-http-client-appengine;1.42.3 in central\n", - "\tfound com.google.api#gax-httpjson;0.104.5 in central\n", - "\tfound com.google.cloud#google-cloud-core-grpc;2.8.27 in central\n", - "\tfound io.grpc#grpc-core;1.50.2 in central\n", - "\tfound com.google.api#gax;2.19.5 in central\n", - "\tfound com.google.api#gax-grpc;2.19.5 in central\n", - "\tfound com.google.auth#google-auth-library-credentials;1.12.1 in central\n", - "\tfound com.google.auth#google-auth-library-oauth2-http;1.12.1 in central\n", - "\tfound com.google.api#api-common;2.2.2 in central\n", - "\tfound javax.annotation#javax.annotation-api;1.3.2 in central\n", - "\tfound io.opencensus#opencensus-api;0.31.1 in central\n", - "\tfound io.grpc#grpc-context;1.50.2 in 
central\n", - "\tfound com.google.api.grpc#proto-google-iam-v1;1.6.7 in central\n", - "\tfound com.google.protobuf#protobuf-java;3.21.9 in central\n", - "\tfound com.google.protobuf#protobuf-java-util;3.21.9 in central\n", - "\tfound com.google.api.grpc#proto-google-common-protos;2.10.0 in central\n", - "\tfound org.threeten#threetenbp;1.6.4 in central\n", - "\tfound com.google.api.grpc#proto-google-cloud-storage-v2;2.15.0-alpha in central\n", - "\tfound com.google.api.grpc#grpc-google-cloud-storage-v2;2.15.0-alpha in central\n", - "\tfound io.grpc#grpc-protobuf;1.50.2 in central\n", - "\tfound com.google.api.grpc#gapic-google-cloud-storage-v2;2.15.0-alpha in central\n", - "\tfound com.fasterxml.jackson.core#jackson-core;2.14.0 in central\n", - "\tfound com.google.code.findbugs#jsr305;3.0.2 in spark-list\n", - "\tfound io.grpc#grpc-api;1.50.2 in central\n", - "\tfound io.grpc#grpc-auth;1.50.2 in central\n", - "\tfound io.grpc#grpc-stub;1.50.2 in central\n", - "\tfound org.checkerframework#checker-qual;3.27.0 in central\n", - "\tfound com.google.api.grpc#grpc-google-iam-v1;1.6.7 in central\n", - "\tfound io.grpc#grpc-protobuf-lite;1.50.2 in central\n", - "\tfound com.google.android#annotations;4.1.1.4 in local-m2-cache\n", - "\tfound org.codehaus.mojo#animal-sniffer-annotations;1.22 in central\n", - "\tfound io.grpc#grpc-alts;1.50.2 in central\n", - "\tfound io.grpc#grpc-grpclb;1.50.2 in central\n", - "\tfound org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central\n", - "\tfound io.grpc#grpc-netty-shaded;1.50.2 in central\n", - "\tfound io.perfmark#perfmark-api;0.25.0 in local-m2-cache\n", - "\tfound io.grpc#grpc-googleapis;1.50.2 in central\n", - "\tfound io.grpc#grpc-xds;1.50.2 in central\n", - "\tfound io.opencensus#opencensus-proto;0.2.0 in central\n", - "\tfound io.grpc#grpc-services;1.50.2 in central\n", - "\tfound com.google.re2j#re2j;1.6 in central\n", - "\tfound com.navigamez#greex;1.0 in spark-list\n", - "\tfound dk.brics.automaton#automaton;1.11-8 in 
spark-list\n", - "\tfound com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 in central\n", - ":: resolution report :: resolve 2084ms :: artifacts dl 42ms\n", - "\t:: modules in use:\n", - "\tcom.amazonaws#aws-java-sdk-bundle;1.11.828 from central in [default]\n", - "\tcom.fasterxml.jackson.core#jackson-core;2.14.0 from central in [default]\n", - "\tcom.github.universal-automata#liblevenshtein;3.0.0 from spark-list in [default]\n", - "\tcom.google.android#annotations;4.1.1.4 from local-m2-cache in [default]\n", - "\tcom.google.api#api-common;2.2.2 from central in [default]\n", - "\tcom.google.api#gax;2.19.5 from central in [default]\n", - "\tcom.google.api#gax-grpc;2.19.5 from central in [default]\n", - "\tcom.google.api#gax-httpjson;0.104.5 from central in [default]\n", - "\tcom.google.api-client#google-api-client;2.0.1 from central in [default]\n", - "\tcom.google.api.grpc#gapic-google-cloud-storage-v2;2.15.0-alpha from central in [default]\n", - "\tcom.google.api.grpc#grpc-google-cloud-storage-v2;2.15.0-alpha from central in [default]\n", - "\tcom.google.api.grpc#grpc-google-iam-v1;1.6.7 from central in [default]\n", - "\tcom.google.api.grpc#proto-google-cloud-storage-v2;2.15.0-alpha from central in [default]\n", - "\tcom.google.api.grpc#proto-google-common-protos;2.10.0 from central in [default]\n", - "\tcom.google.api.grpc#proto-google-iam-v1;1.6.7 from central in [default]\n", - "\tcom.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 from central in [default]\n", - "\tcom.google.auth#google-auth-library-credentials;1.12.1 from central in [default]\n", - "\tcom.google.auth#google-auth-library-oauth2-http;1.12.1 from central in [default]\n", - "\tcom.google.auto.value#auto-value-annotations;1.10 from central in [default]\n", - "\tcom.google.cloud#google-cloud-core;2.8.27 from central in [default]\n", - "\tcom.google.cloud#google-cloud-core-grpc;2.8.27 from central in [default]\n", - "\tcom.google.cloud#google-cloud-core-http;2.8.27 from central in 
[default]\n", - "\tcom.google.cloud#google-cloud-storage;2.15.0 from central in [default]\n", - "\tcom.google.code.findbugs#jsr305;3.0.2 from spark-list in [default]\n", - "\tcom.google.code.gson#gson;2.10 from central in [default]\n", - "\tcom.google.guava#failureaccess;1.0.1 from local-m2-cache in [default]\n", - "\tcom.google.guava#guava;31.1-jre from central in [default]\n", - "\tcom.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from local-m2-cache in [default]\n", - "\tcom.google.http-client#google-http-client;1.42.3 from central in [default]\n", - "\tcom.google.http-client#google-http-client-apache-v2;1.42.3 from central in [default]\n", - "\tcom.google.http-client#google-http-client-appengine;1.42.3 from central in [default]\n", - "\tcom.google.http-client#google-http-client-gson;1.42.3 from central in [default]\n", - "\tcom.google.http-client#google-http-client-jackson2;1.42.3 from central in [default]\n", - "\tcom.google.j2objc#j2objc-annotations;1.3 from local-m2-cache in [default]\n", - "\tcom.google.oauth-client#google-oauth-client;1.34.1 from central in [default]\n", - "\tcom.google.protobuf#protobuf-java;3.21.9 from central in [default]\n", - "\tcom.google.protobuf#protobuf-java-util;3.21.9 from central in [default]\n", - "\tcom.google.re2j#re2j;1.6 from central in [default]\n", - "\tcom.johnsnowlabs.nlp#spark-nlp_2.12;4.2.6 from central in [default]\n", - "\tcom.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 from central in [default]\n", - "\tcom.navigamez#greex;1.0 from spark-list in [default]\n", - "\tcom.typesafe#config;1.4.2 from spark-list in [default]\n", - "\tdk.brics.automaton#automaton;1.11-8 from spark-list in [default]\n", - "\tio.grpc#grpc-alts;1.50.2 from central in [default]\n", - "\tio.grpc#grpc-api;1.50.2 from central in [default]\n", - "\tio.grpc#grpc-auth;1.50.2 from central in [default]\n", - "\tio.grpc#grpc-context;1.50.2 from central in [default]\n", - "\tio.grpc#grpc-core;1.50.2 from central in 
[default]\n", - "\tio.grpc#grpc-googleapis;1.50.2 from central in [default]\n", - "\tio.grpc#grpc-grpclb;1.50.2 from central in [default]\n", - "\tio.grpc#grpc-netty-shaded;1.50.2 from central in [default]\n", - "\tio.grpc#grpc-protobuf;1.50.2 from central in [default]\n", - "\tio.grpc#grpc-protobuf-lite;1.50.2 from central in [default]\n", - "\tio.grpc#grpc-services;1.50.2 from central in [default]\n", - "\tio.grpc#grpc-stub;1.50.2 from central in [default]\n", - "\tio.grpc#grpc-xds;1.50.2 from central in [default]\n", - "\tio.opencensus#opencensus-api;0.31.1 from central in [default]\n", - "\tio.opencensus#opencensus-contrib-http-util;0.31.1 from central in [default]\n", - "\tio.opencensus#opencensus-proto;0.2.0 from central in [default]\n", - "\tio.perfmark#perfmark-api;0.25.0 from local-m2-cache in [default]\n", - "\tit.unimi.dsi#fastutil;7.0.12 from spark-list in [default]\n", - "\tjavax.annotation#javax.annotation-api;1.3.2 from central in [default]\n", - "\torg.checkerframework#checker-qual;3.27.0 from central in [default]\n", - "\torg.codehaus.mojo#animal-sniffer-annotations;1.22 from central in [default]\n", - "\torg.conscrypt#conscrypt-openjdk-uber;2.5.2 from central in [default]\n", - "\torg.projectlombok#lombok;1.16.8 from spark-list in [default]\n", - "\torg.rocksdb#rocksdbjni;6.29.5 from central in [default]\n", - "\torg.threeten#threetenbp;1.6.4 from central in [default]\n", - "\t:: evicted modules:\n", - "\tcom.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.9] in [default]\n", - "\tcom.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.9] in [default]\n", - "\tcom.google.code.gson#gson;2.3 by [com.google.code.gson#gson;2.10] in [default]\n", - "\t---------------------------------------------------------------------\n", - "\t| | modules || artifacts |\n", - "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", - 
"\t---------------------------------------------------------------------\n", - "\t| default | 71 | 1 | 1 | 3 || 68 | 0 |\n", - "\t---------------------------------------------------------------------\n", - "\n", - ":: problems summary ::\n", - ":::: ERRORS\n", - "\tunknown resolver null\n", - "\n", - "\n", - ":: USE VERBOSE OR DEBUG MESSAGE LEVEL FOR MORE DETAILS\n", - ":: retrieving :: org.apache.spark#spark-submit-parent-074434f8-be75-400c-9b86-3fd89d7cbdf4\n", - "\tconfs: [default]\n", - "\t0 artifacts copied, 68 already retrieved (0kB/19ms)\n", - "22/12/29 13:47:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", - "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", - "Setting default log level to \"WARN\".\n", - "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import sparknlp\n", "# let's start Spark with Spark NLP\n", @@ -219,14 +35,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "JSE7xgQc4gTg", - "outputId": "4a6296be-f211-48b9-816e-55cab2e37426" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.train\n", @@ -235,23 +45,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "VOSCO4hg4jp9", - "outputId": "9a4ef71b-772a-4242-b947-1b6f09468ebb" - }, + "execution_count": null, + "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "22/12/29 13:48:05 WARN TaskSetManager: Stage 0 contains a task of very large size (9058 KiB). 
The maximum recommended task size is 1000 KiB.\n", - "[Stage 0:> (0 + 1) / 1]\r" - ] - }, { "name": "stdout", "output_type": "stream", @@ -266,13 +62,6 @@ "only showing top 3 rows\n", "\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] } ], "source": [ @@ -285,10 +74,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "M_6wrm1X4nQP" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", @@ -303,14 +90,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "YFLQsOby4rPg", - "outputId": "10d4508e-9562-4ee0-cfa2-42ed37a3d0a9" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "word2Vec = Word2VecApproach()\\\n", @@ -350,2073 +131,24 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "ZT4dQu328okt" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "22/12/29 13:48:07 WARN TaskSetManager: Stage 1 contains a task of very large size (9058 KiB). The maximum recommended task size is 1000 KiB.\n", - "22/12/29 13:48:09 WARN TaskSetManager: Stage 3 contains a task of very large size (9058 KiB). The maximum recommended task size is 1000 KiB.\n", - "22/12/29 13:48:10 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS\n", - "22/12/29 13:48:10 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS\n", - "22/12/29 13:48:14 WARN TaskSetManager: Stage 6 contains a task of very large size (9058 KiB). The maximum recommended task size is 1000 KiB.\n", - "22/12/29 13:48:18 WARN TaskSetManager: Stage 7 contains a task of very large size (9058 KiB). 
The maximum recommended task size is 1000 KiB.\n", - "2022-12-29 13:48:27.921143: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2022-12-29 13:48:28.022993: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "AddV2: CPU \n", - "AssignSub: CPU \n", - "RealDiv: CPU \n", - "Shape: CPU \n", - "Unique: CPU \n", - "Cast: CPU \n", - "UnsortedSegmentSum: CPU \n", - "Add: CPU \n", - "GatherV2: CPU \n", - "StridedSlice: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "NoOp: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Sqrt: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "ScatterAdd: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/shape (Const) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/min (Const) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/max (Const) \n", - " 
char_repr_cnn/char_embeddings/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/sub (Sub) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/mul (Mul) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform (Add) \n", - " char_repr_cnn/char_embeddings (VariableV2) /device:GPU:0\n", - " char_repr_cnn/char_embeddings/Assign (Assign) /device:GPU:0\n", - " char_repr_cnn/char_embeddings/read (Identity) /device:GPU:0\n", - " char_repr_cnn/embedding_lookup/axis (Const) /device:GPU:0\n", - " char_repr_cnn/embedding_lookup (GatherV2) /device:GPU:0\n", - " training_1/gradients/char_repr_cnn/embedding_lookup_grad/Shape (Const) /device:GPU:0\n", - " training_1/gradients/char_repr_cnn/embedding_lookup_grad/Cast (Cast) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam (VariableV2) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Assign (Assign) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/read (Identity) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1 (VariableV2) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Unique (Unique) 
/device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Shape (Shape) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice/stack (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice/stack_1 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice/stack_2 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice (StridedSlice) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/UnsortedSegmentSum (UnsortedSegmentSum) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Sqrt (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_1/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_1 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/truediv (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_2/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_2 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_1 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_2 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Assign (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/ScatterAdd (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_3 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_3/x (Const) /device:GPU:0\n", - " 
training_1/Adam/update_char_repr_cnn/char_embeddings/sub_3 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_4 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_5 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Assign_1 (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/ScatterAdd_1 (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Sqrt_1 (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_6 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/add (AddV2) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/truediv_1 (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/AssignSub (AssignSub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/group_deps (NoOp) /device:GPU:0\n", - " save/Assign_2 (Assign) /device:GPU:0\n", - " save/Assign_41 (Assign) /device:GPU:0\n", - " save/Assign_42 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.023168: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "ReadVariableOp: CPU \n", - "AssignVariableOp: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "VarIsInitializedOp: CPU \n", - "Add: CPU \n", - "VarHandleOp: CPU \n", - "RandomUniform: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/shape (Const) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/min (Const) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/max (Const) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/sub (Sub) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/mul (Mul) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform (Add) \n", - " char_repr_cnn/conv1d/kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/conv1d/ExpandDims_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " 
training/char_repr_cnn/conv1d/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/conv1d/kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_1 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_10 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_11 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.023286: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "AssignVariableOp: CPU \n", - "VarIsInitializedOp: CPU \n", - "ReadVariableOp: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_cnn/conv1d/bias/Initializer/zeros (Const) \n", - " char_repr_cnn/conv1d/bias (VarHandleOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/bias/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/bias/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/bias/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/BiasAdd/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/Assign 
(AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/conv1d/bias/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_8 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_9 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.023487: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "AddV2: CPU \n", - "AssignSub: CPU \n", - "RealDiv: CPU \n", - "Shape: CPU \n", - "Unique: CPU \n", - "Cast: CPU \n", - "UnsortedSegmentSum: CPU \n", - "Add: CPU \n", - "GatherV2: CPU \n", - "StridedSlice: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "NoOp: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Sqrt: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "ScatterAdd: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/shape (Const) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/min (Const) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/max (Const) \n", - 
" char_repr_lstm/char_embeddings/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/sub (Sub) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/mul (Mul) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform (Add) \n", - " char_repr_lstm/char_embeddings (VariableV2) /device:GPU:0\n", - " char_repr_lstm/char_embeddings/Assign (Assign) /device:GPU:0\n", - " char_repr_lstm/char_embeddings/read (Identity) /device:GPU:0\n", - " char_repr_lstm/embedding_lookup/axis (Const) /device:GPU:0\n", - " char_repr_lstm/embedding_lookup (GatherV2) /device:GPU:0\n", - " training_1/gradients/char_repr_lstm/embedding_lookup_grad/Shape (Const) /device:GPU:0\n", - " training_1/gradients/char_repr_lstm/embedding_lookup_grad/Cast (Cast) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam (VariableV2) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Assign (Assign) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/read (Identity) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1 (VariableV2) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/read (Identity) /device:GPU:0\n", - " 
training_1/Adam/update_char_repr_lstm/char_embeddings/Unique (Unique) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Shape (Shape) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice/stack (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice/stack_1 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice/stack_2 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice (StridedSlice) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/UnsortedSegmentSum (UnsortedSegmentSum) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Sqrt (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_1/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_1 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/truediv (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_2/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_2 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_1 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_2 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Assign (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/ScatterAdd (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_3 (Mul) /device:GPU:0\n", - " 
training_1/Adam/update_char_repr_lstm/char_embeddings/sub_3/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_3 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_4 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_5 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Assign_1 (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/ScatterAdd_1 (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Sqrt_1 (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_6 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/add (AddV2) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/truediv_1 (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/AssignSub (AssignSub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/group_deps (NoOp) /device:GPU:0\n", - " save/Assign_3 (Assign) /device:GPU:0\n", - " save/Assign_43 (Assign) /device:GPU:0\n", - " save/Assign_44 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.023641: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "ReadVariableOp: CPU \n", - "Add: CPU \n", - "AssignVariableOp: CPU \n", - "Fill: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Enter: CPU \n", - "Sub: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/min (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/max (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/sub (Sub) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform (Add) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/IsInitialized/VarIsInitializedOp 
(VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_6 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_20 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_21 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.023804: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Fill: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "Sign: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "DiagPart: CPU \n", - "Transpose: CPU \n", - "Mul: CPU \n", - "Qr: CPU \n", - "VarIsInitializedOp: CPU \n", - "AssignVariableOp: CPU \n", - "Add: CPU \n", - "RandomStandardNormal: CPU \n", - "Reshape: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/shape (Const) 
\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/mean (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/stddev (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/RandomStandardNormal (RandomStandardNormal) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal (Add) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Qr (Qr) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/DiagPart (DiagPart) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Sign (Sign) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose/perm (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose (Transpose) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Reshape/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Reshape (Reshape) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/mul_1/x (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/mul_1 (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " 
training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_7 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_22 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_23 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.023933: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "AssignVariableOp: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "ConcatV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/zeros (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/ones (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/zeros_1 (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/concat/axis (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/concat (ConcatV2) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split_1/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/forward_lstm/bias/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_5 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_18 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_19 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.024076: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. 
Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "ReadVariableOp: CPU \n", - "Add: CPU \n", - "AssignVariableOp: CPU \n", - "Fill: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Enter: CPU \n", - "Sub: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/min (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/max (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/sub (Sub) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform (Add) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", 
- " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_3 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_14 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_15 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.024227: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Fill: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "Sign: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "DiagPart: CPU \n", - "Transpose: CPU \n", - "Mul: CPU \n", - "Qr: CPU \n", - "VarIsInitializedOp: CPU \n", - "AssignVariableOp: CPU \n", - "Add: CPU \n", - "RandomStandardNormal: CPU \n", - "Reshape: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/mean (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/stddev (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/RandomStandardNormal (RandomStandardNormal) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal (Add) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Qr (Qr) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/DiagPart (DiagPart) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Sign (Sign) \n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose/perm (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose (Transpose) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Reshape/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Reshape (Reshape) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/mul_1/x (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/mul_1 (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " 
training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_4 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_16 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_17 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.024354: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "AssignVariableOp: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "ConcatV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/zeros (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/ones (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/zeros_1 (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/concat/axis (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/concat (ConcatV2) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split_1/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/backward_lstm/bias/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_2 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_12 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_13 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.024737: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. 
Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " W/Initializer/random_uniform/shape (Const) \n", - " W/Initializer/random_uniform/min (Const) \n", - " W/Initializer/random_uniform/max (Const) \n", - " W/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " W/Initializer/random_uniform/sub (Sub) \n", - " W/Initializer/random_uniform/mul (Mul) \n", - " W/Initializer/random_uniform (Add) \n", - " W (VariableV2) /device:GPU:0\n", - " W/Assign (Assign) /device:GPU:0\n", - " W/read (Identity) /device:GPU:0\n", - " training_1/beta1_power/initial_value (Const) /device:GPU:0\n", - " training_1/beta1_power (VariableV2) /device:GPU:0\n", - " training_1/beta1_power/Assign (Assign) /device:GPU:0\n", - " training_1/beta1_power/read (Identity) /device:GPU:0\n", - " training_1/beta2_power/initial_value (Const) /device:GPU:0\n", - " training_1/beta2_power (VariableV2) /device:GPU:0\n", - " training_1/beta2_power/Assign (Assign) /device:GPU:0\n", - " training_1/beta2_power/read (Identity) /device:GPU:0\n", - " training/W/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/W/Adam/Initializer/zeros/Const (Const) 
/device:GPU:0\n", - " training/W/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/W/Adam (VariableV2) /device:GPU:0\n", - " training/W/Adam/Assign (Assign) /device:GPU:0\n", - " training/W/Adam/read (Identity) /device:GPU:0\n", - " training/W/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/W/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/W/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/W/Adam_1 (VariableV2) /device:GPU:0\n", - " training/W/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/W/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_W/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " training_1/Adam/mul (Mul) /device:GPU:0\n", - " training_1/Adam/Assign (Assign) /device:GPU:0\n", - " training_1/Adam/mul_1 (Mul) /device:GPU:0\n", - " training_1/Adam/Assign_1 (Assign) /device:GPU:0\n", - " save/Assign (Assign) /device:GPU:0\n", - " save/Assign_37 (Assign) /device:GPU:0\n", - " save/Assign_38 (Assign) /device:GPU:0\n", - " save/Assign_111 (Assign) /device:GPU:0\n", - " save/Assign_112 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.024857: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " b/Initializer/random_uniform/shape (Const) \n", - " b/Initializer/random_uniform/min (Const) \n", - " b/Initializer/random_uniform/max (Const) \n", - " b/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " b/Initializer/random_uniform/sub (Sub) \n", - " b/Initializer/random_uniform/mul (Mul) \n", - " b/Initializer/random_uniform (Add) \n", - " b (VariableV2) /device:GPU:0\n", - " b/Assign (Assign) /device:GPU:0\n", - " b/read (Identity) /device:GPU:0\n", - " training/b/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/b/Adam (VariableV2) /device:GPU:0\n", - " training/b/Adam/Assign (Assign) /device:GPU:0\n", - " training/b/Adam/read (Identity) /device:GPU:0\n", - " training/b/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/b/Adam_1 (VariableV2) /device:GPU:0\n", - " training/b/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/b/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_b/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_1 (Assign) /device:GPU:0\n", - " save/Assign_39 (Assign) /device:GPU:0\n", - " save/Assign_40 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.024999: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] 
Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_7 (Assign) /device:GPU:0\n", - " save/Assign_51 (Assign) /device:GPU:0\n", - " save/Assign_52 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.025109: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/bias/Initializer/Const (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_6 (Assign) /device:GPU:0\n", - " save/Assign_49 (Assign) /device:GPU:0\n", - " save/Assign_50 (Assign) /device:GPU:0\n", - "\n", - 
"2022-12-29 13:48:28.025226: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/read (Identity) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_9 (Assign) /device:GPU:0\n", - " save/Assign_55 (Assign) /device:GPU:0\n", - " save/Assign_56 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.025343: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_8 (Assign) /device:GPU:0\n", - " save/Assign_53 (Assign) /device:GPU:0\n", - " save/Assign_54 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.025460: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_10 (Assign) /device:GPU:0\n", - " save/Assign_57 (Assign) /device:GPU:0\n", - " save/Assign_58 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.025610: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Initializer/zeros (Fill) 
/device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_12 (Assign) /device:GPU:0\n", - " save/Assign_61 (Assign) /device:GPU:0\n", - " save/Assign_62 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.025719: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/bias/Initializer/Const (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_11 (Assign) /device:GPU:0\n", - " save/Assign_59 (Assign) /device:GPU:0\n", - " save/Assign_60 (Assign) 
/device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.025834: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/read (Identity) 
/device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_14 (Assign) /device:GPU:0\n", - " save/Assign_65 (Assign) /device:GPU:0\n", - " save/Assign_66 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.025951: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_13 (Assign) /device:GPU:0\n", - " save/Assign_63 (Assign) /device:GPU:0\n", - " save/Assign_64 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.026066: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_15 (Assign) /device:GPU:0\n", - " save/Assign_67 (Assign) /device:GPU:0\n", - " save/Assign_68 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.026219: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_17 (Assign) /device:GPU:0\n", - " save/Assign_71 (Assign) /device:GPU:0\n", - " save/Assign_72 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.026327: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/bias/Initializer/Const (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_16 (Assign) /device:GPU:0\n", - " save/Assign_69 (Assign) /device:GPU:0\n", - " save/Assign_70 (Assign) /device:GPU:0\n", - "\n", - 
"2022-12-29 13:48:28.026443: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/read (Identity) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_19 (Assign) /device:GPU:0\n", - " save/Assign_75 (Assign) /device:GPU:0\n", - " save/Assign_76 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.026559: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_18 (Assign) /device:GPU:0\n", - " save/Assign_73 (Assign) /device:GPU:0\n", - " save/Assign_74 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.026676: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_20 (Assign) /device:GPU:0\n", - " save/Assign_77 (Assign) /device:GPU:0\n", - " save/Assign_78 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.026837: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Initializer/zeros (Fill) 
/device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_22 (Assign) /device:GPU:0\n", - " save/Assign_81 (Assign) /device:GPU:0\n", - " save/Assign_82 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.026958: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/bias/Initializer/Const (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_21 (Assign) /device:GPU:0\n", - " save/Assign_79 (Assign) /device:GPU:0\n", - " save/Assign_80 (Assign) 
/device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.027076: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/read (Identity) 
/device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_24 (Assign) /device:GPU:0\n", - " save/Assign_85 (Assign) /device:GPU:0\n", - " save/Assign_86 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.027202: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_23 (Assign) /device:GPU:0\n", - " save/Assign_83 (Assign) /device:GPU:0\n", - " save/Assign_84 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.027397: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_25 (Assign) /device:GPU:0\n", - " save/Assign_87 (Assign) /device:GPU:0\n", - " save/Assign_88 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.027577: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_27 (Assign) /device:GPU:0\n", - " save/Assign_91 (Assign) /device:GPU:0\n", - " save/Assign_92 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.027693: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/bias/Initializer/Const (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_26 (Assign) /device:GPU:0\n", - " save/Assign_89 (Assign) /device:GPU:0\n", - " save/Assign_90 (Assign) /device:GPU:0\n", - "\n", - 
"2022-12-29 13:48:28.027814: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/read (Identity) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_29 (Assign) /device:GPU:0\n", - " save/Assign_95 (Assign) /device:GPU:0\n", - " save/Assign_96 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.027931: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_28 (Assign) /device:GPU:0\n", - " save/Assign_93 (Assign) /device:GPU:0\n", - " save/Assign_94 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.028049: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_30 (Assign) /device:GPU:0\n", - " save/Assign_97 (Assign) /device:GPU:0\n", - " save/Assign_98 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.028204: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Initializer/zeros (Fill) 
/device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_32 (Assign) /device:GPU:0\n", - " save/Assign_101 (Assign) /device:GPU:0\n", - " save/Assign_102 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.028326: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/bias/Initializer/Const (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_31 (Assign) /device:GPU:0\n", - " save/Assign_99 (Assign) /device:GPU:0\n", - " save/Assign_100 (Assign) 
/device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.028451: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/read (Identity) 
/device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_34 (Assign) /device:GPU:0\n", - " save/Assign_105 (Assign) /device:GPU:0\n", - " save/Assign_106 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.028643: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_33 (Assign) /device:GPU:0\n", - " save/Assign_103 (Assign) /device:GPU:0\n", - " save/Assign_104 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.028929: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_35 (Assign) /device:GPU:0\n", - " save/Assign_107 (Assign) /device:GPU:0\n", - " save/Assign_108 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.029239: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/W/Initializer/random_uniform/shape (Const) \n", - " context_repr/W/Initializer/random_uniform/min (Const) \n", - " context_repr/W/Initializer/random_uniform/max (Const) \n", - " context_repr/W/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/W/Initializer/random_uniform/sub (Sub) \n", - " context_repr/W/Initializer/random_uniform/mul (Mul) \n", - " context_repr/W/Initializer/random_uniform (Add) \n", - " context_repr/W (VariableV2) /device:GPU:0\n", - " context_repr/W/Assign (Assign) /device:GPU:0\n", - " context_repr/W/read (Identity) /device:GPU:0\n", - " training/context_repr/W/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/W/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/W/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/W/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/W/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/W/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/W/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/W/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " 
training/context_repr/W/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/W/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/W/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/W/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/W/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_4 (Assign) /device:GPU:0\n", - " save/Assign_45 (Assign) /device:GPU:0\n", - " save/Assign_46 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.029442: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/b/Initializer/random_uniform/shape (Const) \n", - " context_repr/b/Initializer/random_uniform/min (Const) \n", - " context_repr/b/Initializer/random_uniform/max (Const) \n", - " context_repr/b/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/b/Initializer/random_uniform/sub (Sub) \n", - " context_repr/b/Initializer/random_uniform/mul (Mul) \n", - " context_repr/b/Initializer/random_uniform 
(Add) \n", - " context_repr/b (VariableV2) /device:GPU:0\n", - " context_repr/b/Assign (Assign) /device:GPU:0\n", - " context_repr/b/read (Identity) /device:GPU:0\n", - " training/context_repr/b/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/b/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/b/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/b/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/b/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/b/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/b/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/b/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/b/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_5 (Assign) /device:GPU:0\n", - " save/Assign_47 (Assign) /device:GPU:0\n", - " save/Assign_48 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:48:28.029649: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Switch: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " inference/transition_params/Initializer/random_uniform/shape (Const) \n", - " inference/transition_params/Initializer/random_uniform/min (Const) \n", - " inference/transition_params/Initializer/random_uniform/max (Const) \n", - " inference/transition_params/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " inference/transition_params/Initializer/random_uniform/sub (Sub) \n", - " inference/transition_params/Initializer/random_uniform/mul (Mul) \n", - " inference/transition_params/Initializer/random_uniform (Add) \n", - " inference/transition_params (VariableV2) /device:GPU:0\n", - " inference/transition_params/Assign (Assign) /device:GPU:0\n", - " inference/transition_params/read (Identity) /device:GPU:0\n", - " inference/cond/Reshape_4/Switch (Switch) /device:GPU:0\n", - " inference/cond_1/ExpandDims/Switch (Switch) /device:GPU:0\n", - " inference/cond_2/ExpandDims_1/Switch (Switch) /device:GPU:0\n", - " training/inference/transition_params/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/inference/transition_params/Adam (VariableV2) /device:GPU:0\n", - " training/inference/transition_params/Adam/Assign (Assign) /device:GPU:0\n", - " training/inference/transition_params/Adam/read (Identity) 
/device:GPU:0\n", - " training/inference/transition_params/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/inference/transition_params/Adam_1 (VariableV2) /device:GPU:0\n", - " training/inference/transition_params/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/inference/transition_params/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_inference/transition_params/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_36 (Assign) /device:GPU:0\n", - " save/Assign_109 (Assign) /device:GPU:0\n", - " save/Assign_110 (Assign) /device:GPU:0\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training started - total epochs: 1 - lr: 0.003 - batch size: 8 - labels: 9 - chars: 83 - training examples: 11265\n", - "Epoch 1/1 started, lr: 0.003, dataset size: 11265\n", - "Epoch 1/1 - 171.09s - loss: 3970.4763 - batches: 1411\n", - "Quality on validation dataset (20.0%), validation examples = 2253\n", - "time to finish evaluation: 11.93s\n", - "label\t tp\t fp\t fn\t prec\t rec\t f1\n", - "B-LOC\t 1175\t 263\t 217\t 0.8171071\t 0.8441092\t 0.8303887\n", - "I-ORG\t 458\t 141\t 207\t 0.76460767\t 0.6887218\t 0.7246835\n", - "I-MISC\t 104\t 45\t 136\t 0.6979866\t 0.43333334\t 0.5347043\n", - "I-LOC\t 132\t 29\t 101\t 0.8198758\t 0.5665236\t 0.6700508\n", - "I-PER\t 867\t 148\t 32\t 0.8541872\t 0.9644049\t 0.90595615\n", - "B-MISC\t 480\t 94\t 205\t 0.83623695\t 0.7007299\t 0.76250994\n", - "B-ORG\t 868\t 236\t 339\t 0.7862319\t 0.7191384\t 0.75119\n", - "B-PER\t 1189\t 321\t 138\t 0.78741723\t 0.89600605\t 0.8382094\n", - "tp: 5273 fp: 1277 fn: 1375 labels: 8\n", - "Macro-average\t prec: 0.7954563, rec: 0.72662085, f1: 0.7594821\n", - "Micro-average\t prec: 0.80503815, rec: 0.79317087, f1: 0.79906046\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "ner_model = ner_pipeline.fit(training_data)" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - 
"colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_vTRFsKV92Yz", - "outputId": "54af004f-47dd-4038-b0b1-c1dd2c09228b" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "total 8\n", - "-rw-r--r-- 1 maziyar staff 1016 Dec 29 13:51 NerDLApproach_56719b1fca3b.log\n" + "total 4\n", + "-rw-r--r-- 1 root root 1017 20. Feb 18:09 NerDLApproach_00802da54a15.log\n" ] } ], @@ -2426,41 +158,35 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qzYCO5j3EkAu", - "outputId": "2f225170-73f1-41d9-de9a-2353e3d8610a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Name of the selected graph: ner-dl/blstm_10_100_128_120.pb\n", - "Training started - total epochs: 1 - lr: 0.003 - batch size: 8 - labels: 9 - chars: 83 - training examples: 11265\n", + "Training started - total epochs: 1 - lr: 0.003 - batch size: 8 - labels: 9 - chars: 84 - training examples: 11239\n", "\n", "\n", - "Epoch 1/1 started, lr: 0.003, dataset size: 11265\n", + "Epoch 1/1 started, lr: 0.003, dataset size: 11239\n", "\n", "\n", - "Epoch 1/1 - 171.09s - loss: 3970.4763 - batches: 1411\n", - "Quality on validation dataset (20.0%), validation examples = 2253\n", - "time to finish evaluation: 11.93s\n", + "Epoch 1/1 - 48.29s - loss: 4617.783 - batches: 1407\n", + "Quality on validation dataset (20.0%), validation examples = 2247\n", + "time to finish evaluation: 3.36s\n", "label\t tp\t fp\t fn\t prec\t rec\t f1\n", - "B-LOC\t 1175\t 263\t 217\t 0.8171071\t 0.8441092\t 0.8303887\n", - "I-ORG\t 458\t 141\t 207\t 0.76460767\t 0.6887218\t 0.7246835\n", - "I-MISC\t 104\t 45\t 136\t 0.6979866\t 0.43333334\t 0.5347043\n", - "I-LOC\t 132\t 29\t 101\t 0.8198758\t 0.5665236\t 0.6700508\n", - "I-PER\t 867\t 148\t 32\t 0.8541872\t 0.9644049\t 0.90595615\n", - "B-MISC\t 480\t 94\t 205\t 
0.83623695\t 0.7007299\t 0.76250994\n", - "B-ORG\t 868\t 236\t 339\t 0.7862319\t 0.7191384\t 0.75119\n", - "B-PER\t 1189\t 321\t 138\t 0.78741723\t 0.89600605\t 0.8382094\n", - "tp: 5273 fp: 1277 fn: 1375 labels: 8\n", - "Macro-average\t prec: 0.7954563, rec: 0.72662085, f1: 0.7594821\n", - "Micro-average\t prec: 0.80503815, rec: 0.79317087, f1: 0.79906046\n" + "B-LOC\t 1029\t 145\t 376\t 0.87649065\t 0.7323843\t 0.7979837\n", + "I-ORG\t 381\t 93\t 363\t 0.8037975\t 0.51209676\t 0.6256157\n", + "I-MISC\t 124\t 121\t 118\t 0.50612247\t 0.5123967\t 0.50924027\n", + "I-LOC\t 138\t 53\t 77\t 0.7225131\t 0.6418605\t 0.67980295\n", + "I-PER\t 898\t 237\t 21\t 0.79118943\t 0.97714907\t 0.8743915\n", + "B-MISC\t 528\t 192\t 190\t 0.73333335\t 0.73537606\t 0.73435324\n", + "B-ORG\t 771\t 139\t 507\t 0.8472527\t 0.6032864\t 0.7047532\n", + "B-PER\t 1200\t 630\t 111\t 0.6557377\t 0.9153318\t 0.7640879\n", + "tp: 5069 fp: 1610 fn: 1763 labels: 8\n", + "Macro-average\t prec: 0.74205464, rec: 0.7037352, f1: 0.7223871\n", + "Micro-average\t prec: 0.75894594, rec: 0.7419497, f1: 0.75035155\n" ] } ], @@ -2470,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -2487,13 +213,6 @@ "only showing top 3 rows\n", "\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "22/12/29 13:51:33 WARN TaskSetManager: Stage 8 contains a task of very large size (1773 KiB). 
The maximum recommended task size is 1000 KiB.\n" - ] } ], "source": [ @@ -2506,10 +225,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "NZJuax-nFHTQ" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "predictions = ner_model.transform(test_data)" @@ -2517,2045 +234,28 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "yyjerNbgFZWg", - "outputId": "fc7b650b-5e35-4f90-f40b-2deadfd0e049" - }, + "execution_count": null, + "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "22/12/29 13:51:36 WARN TaskSetManager: Stage 9 contains a task of very large size (1773 KiB). The maximum recommended task size is 1000 KiB.\n", - "2022-12-29 13:51:36.821957: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "AddV2: CPU \n", - "AssignSub: CPU \n", - "RealDiv: CPU \n", - "Shape: CPU \n", - "Unique: CPU \n", - "Cast: CPU \n", - "UnsortedSegmentSum: CPU \n", - "Add: CPU \n", - "GatherV2: CPU \n", - "StridedSlice: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "NoOp: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Sqrt: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "ScatterAdd: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/shape (Const) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/min (Const) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/max (Const) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/sub (Sub) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/mul (Mul) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform (Add) \n", - " char_repr_cnn/char_embeddings (VariableV2) /device:GPU:0\n", - " char_repr_cnn/char_embeddings/Assign (Assign) /device:GPU:0\n", - " char_repr_cnn/char_embeddings/read (Identity) /device:GPU:0\n", - " char_repr_cnn/embedding_lookup/axis (Const) /device:GPU:0\n", - " char_repr_cnn/embedding_lookup (GatherV2) /device:GPU:0\n", - " training_1/gradients/char_repr_cnn/embedding_lookup_grad/Shape (Const) /device:GPU:0\n", - " 
training_1/gradients/char_repr_cnn/embedding_lookup_grad/Cast (Cast) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam (VariableV2) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Assign (Assign) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/read (Identity) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1 (VariableV2) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Unique (Unique) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Shape (Shape) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice/stack (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice/stack_1 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice/stack_2 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice (StridedSlice) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/UnsortedSegmentSum (UnsortedSegmentSum) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub/x (Const) /device:GPU:0\n", - " 
training_1/Adam/update_char_repr_cnn/char_embeddings/sub (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Sqrt (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_1/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_1 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/truediv (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_2/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_2 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_1 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_2 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Assign (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/ScatterAdd (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_3 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_3/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_3 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_4 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_5 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Assign_1 (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/ScatterAdd_1 (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Sqrt_1 (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_6 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/add (AddV2) /device:GPU:0\n", - " 
training_1/Adam/update_char_repr_cnn/char_embeddings/truediv_1 (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/AssignSub (AssignSub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/group_deps (NoOp) /device:GPU:0\n", - " save/Assign_2 (Assign) /device:GPU:0\n", - " save/Assign_41 (Assign) /device:GPU:0\n", - " save/Assign_42 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.822162: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "ReadVariableOp: CPU \n", - "AssignVariableOp: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "VarIsInitializedOp: CPU \n", - "Add: CPU \n", - "VarHandleOp: CPU \n", - "RandomUniform: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/shape (Const) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/min (Const) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/max (Const) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/sub (Sub) \n", - " 
char_repr_cnn/conv1d/kernel/Initializer/random_uniform/mul (Mul) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform (Add) \n", - " char_repr_cnn/conv1d/kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/conv1d/ExpandDims_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " 
training_1/Adam/update_char_repr_cnn/conv1d/kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_1 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_10 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_11 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.822281: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "AssignVariableOp: CPU \n", - "VarIsInitializedOp: CPU \n", - "ReadVariableOp: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_cnn/conv1d/bias/Initializer/zeros (Const) \n", - " char_repr_cnn/conv1d/bias (VarHandleOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/bias/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/bias/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/bias/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/BiasAdd/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam (VarHandleOp) /device:GPU:0\n", - " 
training/char_repr_cnn/conv1d/bias/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/conv1d/bias/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_8 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_9 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.822654: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "AddV2: CPU \n", - "AssignSub: CPU \n", - "RealDiv: CPU \n", - "Shape: CPU \n", - "Unique: CPU \n", - "Cast: CPU \n", - "UnsortedSegmentSum: CPU \n", - "Add: CPU \n", - "GatherV2: CPU \n", - "StridedSlice: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "NoOp: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Sqrt: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "ScatterAdd: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/shape (Const) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/min (Const) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/max (Const) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/sub (Sub) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/mul (Mul) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform (Add) \n", - " char_repr_lstm/char_embeddings (VariableV2) /device:GPU:0\n", - " char_repr_lstm/char_embeddings/Assign (Assign) /device:GPU:0\n", - " char_repr_lstm/char_embeddings/read (Identity) /device:GPU:0\n", - " char_repr_lstm/embedding_lookup/axis (Const) /device:GPU:0\n", - " char_repr_lstm/embedding_lookup (GatherV2) /device:GPU:0\n", - " training_1/gradients/char_repr_lstm/embedding_lookup_grad/Shape (Const) /device:GPU:0\n", - " 
training_1/gradients/char_repr_lstm/embedding_lookup_grad/Cast (Cast) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam (VariableV2) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Assign (Assign) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/read (Identity) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1 (VariableV2) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Unique (Unique) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Shape (Shape) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice/stack (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice/stack_1 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice/stack_2 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice (StridedSlice) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/UnsortedSegmentSum (UnsortedSegmentSum) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub/x (Const) 
/device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Sqrt (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_1/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_1 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/truediv (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_2/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_2 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_1 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_2 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Assign (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/ScatterAdd (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_3 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_3/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_3 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_4 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_5 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Assign_1 (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/ScatterAdd_1 (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Sqrt_1 (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_6 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/add (AddV2) 
/device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/truediv_1 (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/AssignSub (AssignSub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/group_deps (NoOp) /device:GPU:0\n", - " save/Assign_3 (Assign) /device:GPU:0\n", - " save/Assign_43 (Assign) /device:GPU:0\n", - " save/Assign_44 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.822901: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "ReadVariableOp: CPU \n", - "Add: CPU \n", - "AssignVariableOp: CPU \n", - "Fill: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Enter: CPU \n", - "Sub: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/min (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/max (Const) \n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/sub (Sub) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform (Add) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Assign (AssignVariableOp) 
/device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_6 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_20 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_21 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.823085: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Fill: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "Sign: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "DiagPart: CPU \n", - "Transpose: CPU \n", - "Mul: CPU \n", - "Qr: CPU \n", - "VarIsInitializedOp: CPU \n", - "AssignVariableOp: CPU \n", - "Add: CPU \n", - "RandomStandardNormal: CPU \n", - "Reshape: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/mean (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/stddev (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/RandomStandardNormal (RandomStandardNormal) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal (Add) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Qr (Qr) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/DiagPart (DiagPart) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Sign (Sign) \n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose/perm (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose (Transpose) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Reshape/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Reshape (Reshape) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/mul_1/x (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/mul_1 (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/IsInitialized/VarIsInitializedOp 
(VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_7 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_22 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_23 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.823214: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "AssignVariableOp: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "ConcatV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/zeros (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/ones (Const) \n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/zeros_1 (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/concat/axis (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/concat (ConcatV2) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split_1/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1 (VarHandleOp) /device:GPU:0\n", - " 
training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/forward_lstm/bias/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_5 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_18 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_19 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.823348: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "ReadVariableOp: CPU \n", - "Add: CPU \n", - "AssignVariableOp: CPU \n", - "Fill: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Enter: CPU \n", - "Sub: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/min (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/max (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/sub (Sub) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform (Add) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " 
training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_3 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_14 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_15 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.823508: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Fill: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "Sign: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "DiagPart: CPU \n", - "Transpose: CPU \n", - "Mul: CPU \n", - "Qr: CPU \n", - "VarIsInitializedOp: CPU \n", - "AssignVariableOp: CPU \n", - "Add: CPU \n", - "RandomStandardNormal: CPU \n", - "Reshape: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/mean (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/stddev (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/RandomStandardNormal (RandomStandardNormal) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal (Add) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Qr (Qr) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/DiagPart (DiagPart) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Sign (Sign) \n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose/perm (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose (Transpose) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Reshape/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Reshape (Reshape) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/mul_1/x (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/mul_1 (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " 
training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_4 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_16 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_17 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.823633: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "AssignVariableOp: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "ConcatV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/zeros (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/ones (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/zeros_1 (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/concat/axis (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/concat (ConcatV2) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split_1/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/backward_lstm/bias/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_2 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_12 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_13 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.824040: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. 
Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " W/Initializer/random_uniform/shape (Const) \n", - " W/Initializer/random_uniform/min (Const) \n", - " W/Initializer/random_uniform/max (Const) \n", - " W/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " W/Initializer/random_uniform/sub (Sub) \n", - " W/Initializer/random_uniform/mul (Mul) \n", - " W/Initializer/random_uniform (Add) \n", - " W (VariableV2) /device:GPU:0\n", - " W/Assign (Assign) /device:GPU:0\n", - " W/read (Identity) /device:GPU:0\n", - " training_1/beta1_power/initial_value (Const) /device:GPU:0\n", - " training_1/beta1_power (VariableV2) /device:GPU:0\n", - " training_1/beta1_power/Assign (Assign) /device:GPU:0\n", - " training_1/beta1_power/read (Identity) /device:GPU:0\n", - " training_1/beta2_power/initial_value (Const) /device:GPU:0\n", - " training_1/beta2_power (VariableV2) /device:GPU:0\n", - " training_1/beta2_power/Assign (Assign) /device:GPU:0\n", - " training_1/beta2_power/read (Identity) /device:GPU:0\n", - " training/W/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/W/Adam/Initializer/zeros/Const (Const) 
/device:GPU:0\n", - " training/W/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/W/Adam (VariableV2) /device:GPU:0\n", - " training/W/Adam/Assign (Assign) /device:GPU:0\n", - " training/W/Adam/read (Identity) /device:GPU:0\n", - " training/W/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/W/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/W/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/W/Adam_1 (VariableV2) /device:GPU:0\n", - " training/W/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/W/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_W/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " training_1/Adam/mul (Mul) /device:GPU:0\n", - " training_1/Adam/Assign (Assign) /device:GPU:0\n", - " training_1/Adam/mul_1 (Mul) /device:GPU:0\n", - " training_1/Adam/Assign_1 (Assign) /device:GPU:0\n", - " save/Assign (Assign) /device:GPU:0\n", - " save/Assign_37 (Assign) /device:GPU:0\n", - " save/Assign_38 (Assign) /device:GPU:0\n", - " save/Assign_111 (Assign) /device:GPU:0\n", - " save/Assign_112 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.824154: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " b/Initializer/random_uniform/shape (Const) \n", - " b/Initializer/random_uniform/min (Const) \n", - " b/Initializer/random_uniform/max (Const) \n", - " b/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " b/Initializer/random_uniform/sub (Sub) \n", - " b/Initializer/random_uniform/mul (Mul) \n", - " b/Initializer/random_uniform (Add) \n", - " b (VariableV2) /device:GPU:0\n", - " b/Assign (Assign) /device:GPU:0\n", - " b/read (Identity) /device:GPU:0\n", - " training/b/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/b/Adam (VariableV2) /device:GPU:0\n", - " training/b/Adam/Assign (Assign) /device:GPU:0\n", - " training/b/Adam/read (Identity) /device:GPU:0\n", - " training/b/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/b/Adam_1 (VariableV2) /device:GPU:0\n", - " training/b/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/b/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_b/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_1 (Assign) /device:GPU:0\n", - " save/Assign_39 (Assign) /device:GPU:0\n", - " save/Assign_40 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.824288: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] 
Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_7 (Assign) /device:GPU:0\n", - " save/Assign_51 (Assign) /device:GPU:0\n", - " save/Assign_52 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.824390: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/bias/Initializer/Const (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_6 (Assign) /device:GPU:0\n", - " save/Assign_49 (Assign) /device:GPU:0\n", - " save/Assign_50 (Assign) /device:GPU:0\n", - "\n", - 
"2022-12-29 13:51:36.824498: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/read (Identity) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_9 (Assign) /device:GPU:0\n", - " save/Assign_55 (Assign) /device:GPU:0\n", - " save/Assign_56 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.824607: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_8 (Assign) /device:GPU:0\n", - " save/Assign_53 (Assign) /device:GPU:0\n", - " save/Assign_54 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.824715: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_10 (Assign) /device:GPU:0\n", - " save/Assign_57 (Assign) /device:GPU:0\n", - " save/Assign_58 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.824861: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Initializer/zeros (Fill) 
/device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_12 (Assign) /device:GPU:0\n", - " save/Assign_61 (Assign) /device:GPU:0\n", - " save/Assign_62 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.824963: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/bias/Initializer/Const (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_11 (Assign) /device:GPU:0\n", - " save/Assign_59 (Assign) /device:GPU:0\n", - " save/Assign_60 (Assign) 
/device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.825072: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/read (Identity) 
/device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_14 (Assign) /device:GPU:0\n", - " save/Assign_65 (Assign) /device:GPU:0\n", - " save/Assign_66 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.825180: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_13 (Assign) /device:GPU:0\n", - " save/Assign_63 (Assign) /device:GPU:0\n", - " save/Assign_64 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.825287: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_15 (Assign) /device:GPU:0\n", - " save/Assign_67 (Assign) /device:GPU:0\n", - " save/Assign_68 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.825430: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_17 (Assign) /device:GPU:0\n", - " save/Assign_71 (Assign) /device:GPU:0\n", - " save/Assign_72 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.825531: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/bias/Initializer/Const (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_16 (Assign) /device:GPU:0\n", - " save/Assign_69 (Assign) /device:GPU:0\n", - " save/Assign_70 (Assign) /device:GPU:0\n", - "\n", - 
"2022-12-29 13:51:36.825640: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/read (Identity) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_19 (Assign) /device:GPU:0\n", - " save/Assign_75 (Assign) /device:GPU:0\n", - " save/Assign_76 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.825750: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_18 (Assign) /device:GPU:0\n", - " save/Assign_73 (Assign) /device:GPU:0\n", - " save/Assign_74 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.825856: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_20 (Assign) /device:GPU:0\n", - " save/Assign_77 (Assign) /device:GPU:0\n", - " save/Assign_78 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.825999: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Initializer/zeros (Fill) 
/device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_22 (Assign) /device:GPU:0\n", - " save/Assign_81 (Assign) /device:GPU:0\n", - " save/Assign_82 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.826099: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/bias/Initializer/Const (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_21 (Assign) /device:GPU:0\n", - " save/Assign_79 (Assign) /device:GPU:0\n", - " save/Assign_80 (Assign) 
/device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.826206: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/read (Identity) 
/device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_24 (Assign) /device:GPU:0\n", - " save/Assign_85 (Assign) /device:GPU:0\n", - " save/Assign_86 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.826315: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_23 (Assign) /device:GPU:0\n", - " save/Assign_83 (Assign) /device:GPU:0\n", - " save/Assign_84 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.826420: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_25 (Assign) /device:GPU:0\n", - " save/Assign_87 (Assign) /device:GPU:0\n", - " save/Assign_88 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.826564: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_27 (Assign) /device:GPU:0\n", - " save/Assign_91 (Assign) /device:GPU:0\n", - " save/Assign_92 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.826665: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/bias/Initializer/Const (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_26 (Assign) /device:GPU:0\n", - " save/Assign_89 (Assign) /device:GPU:0\n", - " save/Assign_90 (Assign) /device:GPU:0\n", - "\n", - 
"2022-12-29 13:51:36.826772: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/read (Identity) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_29 (Assign) /device:GPU:0\n", - " save/Assign_95 (Assign) /device:GPU:0\n", - " save/Assign_96 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.826903: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_28 (Assign) /device:GPU:0\n", - " save/Assign_93 (Assign) /device:GPU:0\n", - " save/Assign_94 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.827035: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_30 (Assign) /device:GPU:0\n", - " save/Assign_97 (Assign) /device:GPU:0\n", - " save/Assign_98 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.827184: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Initializer/zeros (Fill) 
/device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_32 (Assign) /device:GPU:0\n", - " save/Assign_101 (Assign) /device:GPU:0\n", - " save/Assign_102 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.827287: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/bias/Initializer/Const (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_31 (Assign) /device:GPU:0\n", - " save/Assign_99 (Assign) /device:GPU:0\n", - " save/Assign_100 (Assign) 
/device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.827394: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/read (Identity) 
/device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_34 (Assign) /device:GPU:0\n", - " save/Assign_105 (Assign) /device:GPU:0\n", - " save/Assign_106 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.827499: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_33 (Assign) /device:GPU:0\n", - " save/Assign_103 (Assign) /device:GPU:0\n", - " save/Assign_104 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.827607: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_35 (Assign) /device:GPU:0\n", - " save/Assign_107 (Assign) /device:GPU:0\n", - " save/Assign_108 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.827779: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/W/Initializer/random_uniform/shape (Const) \n", - " context_repr/W/Initializer/random_uniform/min (Const) \n", - " context_repr/W/Initializer/random_uniform/max (Const) \n", - " context_repr/W/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/W/Initializer/random_uniform/sub (Sub) \n", - " context_repr/W/Initializer/random_uniform/mul (Mul) \n", - " context_repr/W/Initializer/random_uniform (Add) \n", - " context_repr/W (VariableV2) /device:GPU:0\n", - " context_repr/W/Assign (Assign) /device:GPU:0\n", - " context_repr/W/read (Identity) /device:GPU:0\n", - " training/context_repr/W/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/W/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/W/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/W/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/W/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/W/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/W/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/W/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " 
training/context_repr/W/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/W/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/W/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/W/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/W/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_4 (Assign) /device:GPU:0\n", - " save/Assign_45 (Assign) /device:GPU:0\n", - " save/Assign_46 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.827888: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/b/Initializer/random_uniform/shape (Const) \n", - " context_repr/b/Initializer/random_uniform/min (Const) \n", - " context_repr/b/Initializer/random_uniform/max (Const) \n", - " context_repr/b/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/b/Initializer/random_uniform/sub (Sub) \n", - " context_repr/b/Initializer/random_uniform/mul (Mul) \n", - " context_repr/b/Initializer/random_uniform 
(Add) \n", - " context_repr/b (VariableV2) /device:GPU:0\n", - " context_repr/b/Assign (Assign) /device:GPU:0\n", - " context_repr/b/read (Identity) /device:GPU:0\n", - " training/context_repr/b/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/b/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/b/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/b/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/b/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/b/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/b/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/b/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/b/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_5 (Assign) /device:GPU:0\n", - " save/Assign_47 (Assign) /device:GPU:0\n", - " save/Assign_48 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:36.828009: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Switch: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " inference/transition_params/Initializer/random_uniform/shape (Const) \n", - " inference/transition_params/Initializer/random_uniform/min (Const) \n", - " inference/transition_params/Initializer/random_uniform/max (Const) \n", - " inference/transition_params/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " inference/transition_params/Initializer/random_uniform/sub (Sub) \n", - " inference/transition_params/Initializer/random_uniform/mul (Mul) \n", - " inference/transition_params/Initializer/random_uniform (Add) \n", - " inference/transition_params (VariableV2) /device:GPU:0\n", - " inference/transition_params/Assign (Assign) /device:GPU:0\n", - " inference/transition_params/read (Identity) /device:GPU:0\n", - " inference/cond/Reshape_4/Switch (Switch) /device:GPU:0\n", - " inference/cond_1/ExpandDims/Switch (Switch) /device:GPU:0\n", - " inference/cond_2/ExpandDims_1/Switch (Switch) /device:GPU:0\n", - " training/inference/transition_params/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/inference/transition_params/Adam (VariableV2) /device:GPU:0\n", - " training/inference/transition_params/Adam/Assign (Assign) /device:GPU:0\n", - " training/inference/transition_params/Adam/read (Identity) 
/device:GPU:0\n", - " training/inference/transition_params/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/inference/transition_params/Adam_1 (VariableV2) /device:GPU:0\n", - " training/inference/transition_params/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/inference/transition_params/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_inference/transition_params/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_36 (Assign) /device:GPU:0\n", - " save/Assign_109 (Assign) /device:GPU:0\n", - " save/Assign_110 (Assign) /device:GPU:0\n", - "\n", - " \r" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", - " B-LOC 0.82 0.85 0.83 1837\n", - " B-MISC 0.87 0.69 0.77 922\n", - " B-ORG 0.80 0.68 0.74 1341\n", - " B-PER 0.78 0.90 0.84 1842\n", - " I-LOC 0.76 0.56 0.65 257\n", - " I-MISC 0.75 0.53 0.62 346\n", - " I-ORG 0.79 0.58 0.67 751\n", - " I-PER 0.86 0.96 0.91 1307\n", + " B-LOC 0.88 0.71 0.79 1837\n", + " B-MISC 0.77 0.74 0.76 922\n", + " B-ORG 0.85 0.58 0.69 1341\n", + " B-PER 0.66 0.93 0.78 1842\n", + " I-LOC 0.70 0.53 0.60 257\n", + " I-MISC 0.62 0.59 0.60 346\n", + " I-ORG 0.83 0.42 0.56 751\n", + " I-PER 0.80 0.96 0.87 1307\n", " O 0.99 0.99 0.99 42759\n", "\n", - " accuracy 0.96 51362\n", - " macro avg 0.82 0.75 0.78 51362\n", - "weighted avg 0.96 0.96 0.96 51362\n", + " accuracy 0.95 51362\n", + " macro avg 0.79 0.72 0.74 51362\n", + "weighted avg 0.95 0.95 0.95 51362\n", "\n" ] } @@ -4578,9 +278,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "ZT6UH3NJ5heL" - }, + "metadata": {}, "source": [ "## Save and Restore\n" ] @@ -4588,9 +286,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "tnufdTmL5oyQ" - }, + "metadata": {}, "source": [ "### Annotator Models\n", "Let's say you would like to only save the trained annotators inside your pipeline so you can load them inside another custom Pipeline" @@ -4598,22 
+294,16 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_dzzYJYQ5pJa", - "outputId": "83da0eae-3160-4b5f-983b-3101ff277ca3" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Word2VecModel_039e5d98c036, NerDLModel_b873e071b194]" + "[Word2VecModel_2ebfbb8d7c3b, NerDLModel_c57cffac70ba]" ] }, - "execution_count": 13, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -4625,21 +315,15 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "a0cEyPk298cd", - "outputId": "518b3aa8-070d-4cf8-e275-11eaa246dbb2" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "NerDLModel_b873e071b194\n", - "Word2VecModel_039e5d98c036\n" + "NerDLModel_c57cffac70ba\n", + "Word2VecModel_2ebfbb8d7c3b\n" ] } ], @@ -4650,10 +334,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "jM16Elha-Mj3" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# let's save our NerDLModel - let's mention it was trained by word2vec_conll03 as well\n", @@ -4662,10 +344,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "AkFvbdQA-X1T" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# and here is our trained Word2VecModel\n", @@ -4682,2019 +362,9 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-12-29 13:51:50.782856: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. 
Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "AddV2: CPU \n", - "AssignSub: CPU \n", - "RealDiv: CPU \n", - "Shape: CPU \n", - "Unique: CPU \n", - "Cast: CPU \n", - "UnsortedSegmentSum: CPU \n", - "Add: CPU \n", - "GatherV2: CPU \n", - "StridedSlice: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "NoOp: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Sqrt: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "ScatterAdd: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/shape (Const) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/min (Const) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/max (Const) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/sub (Sub) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/mul (Mul) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform (Add) \n", - " char_repr_cnn/char_embeddings (VariableV2) /device:GPU:0\n", - " char_repr_cnn/char_embeddings/Assign (Assign) /device:GPU:0\n", - " char_repr_cnn/char_embeddings/read (Identity) /device:GPU:0\n", - " char_repr_cnn/embedding_lookup/axis (Const) /device:GPU:0\n", - " char_repr_cnn/embedding_lookup (GatherV2) 
/device:GPU:0\n", - " training_1/gradients/char_repr_cnn/embedding_lookup_grad/Shape (Const) /device:GPU:0\n", - " training_1/gradients/char_repr_cnn/embedding_lookup_grad/Cast (Cast) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam (VariableV2) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Assign (Assign) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/read (Identity) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1 (VariableV2) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Unique (Unique) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Shape (Shape) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice/stack (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice/stack_1 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice/stack_2 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice (StridedSlice) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/UnsortedSegmentSum (UnsortedSegmentSum) 
/device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Sqrt (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_1/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_1 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/truediv (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_2/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_2 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_1 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_2 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Assign (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/ScatterAdd (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_3 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_3/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_3 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_4 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_5 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Assign_1 (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/ScatterAdd_1 (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Sqrt_1 (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_6 (Mul) /device:GPU:0\n", - " 
training_1/Adam/update_char_repr_cnn/char_embeddings/add (AddV2) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/truediv_1 (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/AssignSub (AssignSub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/group_deps (NoOp) /device:GPU:0\n", - " save/Assign_2 (Assign) /device:GPU:0\n", - " save/Assign_41 (Assign) /device:GPU:0\n", - " save/Assign_42 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.783046: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "ReadVariableOp: CPU \n", - "AssignVariableOp: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "VarIsInitializedOp: CPU \n", - "Add: CPU \n", - "VarHandleOp: CPU \n", - "RandomUniform: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/shape (Const) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/min (Const) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/max (Const) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " 
char_repr_cnn/conv1d/kernel/Initializer/random_uniform/sub (Sub) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/mul (Mul) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform (Add) \n", - " char_repr_cnn/conv1d/kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/conv1d/ExpandDims_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) 
/device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/conv1d/kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_1 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_10 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_11 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.783171: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "AssignVariableOp: CPU \n", - "VarIsInitializedOp: CPU \n", - "ReadVariableOp: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_cnn/conv1d/bias/Initializer/zeros (Const) \n", - " char_repr_cnn/conv1d/bias (VarHandleOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/bias/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/bias/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/bias/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/BiasAdd/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam (VarHandleOp) /device:GPU:0\n", 
- " training/char_repr_cnn/conv1d/bias/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/conv1d/bias/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_8 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_9 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.783405: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "AddV2: CPU \n", - "AssignSub: CPU \n", - "RealDiv: CPU \n", - "Shape: CPU \n", - "Unique: CPU \n", - "Cast: CPU \n", - "UnsortedSegmentSum: CPU \n", - "Add: CPU \n", - "GatherV2: CPU \n", - "StridedSlice: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "NoOp: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Sqrt: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "ScatterAdd: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/shape (Const) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/min (Const) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/max (Const) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/sub (Sub) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/mul (Mul) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform (Add) \n", - " char_repr_lstm/char_embeddings (VariableV2) /device:GPU:0\n", - " char_repr_lstm/char_embeddings/Assign (Assign) /device:GPU:0\n", - " char_repr_lstm/char_embeddings/read (Identity) /device:GPU:0\n", - " char_repr_lstm/embedding_lookup/axis (Const) /device:GPU:0\n", - " char_repr_lstm/embedding_lookup (GatherV2) /device:GPU:0\n", - " training_1/gradients/char_repr_lstm/embedding_lookup_grad/Shape (Const) /device:GPU:0\n", - " 
training_1/gradients/char_repr_lstm/embedding_lookup_grad/Cast (Cast) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam (VariableV2) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Assign (Assign) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/read (Identity) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1 (VariableV2) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Unique (Unique) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Shape (Shape) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice/stack (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice/stack_1 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice/stack_2 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice (StridedSlice) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/UnsortedSegmentSum (UnsortedSegmentSum) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub/x (Const) 
/device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Sqrt (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_1/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_1 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/truediv (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_2/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_2 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_1 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_2 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Assign (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/ScatterAdd (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_3 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_3/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_3 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_4 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_5 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Assign_1 (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/ScatterAdd_1 (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Sqrt_1 (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_6 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/add (AddV2) 
/device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/truediv_1 (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/AssignSub (AssignSub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/group_deps (NoOp) /device:GPU:0\n", - " save/Assign_3 (Assign) /device:GPU:0\n", - " save/Assign_43 (Assign) /device:GPU:0\n", - " save/Assign_44 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.783572: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "ReadVariableOp: CPU \n", - "Add: CPU \n", - "AssignVariableOp: CPU \n", - "Fill: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Enter: CPU \n", - "Sub: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/min (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/max (Const) \n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/sub (Sub) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform (Add) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Assign (AssignVariableOp) 
/device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_6 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_20 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_21 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.783746: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Fill: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "Sign: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "DiagPart: CPU \n", - "Transpose: CPU \n", - "Mul: CPU \n", - "Qr: CPU \n", - "VarIsInitializedOp: CPU \n", - "AssignVariableOp: CPU \n", - "Add: CPU \n", - "RandomStandardNormal: CPU \n", - "Reshape: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/mean (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/stddev (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/RandomStandardNormal (RandomStandardNormal) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal (Add) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Qr (Qr) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/DiagPart (DiagPart) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Sign (Sign) \n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose/perm (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose (Transpose) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Reshape/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Reshape (Reshape) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/mul_1/x (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/mul_1 (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/IsInitialized/VarIsInitializedOp 
(VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_7 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_22 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_23 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.783875: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "AssignVariableOp: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "ConcatV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/zeros (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/ones (Const) \n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/zeros_1 (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/concat/axis (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/concat (ConcatV2) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split_1/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1 (VarHandleOp) /device:GPU:0\n", - " 
training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/forward_lstm/bias/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_5 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_18 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_19 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.784037: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "ReadVariableOp: CPU \n", - "Add: CPU \n", - "AssignVariableOp: CPU \n", - "Fill: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Enter: CPU \n", - "Sub: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/min (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/max (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/sub (Sub) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform (Add) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " 
training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_3 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_14 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_15 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.784220: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Fill: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "Sign: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "DiagPart: CPU \n", - "Transpose: CPU \n", - "Mul: CPU \n", - "Qr: CPU \n", - "VarIsInitializedOp: CPU \n", - "AssignVariableOp: CPU \n", - "Add: CPU \n", - "RandomStandardNormal: CPU \n", - "Reshape: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/mean (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/stddev (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/RandomStandardNormal (RandomStandardNormal) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal (Add) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Qr (Qr) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/DiagPart (DiagPart) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Sign (Sign) \n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose/perm (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose (Transpose) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Reshape/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Reshape (Reshape) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/mul_1/x (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/mul_1 (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " 
training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_4 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_16 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_17 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.784374: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "AssignVariableOp: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "ConcatV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/zeros (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/ones (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/zeros_1 (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/concat/axis (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/concat (ConcatV2) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split_1/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/backward_lstm/bias/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_2 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_12 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_13 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.785092: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. 
Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " W/Initializer/random_uniform/shape (Const) \n", - " W/Initializer/random_uniform/min (Const) \n", - " W/Initializer/random_uniform/max (Const) \n", - " W/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " W/Initializer/random_uniform/sub (Sub) \n", - " W/Initializer/random_uniform/mul (Mul) \n", - " W/Initializer/random_uniform (Add) \n", - " W (VariableV2) /device:GPU:0\n", - " W/Assign (Assign) /device:GPU:0\n", - " W/read (Identity) /device:GPU:0\n", - " training_1/beta1_power/initial_value (Const) /device:GPU:0\n", - " training_1/beta1_power (VariableV2) /device:GPU:0\n", - " training_1/beta1_power/Assign (Assign) /device:GPU:0\n", - " training_1/beta1_power/read (Identity) /device:GPU:0\n", - " training_1/beta2_power/initial_value (Const) /device:GPU:0\n", - " training_1/beta2_power (VariableV2) /device:GPU:0\n", - " training_1/beta2_power/Assign (Assign) /device:GPU:0\n", - " training_1/beta2_power/read (Identity) /device:GPU:0\n", - " training/W/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/W/Adam/Initializer/zeros/Const (Const) 
/device:GPU:0\n", - " training/W/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/W/Adam (VariableV2) /device:GPU:0\n", - " training/W/Adam/Assign (Assign) /device:GPU:0\n", - " training/W/Adam/read (Identity) /device:GPU:0\n", - " training/W/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/W/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/W/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/W/Adam_1 (VariableV2) /device:GPU:0\n", - " training/W/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/W/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_W/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " training_1/Adam/mul (Mul) /device:GPU:0\n", - " training_1/Adam/Assign (Assign) /device:GPU:0\n", - " training_1/Adam/mul_1 (Mul) /device:GPU:0\n", - " training_1/Adam/Assign_1 (Assign) /device:GPU:0\n", - " save/Assign (Assign) /device:GPU:0\n", - " save/Assign_37 (Assign) /device:GPU:0\n", - " save/Assign_38 (Assign) /device:GPU:0\n", - " save/Assign_111 (Assign) /device:GPU:0\n", - " save/Assign_112 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.785269: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " b/Initializer/random_uniform/shape (Const) \n", - " b/Initializer/random_uniform/min (Const) \n", - " b/Initializer/random_uniform/max (Const) \n", - " b/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " b/Initializer/random_uniform/sub (Sub) \n", - " b/Initializer/random_uniform/mul (Mul) \n", - " b/Initializer/random_uniform (Add) \n", - " b (VariableV2) /device:GPU:0\n", - " b/Assign (Assign) /device:GPU:0\n", - " b/read (Identity) /device:GPU:0\n", - " training/b/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/b/Adam (VariableV2) /device:GPU:0\n", - " training/b/Adam/Assign (Assign) /device:GPU:0\n", - " training/b/Adam/read (Identity) /device:GPU:0\n", - " training/b/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/b/Adam_1 (VariableV2) /device:GPU:0\n", - " training/b/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/b/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_b/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_1 (Assign) /device:GPU:0\n", - " save/Assign_39 (Assign) /device:GPU:0\n", - " save/Assign_40 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.785537: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] 
Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_7 (Assign) /device:GPU:0\n", - " save/Assign_51 (Assign) /device:GPU:0\n", - " save/Assign_52 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.785668: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/bias/Initializer/Const (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_6 (Assign) /device:GPU:0\n", - " save/Assign_49 (Assign) /device:GPU:0\n", - " save/Assign_50 (Assign) /device:GPU:0\n", - "\n", - 
"2022-12-29 13:51:50.785793: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/read (Identity) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_9 (Assign) /device:GPU:0\n", - " save/Assign_55 (Assign) /device:GPU:0\n", - " save/Assign_56 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.786084: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_8 (Assign) /device:GPU:0\n", - " save/Assign_53 (Assign) /device:GPU:0\n", - " save/Assign_54 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.786246: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_10 (Assign) /device:GPU:0\n", - " save/Assign_57 (Assign) /device:GPU:0\n", - " save/Assign_58 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.786471: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Initializer/zeros (Fill) 
/device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_12 (Assign) /device:GPU:0\n", - " save/Assign_61 (Assign) /device:GPU:0\n", - " save/Assign_62 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.786615: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/bias/Initializer/Const (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_11 (Assign) /device:GPU:0\n", - " save/Assign_59 (Assign) /device:GPU:0\n", - " save/Assign_60 (Assign) 
/device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.786762: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/read (Identity) 
/device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_14 (Assign) /device:GPU:0\n", - " save/Assign_65 (Assign) /device:GPU:0\n", - " save/Assign_66 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.786902: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_13 (Assign) /device:GPU:0\n", - " save/Assign_63 (Assign) /device:GPU:0\n", - " save/Assign_64 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.787051: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_15 (Assign) /device:GPU:0\n", - " save/Assign_67 (Assign) /device:GPU:0\n", - " save/Assign_68 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.787258: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_17 (Assign) /device:GPU:0\n", - " save/Assign_71 (Assign) /device:GPU:0\n", - " save/Assign_72 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.787477: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/bias/Initializer/Const (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_16 (Assign) /device:GPU:0\n", - " save/Assign_69 (Assign) /device:GPU:0\n", - " save/Assign_70 (Assign) /device:GPU:0\n", - "\n", - 
"2022-12-29 13:51:50.787634: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/read (Identity) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_19 (Assign) /device:GPU:0\n", - " save/Assign_75 (Assign) /device:GPU:0\n", - " save/Assign_76 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.787766: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_18 (Assign) /device:GPU:0\n", - " save/Assign_73 (Assign) /device:GPU:0\n", - " save/Assign_74 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.787890: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_20 (Assign) /device:GPU:0\n", - " save/Assign_77 (Assign) /device:GPU:0\n", - " save/Assign_78 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.788057: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Initializer/zeros (Fill) 
/device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_22 (Assign) /device:GPU:0\n", - " save/Assign_81 (Assign) /device:GPU:0\n", - " save/Assign_82 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.788166: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/bias/Initializer/Const (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_21 (Assign) /device:GPU:0\n", - " save/Assign_79 (Assign) /device:GPU:0\n", - " save/Assign_80 (Assign) 
/device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.788286: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/read (Identity) 
/device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_24 (Assign) /device:GPU:0\n", - " save/Assign_85 (Assign) /device:GPU:0\n", - " save/Assign_86 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.788407: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_23 (Assign) /device:GPU:0\n", - " save/Assign_83 (Assign) /device:GPU:0\n", - " save/Assign_84 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.788521: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_25 (Assign) /device:GPU:0\n", - " save/Assign_87 (Assign) /device:GPU:0\n", - " save/Assign_88 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.788684: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_27 (Assign) /device:GPU:0\n", - " save/Assign_91 (Assign) /device:GPU:0\n", - " save/Assign_92 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.788796: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/bias/Initializer/Const (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_26 (Assign) /device:GPU:0\n", - " save/Assign_89 (Assign) /device:GPU:0\n", - " save/Assign_90 (Assign) /device:GPU:0\n", - "\n", - 
"2022-12-29 13:51:50.788918: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/read (Identity) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_29 (Assign) /device:GPU:0\n", - " save/Assign_95 (Assign) /device:GPU:0\n", - " save/Assign_96 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.789033: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_28 (Assign) /device:GPU:0\n", - " save/Assign_93 (Assign) /device:GPU:0\n", - " save/Assign_94 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.789159: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_30 (Assign) /device:GPU:0\n", - " save/Assign_97 (Assign) /device:GPU:0\n", - " save/Assign_98 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.789316: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Initializer/zeros (Fill) 
/device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_32 (Assign) /device:GPU:0\n", - " save/Assign_101 (Assign) /device:GPU:0\n", - " save/Assign_102 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.789420: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/bias/Initializer/Const (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_31 (Assign) /device:GPU:0\n", - " save/Assign_99 (Assign) /device:GPU:0\n", - " save/Assign_100 (Assign) 
/device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.789537: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/read (Identity) 
/device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_34 (Assign) /device:GPU:0\n", - " save/Assign_105 (Assign) /device:GPU:0\n", - " save/Assign_106 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.789657: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_33 (Assign) /device:GPU:0\n", - " save/Assign_103 (Assign) /device:GPU:0\n", - " save/Assign_104 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.789769: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_35 (Assign) /device:GPU:0\n", - " save/Assign_107 (Assign) /device:GPU:0\n", - " save/Assign_108 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.790054: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/W/Initializer/random_uniform/shape (Const) \n", - " context_repr/W/Initializer/random_uniform/min (Const) \n", - " context_repr/W/Initializer/random_uniform/max (Const) \n", - " context_repr/W/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/W/Initializer/random_uniform/sub (Sub) \n", - " context_repr/W/Initializer/random_uniform/mul (Mul) \n", - " context_repr/W/Initializer/random_uniform (Add) \n", - " context_repr/W (VariableV2) /device:GPU:0\n", - " context_repr/W/Assign (Assign) /device:GPU:0\n", - " context_repr/W/read (Identity) /device:GPU:0\n", - " training/context_repr/W/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/W/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/W/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/W/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/W/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/W/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/W/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/W/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " 
training/context_repr/W/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/W/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/W/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/W/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/W/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_4 (Assign) /device:GPU:0\n", - " save/Assign_45 (Assign) /device:GPU:0\n", - " save/Assign_46 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.790196: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/b/Initializer/random_uniform/shape (Const) \n", - " context_repr/b/Initializer/random_uniform/min (Const) \n", - " context_repr/b/Initializer/random_uniform/max (Const) \n", - " context_repr/b/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/b/Initializer/random_uniform/sub (Sub) \n", - " context_repr/b/Initializer/random_uniform/mul (Mul) \n", - " context_repr/b/Initializer/random_uniform 
(Add) \n", - " context_repr/b (VariableV2) /device:GPU:0\n", - " context_repr/b/Assign (Assign) /device:GPU:0\n", - " context_repr/b/read (Identity) /device:GPU:0\n", - " training/context_repr/b/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/b/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/b/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/b/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/b/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/b/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/b/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/b/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/b/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_5 (Assign) /device:GPU:0\n", - " save/Assign_47 (Assign) /device:GPU:0\n", - " save/Assign_48 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:50.790460: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Switch: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " inference/transition_params/Initializer/random_uniform/shape (Const) \n", - " inference/transition_params/Initializer/random_uniform/min (Const) \n", - " inference/transition_params/Initializer/random_uniform/max (Const) \n", - " inference/transition_params/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " inference/transition_params/Initializer/random_uniform/sub (Sub) \n", - " inference/transition_params/Initializer/random_uniform/mul (Mul) \n", - " inference/transition_params/Initializer/random_uniform (Add) \n", - " inference/transition_params (VariableV2) /device:GPU:0\n", - " inference/transition_params/Assign (Assign) /device:GPU:0\n", - " inference/transition_params/read (Identity) /device:GPU:0\n", - " inference/cond/Reshape_4/Switch (Switch) /device:GPU:0\n", - " inference/cond_1/ExpandDims/Switch (Switch) /device:GPU:0\n", - " inference/cond_2/ExpandDims_1/Switch (Switch) /device:GPU:0\n", - " training/inference/transition_params/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/inference/transition_params/Adam (VariableV2) /device:GPU:0\n", - " training/inference/transition_params/Adam/Assign (Assign) /device:GPU:0\n", - " training/inference/transition_params/Adam/read (Identity) 
/device:GPU:0\n", - " training/inference/transition_params/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/inference/transition_params/Adam_1 (VariableV2) /device:GPU:0\n", - " training/inference/transition_params/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/inference/transition_params/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_inference/transition_params/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_36 (Assign) /device:GPU:0\n", - " save/Assign_109 (Assign) /device:GPU:0\n", - " save/Assign_110 (Assign) /device:GPU:0\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "document = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", "\n", @@ -6721,16 +391,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Stage 21:=================================================> (6 + 1) / 7]\r" - ] - }, { "name": "stdout", "output_type": "stream", @@ -6738,18 +401,11 @@ "+-------------------------------------------------------------------------------------------------------+\n", "|result |\n", "+-------------------------------------------------------------------------------------------------------+\n", - "|[O, O, O, B-PER, O, O, O, O, B-LOC, O, B-LOC, I-LOC] |\n", + "|[O, O, O, B-PER, O, O, O, O, B-ORG, O, B-LOC, O] |\n", "|[B-PER, I-PER, O, O, O, O, O, O, O, B-LOC, I-LOC, O, B-PER, I-PER, O, O, O, O, O, O, O, O, B-LOC, O, O]|\n", "+-------------------------------------------------------------------------------------------------------+\n", "\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] } ], "source": [ @@ -6778,2025 +434,16 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-12-29 13:51:59.049574: W 
external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "AddV2: CPU \n", - "AssignSub: CPU \n", - "RealDiv: CPU \n", - "Shape: CPU \n", - "Unique: CPU \n", - "Cast: CPU \n", - "UnsortedSegmentSum: CPU \n", - "Add: CPU \n", - "GatherV2: CPU \n", - "StridedSlice: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "NoOp: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Sqrt: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "ScatterAdd: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/shape (Const) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/min (Const) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/max (Const) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/sub (Sub) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform/mul (Mul) \n", - " char_repr_cnn/char_embeddings/Initializer/random_uniform (Add) \n", - " char_repr_cnn/char_embeddings (VariableV2) /device:GPU:0\n", - " char_repr_cnn/char_embeddings/Assign (Assign) /device:GPU:0\n", - " 
char_repr_cnn/char_embeddings/read (Identity) /device:GPU:0\n", - " char_repr_cnn/embedding_lookup/axis (Const) /device:GPU:0\n", - " char_repr_cnn/embedding_lookup (GatherV2) /device:GPU:0\n", - " training_1/gradients/char_repr_cnn/embedding_lookup_grad/Shape (Const) /device:GPU:0\n", - " training_1/gradients/char_repr_cnn/embedding_lookup_grad/Cast (Cast) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam (VariableV2) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/Assign (Assign) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam/read (Identity) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1 (VariableV2) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/char_repr_cnn/char_embeddings/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Unique (Unique) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Shape (Shape) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice/stack (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice/stack_1 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice/stack_2 (Const) /device:GPU:0\n", - " 
training_1/Adam/update_char_repr_cnn/char_embeddings/strided_slice (StridedSlice) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/UnsortedSegmentSum (UnsortedSegmentSum) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Sqrt (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_1/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_1 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/truediv (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_2/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_2 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_1 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_2 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Assign (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/ScatterAdd (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_3 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_3/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/sub_3 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_4 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_5 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Assign_1 (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/ScatterAdd_1 (ScatterAdd) 
/device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/Sqrt_1 (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/mul_6 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/add (AddV2) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/truediv_1 (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/AssignSub (AssignSub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/char_embeddings/group_deps (NoOp) /device:GPU:0\n", - " save/Assign_2 (Assign) /device:GPU:0\n", - " save/Assign_41 (Assign) /device:GPU:0\n", - " save/Assign_42 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.049741: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "ReadVariableOp: CPU \n", - "AssignVariableOp: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "VarIsInitializedOp: CPU \n", - "Add: CPU \n", - "VarHandleOp: CPU \n", - "RandomUniform: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/shape (Const) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/min (Const) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/max (Const) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/sub (Sub) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform/mul (Mul) \n", - " char_repr_cnn/conv1d/kernel/Initializer/random_uniform (Add) \n", - " char_repr_cnn/conv1d/kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/conv1d/ExpandDims_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " 
training/char_repr_cnn/conv1d/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/conv1d/kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_1 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_10 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_11 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.049858: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "AssignVariableOp: CPU \n", - "VarIsInitializedOp: CPU \n", - "ReadVariableOp: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_cnn/conv1d/bias/Initializer/zeros (Const) \n", - " char_repr_cnn/conv1d/bias (VarHandleOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/bias/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/bias/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/bias/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_cnn/conv1d/BiasAdd/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/Assign 
(AssignVariableOp) /device:GPU:0\n", - " training/char_repr_cnn/conv1d/bias/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_cnn/conv1d/bias/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_8 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_9 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.050069: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "AddV2: CPU \n", - "AssignSub: CPU \n", - "RealDiv: CPU \n", - "Shape: CPU \n", - "Unique: CPU \n", - "Cast: CPU \n", - "UnsortedSegmentSum: CPU \n", - "Add: CPU \n", - "GatherV2: CPU \n", - "StridedSlice: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "NoOp: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Sqrt: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "ScatterAdd: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/shape (Const) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/min (Const) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/max (Const) \n", - 
" char_repr_lstm/char_embeddings/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/sub (Sub) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform/mul (Mul) \n", - " char_repr_lstm/char_embeddings/Initializer/random_uniform (Add) \n", - " char_repr_lstm/char_embeddings (VariableV2) /device:GPU:0\n", - " char_repr_lstm/char_embeddings/Assign (Assign) /device:GPU:0\n", - " char_repr_lstm/char_embeddings/read (Identity) /device:GPU:0\n", - " char_repr_lstm/embedding_lookup/axis (Const) /device:GPU:0\n", - " char_repr_lstm/embedding_lookup (GatherV2) /device:GPU:0\n", - " training_1/gradients/char_repr_lstm/embedding_lookup_grad/Shape (Const) /device:GPU:0\n", - " training_1/gradients/char_repr_lstm/embedding_lookup_grad/Cast (Cast) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam (VariableV2) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/Assign (Assign) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam/read (Identity) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1 (VariableV2) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/char_repr_lstm/char_embeddings/Adam_1/read (Identity) /device:GPU:0\n", - " 
training_1/Adam/update_char_repr_lstm/char_embeddings/Unique (Unique) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Shape (Shape) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice/stack (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice/stack_1 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice/stack_2 (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/strided_slice (StridedSlice) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/UnsortedSegmentSum (UnsortedSegmentSum) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Sqrt (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_1/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_1 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/truediv (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_2/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_2 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_1 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_2 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Assign (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/ScatterAdd (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_3 (Mul) /device:GPU:0\n", - " 
training_1/Adam/update_char_repr_lstm/char_embeddings/sub_3/x (Const) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/sub_3 (Sub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_4 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_5 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Assign_1 (Assign) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/ScatterAdd_1 (ScatterAdd) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/Sqrt_1 (Sqrt) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/mul_6 (Mul) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/add (AddV2) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/truediv_1 (RealDiv) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/AssignSub (AssignSub) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/char_embeddings/group_deps (NoOp) /device:GPU:0\n", - " save/Assign_3 (Assign) /device:GPU:0\n", - " save/Assign_43 (Assign) /device:GPU:0\n", - " save/Assign_44 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.050226: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "ReadVariableOp: CPU \n", - "Add: CPU \n", - "AssignVariableOp: CPU \n", - "Fill: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Enter: CPU \n", - "Sub: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/min (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/max (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/sub (Sub) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Initializer/random_uniform (Add) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/IsInitialized/VarIsInitializedOp 
(VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/forward_lstm/kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_6 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_20 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_21 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.050391: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Fill: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "Sign: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "DiagPart: CPU \n", - "Transpose: CPU \n", - "Mul: CPU \n", - "Qr: CPU \n", - "VarIsInitializedOp: CPU \n", - "AssignVariableOp: CPU \n", - "Add: CPU \n", - "RandomStandardNormal: CPU \n", - "Reshape: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/shape (Const) 
\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/mean (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/stddev (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/RandomStandardNormal (RandomStandardNormal) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/random_normal (Add) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Qr (Qr) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/DiagPart (DiagPart) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Sign (Sign) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose/perm (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose (Transpose) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Reshape/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/Reshape (Reshape) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/mul_1/x (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Initializer/mul_1 (Mul) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " 
training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/forward_lstm/recurrent_kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_7 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_22 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_23 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.050612: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "AssignVariableOp: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "ConcatV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/zeros (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/ones (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/zeros_1 (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/concat/axis (Const) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Initializer/concat (ConcatV2) \n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split_1/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/forward_lstm_1/while/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/forward_lstm/bias/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/forward_lstm/bias/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_5 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_18 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_19 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.050748: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. 
Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "ReadVariableOp: CPU \n", - "Add: CPU \n", - "AssignVariableOp: CPU \n", - "Fill: CPU \n", - "RandomUniform: CPU \n", - "Mul: CPU \n", - "Enter: CPU \n", - "Sub: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/min (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/max (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/sub (Sub) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Initializer/random_uniform (Add) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", 
- " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/backward_lstm/kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_3 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_14 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_15 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.050905: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Fill: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "Sign: CPU \n", - "VarHandleOp: CPU \n", - "Const: CPU \n", - "DiagPart: CPU \n", - "Transpose: CPU \n", - "Mul: CPU \n", - "Qr: CPU \n", - "VarIsInitializedOp: CPU \n", - "AssignVariableOp: CPU \n", - "Add: CPU \n", - "RandomStandardNormal: CPU \n", - "Reshape: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/mean (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/stddev (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/RandomStandardNormal (RandomStandardNormal) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/random_normal (Add) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Qr (Qr) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/DiagPart (DiagPart) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Sign (Sign) \n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/mul (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose/perm (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/matrix_transpose/transpose (Transpose) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Reshape/shape (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/Reshape (Reshape) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/mul_1/x (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Initializer/mul_1 (Mul) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp_1 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp_2 (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/ReadVariableOp_3 (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1 (VarHandleOp) /device:GPU:0\n", - " 
training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/backward_lstm/recurrent_kernel/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_4 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_16 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_17 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.051027: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ResourceApplyAdam: CPU \n", - "Enter: CPU \n", - "ReadVariableOp: CPU \n", - "AssignVariableOp: CPU \n", - "VarIsInitializedOp: CPU \n", - "VarHandleOp: CPU \n", - "ConcatV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/zeros (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/ones (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/zeros_1 (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/concat/axis (Const) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Initializer/concat (ConcatV2) \n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias (VarHandleOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Assign (AssignVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split_1/ReadVariableOp/Enter (Enter) /device:GPU:0\n", - " 
char_repr_lstm/sequential/bidirectional/backward_lstm_1/while/split_1/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1 (VarHandleOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/IsInitialized/VarIsInitializedOp (VarIsInitializedOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/Assign (AssignVariableOp) /device:GPU:0\n", - " training/char_repr_lstm/sequential/bidirectional/backward_lstm/bias/Adam_1/Read/ReadVariableOp (ReadVariableOp) /device:GPU:0\n", - " training_1/Adam/update_char_repr_lstm/sequential/bidirectional/backward_lstm/bias/ResourceApplyAdam (ResourceApplyAdam) /device:GPU:0\n", - " save/AssignVariableOp_2 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_12 (AssignVariableOp) /device:GPU:0\n", - " save/AssignVariableOp_13 (AssignVariableOp) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.051439: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. 
Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " W/Initializer/random_uniform/shape (Const) \n", - " W/Initializer/random_uniform/min (Const) \n", - " W/Initializer/random_uniform/max (Const) \n", - " W/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " W/Initializer/random_uniform/sub (Sub) \n", - " W/Initializer/random_uniform/mul (Mul) \n", - " W/Initializer/random_uniform (Add) \n", - " W (VariableV2) /device:GPU:0\n", - " W/Assign (Assign) /device:GPU:0\n", - " W/read (Identity) /device:GPU:0\n", - " training_1/beta1_power/initial_value (Const) /device:GPU:0\n", - " training_1/beta1_power (VariableV2) /device:GPU:0\n", - " training_1/beta1_power/Assign (Assign) /device:GPU:0\n", - " training_1/beta1_power/read (Identity) /device:GPU:0\n", - " training_1/beta2_power/initial_value (Const) /device:GPU:0\n", - " training_1/beta2_power (VariableV2) /device:GPU:0\n", - " training_1/beta2_power/Assign (Assign) /device:GPU:0\n", - " training_1/beta2_power/read (Identity) /device:GPU:0\n", - " training/W/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/W/Adam/Initializer/zeros/Const (Const) 
/device:GPU:0\n", - " training/W/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/W/Adam (VariableV2) /device:GPU:0\n", - " training/W/Adam/Assign (Assign) /device:GPU:0\n", - " training/W/Adam/read (Identity) /device:GPU:0\n", - " training/W/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/W/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/W/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/W/Adam_1 (VariableV2) /device:GPU:0\n", - " training/W/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/W/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_W/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " training_1/Adam/mul (Mul) /device:GPU:0\n", - " training_1/Adam/Assign (Assign) /device:GPU:0\n", - " training_1/Adam/mul_1 (Mul) /device:GPU:0\n", - " training_1/Adam/Assign_1 (Assign) /device:GPU:0\n", - " save/Assign (Assign) /device:GPU:0\n", - " save/Assign_37 (Assign) /device:GPU:0\n", - " save/Assign_38 (Assign) /device:GPU:0\n", - " save/Assign_111 (Assign) /device:GPU:0\n", - " save/Assign_112 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.051557: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " b/Initializer/random_uniform/shape (Const) \n", - " b/Initializer/random_uniform/min (Const) \n", - " b/Initializer/random_uniform/max (Const) \n", - " b/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " b/Initializer/random_uniform/sub (Sub) \n", - " b/Initializer/random_uniform/mul (Mul) \n", - " b/Initializer/random_uniform (Add) \n", - " b (VariableV2) /device:GPU:0\n", - " b/Assign (Assign) /device:GPU:0\n", - " b/read (Identity) /device:GPU:0\n", - " training/b/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/b/Adam (VariableV2) /device:GPU:0\n", - " training/b/Adam/Assign (Assign) /device:GPU:0\n", - " training/b/Adam/read (Identity) /device:GPU:0\n", - " training/b/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/b/Adam_1 (VariableV2) /device:GPU:0\n", - " training/b/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/b/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_b/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_1 (Assign) /device:GPU:0\n", - " save/Assign_39 (Assign) /device:GPU:0\n", - " save/Assign_40 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.051700: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] 
Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_7 (Assign) /device:GPU:0\n", - " save/Assign_51 (Assign) /device:GPU:0\n", - " save/Assign_52 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.051812: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/bias/Initializer/Const (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_6 (Assign) /device:GPU:0\n", - " save/Assign_49 (Assign) /device:GPU:0\n", - " save/Assign_50 (Assign) /device:GPU:0\n", - "\n", - 
"2022-12-29 13:51:59.051931: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_i_diag/read (Identity) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_9 (Assign) /device:GPU:0\n", - " save/Assign_55 (Assign) /device:GPU:0\n", - " save/Assign_56 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.052051: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_8 (Assign) /device:GPU:0\n", - " save/Assign_53 (Assign) /device:GPU:0\n", - " save/Assign_54 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.052170: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_10 (Assign) /device:GPU:0\n", - " save/Assign_57 (Assign) /device:GPU:0\n", - " save/Assign_58 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.052323: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Initializer/zeros (Fill) 
/device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_12 (Assign) /device:GPU:0\n", - " save/Assign_61 (Assign) /device:GPU:0\n", - " save/Assign_62 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.052435: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/bias/Initializer/Const (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_11 (Assign) /device:GPU:0\n", - " save/Assign_59 (Assign) /device:GPU:0\n", - " save/Assign_60 (Assign) 
/device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.052554: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/read (Identity) 
/device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_14 (Assign) /device:GPU:0\n", - " save/Assign_65 (Assign) /device:GPU:0\n", - " save/Assign_66 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.052671: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_13 (Assign) /device:GPU:0\n", - " save/Assign_63 (Assign) /device:GPU:0\n", - " save/Assign_64 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.052789: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-0/lstm_fused_cell_1/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_15 (Assign) /device:GPU:0\n", - " save/Assign_67 (Assign) /device:GPU:0\n", - " save/Assign_68 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.052943: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_17 (Assign) /device:GPU:0\n", - " save/Assign_71 (Assign) /device:GPU:0\n", - " save/Assign_72 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.053053: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/bias/Initializer/Const (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_16 (Assign) /device:GPU:0\n", - " save/Assign_69 (Assign) /device:GPU:0\n", - " save/Assign_70 (Assign) /device:GPU:0\n", - "\n", - 
"2022-12-29 13:51:59.053164: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_i_diag/read (Identity) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_19 (Assign) /device:GPU:0\n", - " save/Assign_75 (Assign) /device:GPU:0\n", - " save/Assign_76 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.053282: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_18 (Assign) /device:GPU:0\n", - " save/Assign_73 (Assign) /device:GPU:0\n", - " save/Assign_74 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.053399: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_20 (Assign) /device:GPU:0\n", - " save/Assign_77 (Assign) /device:GPU:0\n", - " save/Assign_78 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.053551: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Initializer/zeros (Fill) 
/device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_22 (Assign) /device:GPU:0\n", - " save/Assign_81 (Assign) /device:GPU:0\n", - " save/Assign_82 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.053662: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/bias/Initializer/Const (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_21 (Assign) /device:GPU:0\n", - " save/Assign_79 (Assign) /device:GPU:0\n", - " save/Assign_80 (Assign) 
/device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.053779: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/read (Identity) 
/device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_24 (Assign) /device:GPU:0\n", - " save/Assign_85 (Assign) /device:GPU:0\n", - " save/Assign_86 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.053907: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_23 (Assign) /device:GPU:0\n", - " save/Assign_83 (Assign) /device:GPU:0\n", - " save/Assign_84 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.054026: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-1/lstm_fused_cell_1/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_25 (Assign) /device:GPU:0\n", - " save/Assign_87 (Assign) /device:GPU:0\n", - " save/Assign_88 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.054183: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_27 (Assign) /device:GPU:0\n", - " save/Assign_91 (Assign) /device:GPU:0\n", - " save/Assign_92 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.054293: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/bias/Initializer/Const (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_26 (Assign) /device:GPU:0\n", - " save/Assign_89 (Assign) /device:GPU:0\n", - " save/Assign_90 (Assign) /device:GPU:0\n", - "\n", - 
"2022-12-29 13:51:59.054411: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_i_diag/read (Identity) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_29 (Assign) /device:GPU:0\n", - " save/Assign_95 (Assign) /device:GPU:0\n", - " save/Assign_96 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.054527: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_28 (Assign) /device:GPU:0\n", - " save/Assign_93 (Assign) /device:GPU:0\n", - " save/Assign_94 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.054644: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_30 (Assign) /device:GPU:0\n", - " save/Assign_97 (Assign) /device:GPU:0\n", - " save/Assign_98 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.054796: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/kernel/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Initializer/zeros (Fill) 
/device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/kernel/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/kernel/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_32 (Assign) /device:GPU:0\n", - " save/Assign_101 (Assign) /device:GPU:0\n", - " save/Assign_102 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.054905: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "Identity: CPU \n", - "ApplyAdam: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/bias/Initializer/Const (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/bias (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/bias/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/bias/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/bias/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/bias/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_31 (Assign) /device:GPU:0\n", - " save/Assign_99 (Assign) /device:GPU:0\n", - " save/Assign_100 (Assign) 
/device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.055016: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/read (Identity) 
/device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/w_i_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_34 (Assign) /device:GPU:0\n", - " save/Assign_105 (Assign) /device:GPU:0\n", - " save/Assign_106 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.055139: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/w_f_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_33 (Assign) /device:GPU:0\n", - " save/Assign_103 (Assign) /device:GPU:0\n", - " save/Assign_104 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.055257: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/shape (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/min (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/max (Const) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/sub (Sub) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform/mul (Mul) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Initializer/random_uniform (Add) \n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag (VariableV2) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Assign (Assign) /device:GPU:0\n", - " context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam/Assign (Assign) /device:GPU:0\n", - " 
training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/lstm-2/lstm_fused_cell_1/w_o_diag/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_35 (Assign) /device:GPU:0\n", - " save/Assign_107 (Assign) /device:GPU:0\n", - " save/Assign_108 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.055425: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Fill: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/W/Initializer/random_uniform/shape (Const) \n", - " context_repr/W/Initializer/random_uniform/min (Const) \n", - " context_repr/W/Initializer/random_uniform/max (Const) \n", - " context_repr/W/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/W/Initializer/random_uniform/sub (Sub) \n", - " context_repr/W/Initializer/random_uniform/mul (Mul) \n", - " context_repr/W/Initializer/random_uniform (Add) \n", - " context_repr/W (VariableV2) /device:GPU:0\n", - " context_repr/W/Assign (Assign) /device:GPU:0\n", - " context_repr/W/read (Identity) /device:GPU:0\n", - " training/context_repr/W/Adam/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/W/Adam/Initializer/zeros/Const (Const) /device:GPU:0\n", - " training/context_repr/W/Adam/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/W/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/W/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/W/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/W/Adam_1/Initializer/zeros/shape_as_tensor (Const) /device:GPU:0\n", - " training/context_repr/W/Adam_1/Initializer/zeros/Const (Const) /device:GPU:0\n", - " 
training/context_repr/W/Adam_1/Initializer/zeros (Fill) /device:GPU:0\n", - " training/context_repr/W/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/W/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/W/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/W/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_4 (Assign) /device:GPU:0\n", - " save/Assign_45 (Assign) /device:GPU:0\n", - " save/Assign_46 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.055544: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " context_repr/b/Initializer/random_uniform/shape (Const) \n", - " context_repr/b/Initializer/random_uniform/min (Const) \n", - " context_repr/b/Initializer/random_uniform/max (Const) \n", - " context_repr/b/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " context_repr/b/Initializer/random_uniform/sub (Sub) \n", - " context_repr/b/Initializer/random_uniform/mul (Mul) \n", - " context_repr/b/Initializer/random_uniform 
(Add) \n", - " context_repr/b (VariableV2) /device:GPU:0\n", - " context_repr/b/Assign (Assign) /device:GPU:0\n", - " context_repr/b/read (Identity) /device:GPU:0\n", - " training/context_repr/b/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/b/Adam (VariableV2) /device:GPU:0\n", - " training/context_repr/b/Adam/Assign (Assign) /device:GPU:0\n", - " training/context_repr/b/Adam/read (Identity) /device:GPU:0\n", - " training/context_repr/b/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/context_repr/b/Adam_1 (VariableV2) /device:GPU:0\n", - " training/context_repr/b/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/context_repr/b/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_context_repr/b/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_5 (Assign) /device:GPU:0\n", - " save/Assign_47 (Assign) /device:GPU:0\n", - " save/Assign_48 (Assign) /device:GPU:0\n", - "\n", - "2022-12-29 13:51:59.055678: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. 
Current candidate devices are [\n", - " /job:localhost/replica:0/task:0/device:CPU:0].\n", - "See below for details of this colocation group:\n", - "Colocation Debug Info:\n", - "Colocation group had the following types and supported devices: \n", - "Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]\n", - "ApplyAdam: CPU \n", - "Switch: CPU \n", - "Identity: CPU \n", - "Mul: CPU \n", - "Sub: CPU \n", - "Add: CPU \n", - "RandomUniform: CPU \n", - "Assign: CPU \n", - "VariableV2: CPU \n", - "Const: CPU \n", - "\n", - "Colocation members, user-requested devices, and framework assigned devices, if any:\n", - " inference/transition_params/Initializer/random_uniform/shape (Const) \n", - " inference/transition_params/Initializer/random_uniform/min (Const) \n", - " inference/transition_params/Initializer/random_uniform/max (Const) \n", - " inference/transition_params/Initializer/random_uniform/RandomUniform (RandomUniform) \n", - " inference/transition_params/Initializer/random_uniform/sub (Sub) \n", - " inference/transition_params/Initializer/random_uniform/mul (Mul) \n", - " inference/transition_params/Initializer/random_uniform (Add) \n", - " inference/transition_params (VariableV2) /device:GPU:0\n", - " inference/transition_params/Assign (Assign) /device:GPU:0\n", - " inference/transition_params/read (Identity) /device:GPU:0\n", - " inference/cond/Reshape_4/Switch (Switch) /device:GPU:0\n", - " inference/cond_1/ExpandDims/Switch (Switch) /device:GPU:0\n", - " inference/cond_2/ExpandDims_1/Switch (Switch) /device:GPU:0\n", - " training/inference/transition_params/Adam/Initializer/zeros (Const) /device:GPU:0\n", - " training/inference/transition_params/Adam (VariableV2) /device:GPU:0\n", - " training/inference/transition_params/Adam/Assign (Assign) /device:GPU:0\n", - " training/inference/transition_params/Adam/read (Identity) 
/device:GPU:0\n", - " training/inference/transition_params/Adam_1/Initializer/zeros (Const) /device:GPU:0\n", - " training/inference/transition_params/Adam_1 (VariableV2) /device:GPU:0\n", - " training/inference/transition_params/Adam_1/Assign (Assign) /device:GPU:0\n", - " training/inference/transition_params/Adam_1/read (Identity) /device:GPU:0\n", - " training_1/Adam/update_inference/transition_params/ApplyAdam (ApplyAdam) /device:GPU:0\n", - " save/Assign_36 (Assign) /device:GPU:0\n", - " save/Assign_109 (Assign) /device:GPU:0\n", - " save/Assign_110 (Assign) /device:GPU:0\n", - "\n" - ] - }, { "data": { "text/plain": [ - "Param(parent='Pipeline_ada025e49033', name='stages', doc='a list of pipeline stages')" + "Param(parent='Pipeline_704aa7f63c6f', name='stages', doc='a list of pipeline stages')" ] }, - "execution_count": 19, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -8812,16 +459,9 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Stage 47:=================================================> (6 + 1) / 7]\r" - ] - }, { "name": "stdout", "output_type": "stream", @@ -8829,18 +469,11 @@ "+-------------------------------------------------------------------------------------------------------+\n", "|result |\n", "+-------------------------------------------------------------------------------------------------------+\n", - "|[O, O, O, B-PER, O, O, O, O, B-LOC, O, B-LOC, I-LOC] |\n", + "|[O, O, O, B-PER, O, O, O, O, B-ORG, O, B-LOC, O] |\n", "|[B-PER, I-PER, O, O, O, O, O, O, O, B-LOC, I-LOC, O, B-PER, I-PER, O, O, O, O, O, O, O, O, B-LOC, O, O]|\n", "+-------------------------------------------------------------------------------------------------------+\n", "\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] } ], "source": [ @@ -8859,7 +492,7 @@ "provenance": [] }, 
"kernelspec": { - "display_name": "sparknlp", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -8872,13 +505,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "vscode": { - "interpreter": { - "hash": "b892d92fcc857cff1611a1b388f1d54f8b5970543d5ec3d14e16974e3049534d" - } + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/french/Train-Perceptron-French.ipynb b/examples/python/training/french/Train-Perceptron-French.ipynb index c4d20c8879984c..8c7de9049fba4c 100644 --- a/examples/python/training/french/Train-Perceptron-French.ipynb +++ b/examples/python/training/french/Train-Perceptron-French.ipynb @@ -1,82 +1,32 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "NcrfGVpLv2Xx" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/french/Train-Perceptron-French.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/french/Train-Perceptron-French.ipynb)\n", "\n", - "## 0. 
Colab Setup" + "# Train POS Tagger in French by Spark NLP\n", + "Based on Universal Dependency `UD_French-GSD` version 2.3\n" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 69235, - "status": "ok", - "timestamp": 1589640976843, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "AcKqfUfOwBoS", - "outputId": "cc3485b6-8288-4216-8865-ad2754406f73" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "openjdk version \"1.8.0_252\"\n", - "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", - "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 61kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 44.1MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 122kB 2.7MB/s \n", - "\u001B[?25h" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "0koUdx-qv2X3" - }, - "source": [ - "# Train POS Tagger in French by Spark NLP\n", - "### Based on Universal Dependency `UD_French-GSD` version 2.3\n" - ] - }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "EqaPFY67v2X5" - }, + "metadata": {}, "outputs": [], "source": [ "import sys\n", @@ -96,44 +46,22 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "IxoKOXacv2YG" - }, + "metadata": {}, "source": [ "### Let's create a Spark Session for our app" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 134964, - "status": "ok", - "timestamp": 1589641042606, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "h3JFtb73v2YI", - "outputId": "11b27317-0a3b-4f01-d8de-ae3f629323b2" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 2.5.0\n", - "Apache Spark version: 2.4.4\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -146,10 +74,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "jOW45P_Wv2YQ" - }, + "metadata": {}, "source": [ "Let's prepare our training datasets containing `token_posTag` like `de_DET`. 
You can download this data set from Amazon S3:\n", "\n", @@ -160,42 +85,24 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 204 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 138352, - "status": "ok", - "timestamp": 1589641046004, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "8E6rlnU3v2YR", - "outputId": "49b7045e-a871-429a-868d-9d4f9997aa8e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2020-05-16 14:57:22-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/fr/pos/UD_French/UD_French-GSD_2.3.txt\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.168.181\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.168.181|:443... connected.\n", + "--2023-02-20 18:11:19-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/fr/pos/UD_French/UD_French-GSD_2.3.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.171.189, 52.217.40.94, 52.217.140.64, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.171.189|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 3565213 (3.4M) [text/plain]\n", + "Length: 3565213 (3,4M) [text/plain]\n", "Saving to: ‘/tmp/UD_French-GSD_2.3.txt’\n", "\n", - "UD_French-GSD_2.3.t 100%[===================>] 3.40M 2.49MB/s in 1.4s \n", + "UD_French-GSD_2.3.t 100%[===================>] 3,40M 4,26MB/s in 0,8s \n", "\n", - "2020-05-16 14:57:25 (2.49 MB/s) - ‘/tmp/UD_French-GSD_2.3.txt’ saved [3565213/3565213]\n", + "2023-02-20 18:11:21 (4,26 MB/s) - ‘/tmp/UD_French-GSD_2.3.txt’ saved [3565213/3565213]\n", "\n" ] } @@ -207,11 +114,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "PgrS-fz7v2YY" - }, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.training import POS\n", @@ -227,27 +130,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 459 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 147751, - "status": "ok", - "timestamp": 1589641055414, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "3p1xcWIjv2Yf", - "outputId": "900066d2-12be-43fe-c8a7-84b09607d9a8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -256,26 +140,26 @@ "+--------------------+--------------------+--------------------+\n", "| text| document| tags|\n", "+--------------------+--------------------+--------------------+\n", - "|Les commotions cé...|[[document, 0, 11...|[[pos, 0, 2, DET,...|\n", - "|L' œuvre est situ...|[[document, 0, 82...|[[pos, 0, 1, DET,...|\n", - "|Le comportement d...|[[document, 0, 18...|[[pos, 0, 1, DET,...|\n", - "|Toutefois , les f...|[[document, 0, 44...|[[pos, 0, 8, ADV,...|\n", - "|Ismene entre et a...|[[document, 0, 80...|[[pos, 0, 5, PROP...|\n", - "|je reviendrais av...|[[document, 0, 28...|[[pos, 0, 1, PRON...|\n", - "|Les forfaits comp...|[[document, 0, 30...|[[pos, 0, 2, 
DET,...|\n", - "|Il prévient que d...|[[document, 0, 99...|[[pos, 0, 1, PRON...|\n", - "|Ils tiraient à ba...|[[document, 0, 43...|[[pos, 0, 2, PRON...|\n", - "|Le château est en...|[[document, 0, 44...|[[pos, 0, 1, DET,...|\n", - "|En effet , la bir...|[[document, 0, 10...|[[pos, 0, 1, ADP,...|\n", - "|Le point final de...|[[document, 0, 15...|[[pos, 0, 1, DET,...|\n", - "|L' information gé...|[[document, 0, 53...|[[pos, 0, 1, DET,...|\n", - "|Motivé par la cha...|[[document, 0, 21...|[[pos, 0, 5, VERB...|\n", - "|Il exploitait un ...|[[document, 0, 12...|[[pos, 0, 1, PRON...|\n", - "|Plus tard dans la...|[[document, 0, 84...|[[pos, 0, 3, ADV,...|\n", - "|Ils deviennent al...|[[document, 0, 97...|[[pos, 0, 2, PRON...|\n", - "|Le chevalier lui ...|[[document, 0, 17...|[[pos, 0, 1, DET,...|\n", - "|Créée au cours du...|[[document, 0, 15...|[[pos, 0, 4, VERB...|\n", - "|On ne peut éviter...|[[document, 0, 11...|[[pos, 0, 1, PRON...|\n", + "|Les commotions cé...|[{document, 0, 11...|[{pos, 0, 2, DET,...|\n", + "|L' œuvre est situ...|[{document, 0, 82...|[{pos, 0, 1, DET,...|\n", + "|Le comportement d...|[{document, 0, 18...|[{pos, 0, 1, DET,...|\n", + "|Toutefois , les f...|[{document, 0, 44...|[{pos, 0, 8, ADV,...|\n", + "|Ismene entre et a...|[{document, 0, 80...|[{pos, 0, 5, PROP...|\n", + "|je reviendrais av...|[{document, 0, 28...|[{pos, 0, 1, PRON...|\n", + "|Les forfaits comp...|[{document, 0, 30...|[{pos, 0, 2, DET,...|\n", + "|Il prévient que d...|[{document, 0, 99...|[{pos, 0, 1, PRON...|\n", + "|Ils tiraient à ba...|[{document, 0, 43...|[{pos, 0, 2, PRON...|\n", + "|Le château est en...|[{document, 0, 44...|[{pos, 0, 1, DET,...|\n", + "|En effet , la bir...|[{document, 0, 10...|[{pos, 0, 1, ADP,...|\n", + "|Le point final de...|[{document, 0, 15...|[{pos, 0, 1, DET,...|\n", + "|L' information gé...|[{document, 0, 53...|[{pos, 0, 1, DET,...|\n", + "|Motivé par la cha...|[{document, 0, 21...|[{pos, 0, 5, VERB...|\n", + "|Il exploitait un ...|[{document, 0, 
12...|[{pos, 0, 1, PRON...|\n", + "|Plus tard dans la...|[{document, 0, 84...|[{pos, 0, 3, ADV,...|\n", + "|Ils deviennent al...|[{document, 0, 97...|[{pos, 0, 2, PRON...|\n", + "|Le chevalier lui ...|[{document, 0, 17...|[{pos, 0, 1, DET,...|\n", + "|Créée au cours du...|[{document, 0, 15...|[{pos, 0, 4, VERB...|\n", + "|On ne peut éviter...|[{document, 0, 11...|[{pos, 0, 1, PRON...|\n", "+--------------------+--------------------+--------------------+\n", "only showing top 20 rows\n", "\n" @@ -289,11 +173,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "CBzSVba-v2Yr" - }, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler() \\\n", @@ -321,10 +201,10 @@ " .setInputCols([\"sentence\", \"token\"]) \\\n", " .setOutputCol(\"pos\") \\\n", " .setPosCol(\"tags\")\n", - " \n", + "\n", "pipeline = Pipeline(stages=[\n", - " document_assembler, \n", - " sentence_detector, \n", + " document_assembler,\n", + " sentence_detector,\n", " tokenizer,\n", " posTagger\n", "])" @@ -332,34 +212,15 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 308726, - "status": "ok", - "timestamp": 1589641216400, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "ozE0ZwKuv2Y2", - "outputId": "f5c08ff6-8d32-4ce2-fd21-76321bf0664a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 68.2 ms, sys: 17.6 ms, total: 85.8 ms\n", - "Wall time: 2min 40s\n" + "CPU times: user 37.5 ms, sys: 527 µs, total: 38 ms\n", + "Wall time: 2min 14s\n" ] } ], @@ -372,10 +233,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "PVkYWiTZv2Y_" - }, + "metadata": {}, 
"source": [ "This is our testing DataFrame where we get some sentences in French. We are going to use our trained Pipeline to transform these sentence and predict each token's `Part Of Speech`." ] @@ -383,11 +241,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "2vQq_Ps_v2ZA" - }, + "metadata": {}, "outputs": [], "source": [ "dfTest = spark.createDataFrame([\n", @@ -399,11 +253,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "gNF94YHDv2ZG" - }, + "metadata": {}, "outputs": [], "source": [ "predict = model.transform(dfTest)" @@ -411,27 +261,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 310172, - "status": "ok", - "timestamp": 1589641217862, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "o-CU0ituv2ZM", - "outputId": "8cb25702-ffdc-4b84-80cb-850c34fc4391" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -440,7 +271,7 @@ "+--------------------+--------------------+\n", "| result| result|\n", "+--------------------+--------------------+\n", - "|[Je, sens, qu'ent...|[PRON, NOUN, VERB...|\n", + "|[Je, sens, qu'ent...|[PRON, NOUN, ADP,...|\n", "|[On, pourra, touj...|[PRON, VERB, ADV,...|\n", "+--------------------+--------------------+\n", "\n" @@ -450,17 +281,6 @@ "source": [ "predict.select(\"token.result\", \"pos.result\").show()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "8mkCYL7tv2ZT" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -482,8 +302,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", 
- "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/italian/Train-Lemmatizer-Italian.ipynb b/examples/python/training/italian/Train-Lemmatizer-Italian.ipynb index cd9992420bdaa9..a548ddbae35adc 100644 --- a/examples/python/training/italian/Train-Lemmatizer-Italian.ipynb +++ b/examples/python/training/italian/Train-Lemmatizer-Italian.ipynb @@ -1,79 +1,30 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "auU7wIldvPcF" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/italian/Train-Lemmatizer-Italian.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Train-Lemmatizer-Italian.ipynb)\n", "\n", - "## 0. 
Colab Setup" + "# Training Lemmatizer Model in Italian language" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 61299, - "status": "ok", - "timestamp": 1589640760488, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "S5kucrZvvwwd", - "outputId": "396a902a-aeb8-4a05-ede8-89e1b152ffe4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "openjdk version \"1.8.0_252\"\n", - "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", - "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 53kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 34.7MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 122kB 2.8MB/s \n", - "\u001B[?25h" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "csJ5KdVVvPcH" - }, - "source": [ - "# Training Lemmatizer Model in Italian language" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "WpB04U6avPcJ" - }, + "metadata": {}, "source": [ "### A brief explaination about `Lemmatizer` annotator in Spark NLP:\n", "\n", @@ -96,10 +47,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "XCjl36FWvPcL" - }, + "metadata": {}, "source": [ "Let's import required libraries including `SQL` and `ML` from Spark and some annotators from 
Spark NLP" ] @@ -107,11 +55,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "MRr6Bm61vPcM" - }, + "metadata": {}, "outputs": [], "source": [ "#Spark ML and SQL\n", @@ -128,44 +72,22 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "oOusk1AUvPcX" - }, + "metadata": {}, "source": [ "### Let's create a Spark Session for our app" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 80225, - "status": "ok", - "timestamp": 1589640779435, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "vAbZzSPtvPcZ", - "outputId": "3004c45e-056f-4575-b95d-4a0236d1d115" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version: 2.5.0\n", - "Apache Spark version: 2.4.4\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -178,44 +100,24 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 204 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 81715, - "status": "ok", - "timestamp": 1589640780933, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "JWIFXFJzvPcl", - "outputId": "de373ad3-76f3-4393-af82-bde32cd72f85" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2020-05-16 14:52:59-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/it/lemma/dxc.technology/lemma_italian.txt\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 
52.216.131.53\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.131.53|:443... connected.\n", + "--2023-02-20 18:14:45-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/it/lemma/dxc.technology/lemma_italian.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.87.190, 52.217.68.38, 52.216.37.120, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.87.190|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", - "Length: 6900964 (6.6M) [text/plain]\n", + "Length: 6900964 (6,6M) [text/plain]\n", "Saving to: ‘/tmp/lemma_italian.txt’\n", "\n", - "\r\n", - "lemma_italian.txt 0%[ ] 0 --.-KB/s \r\n", - "lemma_italian.txt 100%[===================>] 6.58M --.-KB/s in 0.1s \n", + "lemma_italian.txt 100%[===================>] 6,58M 6,52MB/s in 1,0s \n", "\n", - "2020-05-16 14:52:59 (54.5 MB/s) - ‘/tmp/lemma_italian.txt’ saved [6900964/6900964]\n", + "2023-02-20 18:14:47 (6,52 MB/s) - ‘/tmp/lemma_italian.txt’ saved [6900964/6900964]\n", "\n" ] } @@ -226,10 +128,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "8eAxIqsdvPcu" - }, + "metadata": {}, "source": [ "### Now we are going to create a Spark NLP Pipeline by using Spark ML Pipeline natively" ] @@ -237,11 +136,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "r7OJidDrvPcw" - }, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler() \\\n", @@ -258,14 +153,14 @@ "normalizer = Normalizer() \\\n", " .setInputCols([\"token\"]) \\\n", " .setOutputCol(\"normal\")\n", - " \n", + "\n", "lemmatizer = Lemmatizer() \\\n", " .setInputCols([\"normal\"]) \\\n", " .setOutputCol(\"lemma\") \\\n", " .setDictionary(\n", " path = \"/tmp/lemma_italian.txt\",\n", " read_as = \"TEXT\",\n", - " key_delimiter = \"\\\\s+\", \n", + " key_delimiter = \"\\\\s+\",\n", " 
value_delimiter = \"->\"\n", " )\n", "pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, normalizer, lemmatizer])" @@ -273,37 +168,15 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "YS9EQmrqvPc3" - }, + "metadata": {}, "source": [ "Let's see how good our model does when it comes to prediction. We are going to create a DataFrame with Italian text for testing purposes and use `transform()` to predict." ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 95483, - "status": "ok", - "timestamp": 1589640794718, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "8GRinTw5vPc5", - "outputId": "9c35a6ab-4a79-4975-e73c-a22ae9dc6654" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -396,8 +269,8 @@ ], "source": [ "# Let's create a DataFrame with Italian text for testing our Spark NLP Pipeline\n", - "dfTest = spark.createDataFrame([\"Finchè non avevo la linea ADSL di fastweb potevo entrare nel router e configurare quelle pochissime cose configurabili (es. nome dei device), da ieri che ho avuto la linea niente è più configurabile...\", \n", - " \"L'uomo è insoddisfatto del prodotto.\", \n", + "dfTest = spark.createDataFrame([\"Finchè non avevo la linea ADSL di fastweb potevo entrare nel router e configurare quelle pochissime cose configurabili (es. 
nome dei device), da ieri che ho avuto la linea niente è più configurabile...\",\n", + " \"L'uomo è insoddisfatto del prodotto.\",\n", " \"La coppia contenta si abbraccia sulla spiaggia.\"], StringType()).toDF(\"text\")\n", "\n", "# Of course you can select multiple columns at the same time however, this way we see each annotator without truncating their results\n", @@ -411,25 +284,11 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "NBbjwO5dvPdC" - }, + "metadata": {}, "source": [ "### Credits \n", "We would like to thank `DXC.Technology` for sharing their Italian datasets and models with Spark NLP community. The datasets are used to train `Lemmatizer` and `SentimentDetector` Models." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "561WWW8ExMNH" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -453,8 +312,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/italian/Train-SentimentDetector-Italian.ipynb b/examples/python/training/italian/Train-SentimentDetector-Italian.ipynb index 06670adbf4d159..e6139f4f018bc7 100644 --- a/examples/python/training/italian/Train-SentimentDetector-Italian.ipynb +++ b/examples/python/training/italian/Train-SentimentDetector-Italian.ipynb @@ -1,79 +1,30 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "4MNtr_kFuh79" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/italian/Train-SentimentDetector-Italian.ipynb)\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Train-SentimentDetector-Italian.ipynb)\n", "\n", - "## 0. Colab Setup" + "# Training SentimentDetector Model in Italian language" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 61108, - "status": "ok", - "timestamp": 1589640756951, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "4Ih9NmzduqiG", - "outputId": "e2409f48-9c8f-4aec-e842-f3bb0c355f28" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "openjdk version \"1.8.0_252\"\n", - "OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)\n", - "OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)\n", - "\u001B[K |████████████████████████████████| 215.7MB 65kB/s \n", - "\u001B[K |████████████████████████████████| 204kB 44.2MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", - "\u001B[K |████████████████████████████████| 122kB 2.9MB/s \n", - "\u001B[?25h" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "HSj6T-enuh7-" - }, - "source": [ - "# Training SentimentDetector Model in Italian language" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "O1cDjGjZuh7_" - }, + "metadata": {}, "source": [ "### A brief explaination about `SentimentDetector` annotator in Spark NLP:\n", "\n", @@ -99,10 +50,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "btgGY2_Suh8A" - }, + "metadata": {}, "source": [ "Let's import required libraries including `SQL` and `ML` from Spark and some annotators from Spark NLP" ] @@ -110,11 +58,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "eeue1JUhuh8B" - }, + "metadata": {}, "outputs": [], "source": [ "#Spark ML and SQL\n", @@ -131,44 +75,22 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "kRFZfirruh8F" - }, + "metadata": {}, "source": [ "### Let's create a Spark Session for our app" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 80515, - "status": "ok", - "timestamp": 1589640776383, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "TjETa3RCuh8G", - "outputId": "b807ac03-aefa-49ba-d2a9-1e7925792016" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ - "Spark NLP version: 2.5.0\n", - "Apache Spark version: 2.4.4\n" + "Spark NLP version: 4.3.1\n", + "Apache Spark version: 3.3.0\n" ] } ], @@ -181,55 +103,31 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 391 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 83197, - "status": "ok", - "timestamp": 1589640779071, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "Z8Oft8Ahuh8J", - "outputId": "5322aca9-f75a-4dfe-fb4a-5ef4342aed6a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2020-05-16 14:52:56-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/it/lemma/dxc.technology/lemma_italian.txt\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.165.197\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.165.197|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 6900964 (6.6M) [text/plain]\n", - "Saving to: ‘/tmp/lemma_italian.txt’\n", + "--2023-02-20 18:15:24-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/it/lemma/dxc.technology/lemma_italian.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.132.189, 52.217.174.32, 52.216.242.22, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.132.189|:443... connected.\n", + "HTTP request sent, awaiting response... 304 Not Modified\n", + "File ‘/tmp/lemma_italian.txt’ not modified on server. 
Omitting download.\n", "\n", - "\r\n", - "lemma_italian.txt 0%[ ] 0 --.-KB/s \r\n", - "lemma_italian.txt 100%[===================>] 6.58M --.-KB/s in 0.1s \n", - "\n", - "2020-05-16 14:52:56 (58.7 MB/s) - ‘/tmp/lemma_italian.txt’ saved [6900964/6900964]\n", - "\n", - "--2020-05-16 14:52:57-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/it/sentiment/dxc.technology/sentiment_italian.txt\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.225.139\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.225.139|:443... connected.\n", + "--2023-02-20 18:15:25-- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/it/sentiment/dxc.technology/sentiment_italian.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.107.142, 54.231.234.0, 52.216.209.0, ...\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.107.142|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 349115 (341K) [text/plain]\n", "Saving to: ‘/tmp/sentiment_italian.txt’\n", "\n", - "sentiment_italian.t 100%[===================>] 340.93K --.-KB/s in 0.05s \n", + "sentiment_italian.t 100%[===================>] 340,93K 704KB/s in 0,5s \n", "\n", - "2020-05-16 14:52:58 (6.34 MB/s) - ‘/tmp/sentiment_italian.txt’ saved [349115/349115]\n", + "2023-02-20 18:15:26 (704 KB/s) - ‘/tmp/sentiment_italian.txt’ saved [349115/349115]\n", "\n" ] } @@ -241,10 +139,7 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "26-SwNfZuh8N" - }, + "metadata": {}, "source": [ "### Now we are going to create a Spark NLP Pipeline by using Spark ML Pipeline natively" ] @@ -252,11 +147,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "cmz3eA33uh8O" - }, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler() \\\n", @@ -273,14 +164,14 @@ "normalizer = Normalizer() \\\n", " .setInputCols([\"token\"]) \\\n", " .setOutputCol(\"normal\")\n", - " \n", + "\n", "lemmatizer = Lemmatizer() \\\n", " .setInputCols([\"normal\"]) \\\n", " .setOutputCol(\"lemma\") \\\n", " .setDictionary(\n", " path = \"/tmp/lemma_italian.txt\",\n", " read_as = \"TEXT\",\n", - " key_delimiter = \"\\\\s+\", \n", + " key_delimiter = \"\\\\s+\",\n", " value_delimiter = \"->\"\n", " )\n", "\n", @@ -297,47 +188,22 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "vNALNlGjuh8R" - }, + "metadata": {}, "source": [ "Now that we have our Spark NLP Pipeline, we can go ahead with training it by using `fit()`. Since we are using an external dataset to train our `Lemmatizer` and `SentimentDetector` models we don't need to pass any DataFrame with real data. We are going to create an empty DataFrame to just trigger the training." 
] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "MJAnjMnouh8S" - }, + "metadata": {}, "source": [ "Let's see how good our model does when it comes to prediction. We are going to create a DataFrame with Italian text for testing purposes and use `transform()` to predict." ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 100383, - "status": "ok", - "timestamp": 1589640796268, - "user": { - "displayName": "Christian Kasim Loan", - "photoUrl": "", - "userId": "14469489166467359317" - }, - "user_tz": -120 - }, - "id": "SSqzEFPZuh8T", - "outputId": "80cbdd47-80c8-4164-bcd3-f40f3f8a7e3a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -370,9 +236,9 @@ "+-------------------------------------+\n", "|sentiment_score |\n", "+-------------------------------------+\n", - "|[[sentiment, 0, 0, positive, [], []]]|\n", - "|[[sentiment, 0, 0, negative, [], []]]|\n", - "|[[sentiment, 0, 0, positive, [], []]]|\n", + "|[{sentiment, 0, 0, positive, {}, []}]|\n", + "|[{sentiment, 0, 0, negative, {}, []}]|\n", + "|[{sentiment, 0, 0, positive, {}, []}]|\n", "+-------------------------------------+\n", "\n", "root\n", @@ -449,8 +315,8 @@ ], "source": [ "# Let's create a DataFrame with Italian text for testing our Spark NLP Pipeline\n", - "dfTest = spark.createDataFrame([\"Finchè non avevo la linea ADSL di fastweb potevo entrare nel router e configurare quelle pochissime cose configurabili (es. nome dei device), da ieri che ho avuto la linea niente è più configurabile...\", \n", - " \"L'uomo è insoddisfatto del prodotto.\", \n", + "dfTest = spark.createDataFrame([\"Finchè non avevo la linea ADSL di fastweb potevo entrare nel router e configurare quelle pochissime cose configurabili (es. 
nome dei device), da ieri che ho avuto la linea niente è più configurabile...\",\n", + " \"L'uomo è insoddisfatto del prodotto.\",\n", " \"La coppia contenta si abbraccia sulla spiaggia.\"], StringType()).toDF(\"text\")\n", "\n", "# Of course you can select multiple columns at the same time however, this way we see each annotator without truncating their results\n", @@ -465,25 +331,11 @@ }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "a3IGYKq2uh8X" - }, + "metadata": {}, "source": [ "### Credits \n", "We would like to thank `DXC.Technology` for sharing their Italian datasets and models with Spark NLP community. The datasets are used to train `Lemmatizer` and `SentimentDetector` Models." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "F0m7N9WlxQU5" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -506,8 +358,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb b/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb index e94ef6b977cd25..f5ccec1c865ef3 100644 --- a/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb +++ b/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb @@ -1,20 +1,22 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "F1FAS4dwrLca" - }, + "metadata": {}, "source": [ - "

Noisy Channel Model Spell Checker - Training

\n", + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb)\n", + "\n", + "\n", + "# Noisy Channel Model Spell Checker - Training\n", "In this notebook we're going to learn how to train the Noisy Channel Model Spell Checker, a.k.a. ContextSpellChecker, as it can leverage context word information to produce corrections for each word." ] }, { "cell_type": "markdown", - "metadata": { - "id": "sB5BAhNBrLcc" - }, + "metadata": {}, "source": [ "## Italian Language Spell Checking\n", "This is a toy Italian Spell Checking Model used here to exemplify how to train a Spell Checker. It may require more work to become a real world model." @@ -22,60 +24,18 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "PwsHnOr5rLcd", - "outputId": "641d6b12-ed83-482a-d2d7-a56573558212", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 0 - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-12-23 11:10:17-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", - "--2022-12-23 11:10:17-- https://setup.johnsnowlabs.com/colab.sh\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", - "HTTP request sent, awaiting response... 
302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2022-12-23 11:10:18-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 0%[ ] 0 --.-KB/s Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2022-12-23 11:10:18 (61.6 MB/s) - written to stdout [1191/1191]\n", - "\n", - "\u001B[K |████████████████████████████████| 281.5 MB 48 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 61.5 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 48.8 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# This is only to setup PySpark and Spark NLP on Colab\n", + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "MeRuuA96rLcd" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -88,53 +48,40 @@ }, { "cell_type": "code", - "source": [ - "!wget https://clarin.eurac.edu/repository/xmlui/bitstream/handle/20.500.12124/3/paisa.raw.utf8.gz" - ], - "metadata": { - "id": "o6Pnr3N1sAiV", - "outputId": "ddbcc061-924c-40dd-bbca-4af1b6b2f30d", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 0 - } - }, - "execution_count": 4, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "--2022-12-23 11:13:29-- https://clarin.eurac.edu/repository/xmlui/bitstream/handle/20.500.12124/3/paisa.raw.utf8.gz\n", - "Resolving clarin.eurac.edu (clarin.eurac.edu)... 46.18.24.111\n", - "Connecting to clarin.eurac.edu (clarin.eurac.edu)|46.18.24.111|:443... connected.\n", + "--2023-02-20 18:16:37-- https://clarin.eurac.edu/repository/xmlui/bitstream/handle/20.500.12124/3/paisa.raw.utf8.gz\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving clarin.eurac.edu (clarin.eurac.edu)... 193.106.181.65\n", + "Connecting to clarin.eurac.edu (clarin.eurac.edu)|193.106.181.65|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 546911754 (522M) [application/gzip]\n", "Saving to: ‘paisa.raw.utf8.gz’\n", "\n", - "paisa.raw.utf8.gz 100%[===================>] 521.58M 6.88MB/s in 83s \n", + "paisa.raw.utf8.gz 100%[===================>] 521,58M 14,3MB/s in 2m 35s \n", "\n", - "2022-12-23 11:14:52 (6.32 MB/s) - ‘paisa.raw.utf8.gz’ saved [546911754/546911754]\n", + "2023-02-20 18:19:12 (3,37 MB/s) - ‘paisa.raw.utf8.gz’ saved [546911754/546911754]\n", "\n" ] } + ], + "source": [ + "!wget https://clarin.eurac.edu/repository/xmlui/bitstream/handle/20.500.12124/3/paisa.raw.utf8.gz" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "NvFHtFhwrLce", - "outputId": "a05429b2-6ac9-4b55-d493-ee207bf62d1b", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 0 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|value |\n", @@ -168,7 +115,7 @@ "source": [ "# Let's use the Paisa corpus\n", "from pyspark.sql.functions import *\n", - "paisaCorpusPath = \"/content/paisa.raw.utf8.gz\"\n", + "paisaCorpusPath = \"paisa.raw.utf8.gz\"\n", "\n", "\n", "# do some brief DS exploration, and preparation to get clean text\n", @@ -182,18 +129,10 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "qOnmpen2rLce", - "outputId": "592bad91-0cd8-43bf-d94c-8dbc74f8b329", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 0 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['Abaco',\n", @@ -1199,8 +1138,9 @@ " ...]" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 6 + "output_type": "execute_result" } ], "source": [ @@ -1218,10 +1158,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "Z4yplfIlrLcf" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "assembler = DocumentAssembler()\\\n", @@ -1247,10 +1185,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "43KLlrMPrLcf" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline(\n", @@ -1264,18 +1200,10 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "CE2ArBQzrLcf", - "outputId": "9257b503-0195-4297-bb2a-d0a821e66783", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 0 - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ 
"{'document': ['Ciiao! sono Glorea, ho laciato la patentte sul tabolo acanto alla fruta!'],\n", @@ -1311,8 +1239,9 @@ " '!']}" ] }, + "execution_count": null, "metadata": {}, - "execution_count": 12 + "output_type": "execute_result" } ], "source": [ @@ -1322,6 +1251,9 @@ } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -1336,11 +1268,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - }, - "colab": { - "provenance": [] + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/transformers/HuggingFace in Spark NLP - ALBERT.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - ALBERT.ipynb index ecd072a20cce45..5103aeef09a40a 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - ALBERT.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - ALBERT.ipynb @@ -1,20 +1,19 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "lshuevA3Qv-N" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20ALBERT.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20ALBERT.ipynb)" ] }, { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import ALBERT models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -26,9 +25,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": 
{}, "source": [ "## Export and Save HuggingFace model" ] @@ -36,9 +33,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -48,13 +43,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hHXgqiWpMfCY", - "outputId": "c169434e-4452-4658-f6b1-dc888b7a48a9" - }, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -62,9 +51,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [albert-base-v2](https://huggingface.co/albert-base-v2) model from HuggingFace as an example\n", @@ -75,48 +62,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 896, - "referenced_widgets": [ - "f1585b4c4f9f466f8730bce74110b248", - "b840f00b333849afa95375cab548e3b1", - "9ea0caac05284ef79323d5c88deb8d8c", - "6d5db72c01c043089b58fac2b64f3124", - "721903bf15304fcab9f4504e8dda9be3", - "c2aa3877c4d0409d889a7c486158f4a5", - "1ed9068ee045430a8ace1bf18df1a266", - "d79a00e434ff43f7942c34a117879d43", - "7ab0b2bcd5a34c8ea3bab32d8258bd64", - "ce771a6d9b64470aa5e2a500a8df47d3", - "a651c5e6c85242ff889601522ae76d42", - "6a22788a5471416c999b12e5e05abf33", - "46ee38864bae48feb3692578c266ee3f", - "db69f161d69640f8b3e24e6d9f9dbcf0", - "e912db9cdb8b4b5cbeecb955631478e1", - "4dbbca5cb9654d489a26989ae5a71de0", - "3addd8d30dbd4b3aaef0636b22164391", - "3fb78e1dafe44c4da48b17ff3dc0781d", - "c2e86bc083c242f994e797046d17e1e6", - "095c48e984894f38875fa02a2beae17b", - "95ad63ec7a2a4a62a7546dd1796a9f17", - "51f46f77ef6b4176a6b5e57b99dbfada", - "ce1f37f419f94d1c97eee5e5108833d3", - "d952452b82284cd79f588896b0018994", - "e5a44efb86404928b5922390c7a9a364", - "67739382be014038a3a877ccf7c916e0", - "c22f9ad149fb4a419a8c6423ea5f1ef9", - "60df2a80bfdb475884ea64f21c856bc6", - "cbdfa3aa241746a3974aa2e1687a7b55", - "2255bfc07a9e42eeab515ab6f7941213", - "d97a433c437c4a2584e2bbb1d49d5d6e", - "7cd30dde04874a0486c4a465e1805eb5" - ] - }, - "id": "ZaiirlSKNhVD", - "outputId": "c698963f-f62c-40c7-908c-3425b909e928" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import AlbertTokenizer, TFAlbertModel\n", @@ -154,23 +100,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - 
"execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "p2XCole7TTef", - "outputId": "696d74f0-6571-49a4-ca36-3b62d420af0c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -178,7 +116,7 @@ "text": [ "total 91360\n", "-rw-r--r-- 1 maziyar staff 792 Dec 13 14:41 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 13 14:41 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 13 14:41 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 46771352 Dec 13 14:41 tf_model.h5\n" ] } @@ -189,25 +127,19 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "r0DOGz8VUR-r", - "outputId": "ffe29d99-16b2-4ca1-9818-c210eb20ebbe" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 20080\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 13 14:41 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 13 14:41 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 55 Dec 13 14:41 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 24311 Dec 13 14:41 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 10249151 Dec 13 14:41 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 13 14:41 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 13 14:41 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -217,14 +149,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Mcm2UpNxUUQN", - "outputId": "e1d148ce-eced-48e9-80dc-d2332116a30d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -243,9 +169,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from 
`saved_model/1/` path\n", "- we also be needing `spiece.model` file from the tokenizer\n", @@ -254,10 +178,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# let's copy spiece.model file to saved_model/1/assets\n", @@ -266,18 +188,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save ALBERT in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -285,14 +203,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8tpW5nkMc53m", - "outputId": "bef364b4-86d8-4a6c-b552-0e332f5aecf7" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -309,19 +221,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "xGXPlbLdBvbm" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -331,9 +239,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `AlbertEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `AlbertEmbeddings` in runtime, so don't worry what you are setting them now\n", @@ -346,10 +252,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], 
"source": [ "from sparknlp.annotator import *\n", @@ -367,19 +271,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "albert.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -387,9 +287,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "4W2m4JuVDM3D" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] @@ -397,9 +295,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "CnUXH76ADSkL" - }, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -407,9 +303,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -419,13 +313,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ogpxSWxOXj3W", - "outputId": "f1cab05f-a9af-4e65-e7ab-7bc8ef74f54a" - }, + "metadata": {}, "outputs": [], "source": [ "! 
ls -l {MODEL_NAME}_spark_nlp" @@ -433,9 +321,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBERTa model 😊 " ] @@ -443,9 +329,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "metadata": {}, "outputs": [], "source": [ "albert_loaded = AlbertEmbeddings.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -457,14 +341,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "id": "pGRTNISyYlnO", - "outputId": "22ebf476-0d65-42a1-b0b2-9e203532c97f" - }, + "metadata": {}, "outputs": [], "source": [ "albert_loaded.getStorageRef()" @@ -472,21 +349,10 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of ALBERT models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ywzS9bwfLlI1" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -496,7 +362,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "transformers", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -509,17 +375,11 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" + "pygments_lexer": "ipython3" }, "nteract": { "version": "0.28.0" }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } - }, "widgets": { "application/vnd.jupyter.widget-state+json": { "095c48e984894f38875fa02a2beae17b": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - AlbertForQuestionAnswering.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - 
AlbertForQuestionAnswering.ipynb index 960a67c8c81399..030465444ec941 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - AlbertForQuestionAnswering.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - AlbertForQuestionAnswering.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20AlbertForQuestionAnswering.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20AlbertForQuestionAnswering.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import AlbertForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. 
This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -48,25 +43,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 97075, - "status": "ok", - "timestamp": 1640696490534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3e56840b-f4e1-4391-ce82-3d8136e8990c" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -74,9 +52,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [twmkn9/albert-base-v2-squad2](https://huggingface.co/twmkn9/albert-base-v2-squad2) model from HuggingFace as an example\n", @@ -85,105 +61,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 435, - "referenced_widgets": [ - "47dac9ef87fd4c5ca9a61d2cea256596", - "ce38947889204d1eb23c4a414d8e5208", - "2da64fb5519d420783cabae619f3b952", - "0784faf7b3784e2fb5856d8ca6248654", - "f2c8a9d039864796ad4495a3fc748b8a", - "4d41832a7c7f4ff6af11043759050846", - "97d4aab21aea4a30996a2399f7c58b1d", - "b0c3a334fc5c49f19a2911227190e18f", - "68e0a6c49a2d4fea8c81b8b1bfabfcd5", - "8fe11dbcbad6402ebb392316b90fbd4c", - "e6bfed8858df4404a958f9a0c5efdf61", - "b3cba7624d89414581b69a8804cdf5eb", - "6910684eaf584454b1b0b38da1851284", - "4771514aa5b44e5ea05f18aa6ef73008", - "1743adef69ba48b2a78e312121e1ff95", - "cf43d892dc5f45df80e87b77c378074e", - "19df597d10364f94b41991bfc4b0e039", - "1265068d2c4d4ff0b7ab480bd3fe2342", - "7ad895b923ad4fcfae33f38485d46690", - "f25af430b7c34f1b9cecb003aba253aa", - "a7d6155372a94ab185aa4d648603a677", - "1cca3cd83e4a48caa4ca67eb84e0d65c", - "85152c67f8424559a5b2334dce66b6c1", - "c03f7b608dbf416bb59626a47f4ec63e", - "a956903ad8194c4a9806f27ea0741773", - "5715e0c21cce4cee91a33e42beb48226", - "34ef44ce578847ca93e1e361ac6c6068", - "ffd12d9337cd4681afd51a74f77503f5", - "38e5d4d80eb1456e96fbaba2836e8030", - "5f4b9df77c6249c9874fb4cd7fc87962", - "d2ebd46bf924436cba4c7cdf8a666731", - "1fd718b370c8454bb4f63cd5d97e4649", - "beca0d66f4e94d8db677761102717623", - "7016f4970cbb46b99ee0b61f91529bc3", - "d04c456268b048ffbe3c00cccbf4390d", - "ebbbb05d599f451cb08a8dc6972a48bd", - "aa680bf2fba94b89819124d1764fd5fe", - "395fbcecbde042419bd7e0e99298b8a2", - "75812a9dedc343a9bacef9cb3ee1d8a0", - "69dc223e5de2449189995b7a116a0cc7", - "200aa3c11c1b4f2294935d5b91e844e3", - "f288ae4807364757b1f727e02c8d76b7", - 
"028bdbafc40e47c4bc7f1dda920630a7", - "c64ad3e7f7a9403f940367b8ffb4540e", - "cd1df8c0a9e64eab89d894ee0697f330", - "b601ce600b6b4b8a9d609487263f9d58", - "63d534091c114485a89af24ff0c3e574", - "c3c2541de6e34033b5298bd449c177ca", - "4bfda2c0b7fc4e96a7480c639ed2909b", - "983a3c073854484ca0c50ff238149ad7", - "10888dcf7383452e8e78475beed266de", - "edf6984a708b43b5ad25fb6b04f211a7", - "ac44ce9590df4690b1e1337eb5caf623", - "f3633266f7b84a8497936c2ef5b780fd", - "663cce4987904af48951a64093a47108", - "a3d2f9f8f9754f9b8134c52b7cfaca19", - "6637ecfad7594cac96e5bf703b6ab5da", - "0d3442a75c2b4a6082c9581ab0621592", - "86eadc1d973e4f6a9270fe934992d3f6", - "af52df20197b457882647e636171c83a", - "a6e2dfe0ca474d25b8f43506930a3798", - "a81ea939fe4d440cb6dcd2d87557579e", - "c0c856879cff4c29b8d45b0abfb94a22", - "0c8e5c545fa948b5bf26b7f3d2801dc1", - "118ef92501eb4c5f8c29323739516a1a", - "50ac811bc42b474d82eca728897dc596", - "b13f4e9eb777499ab6d5fc0ccaeac074", - "207abaeff8a94953a889804fc5e88b2d", - "6f13c00ef5f44adca80b0d5b9ce8c4d2", - "cae4eda19aed4598b3c97a3633c224d3", - "bf22edbb769d46abb23c352dc370f5ad", - "cf45db79df5241b1b579d765cd737953", - "0959fb1f18794a559ae6f1849a3eb5a9", - "620d95c4cdcd4f23ab17377da0485cf8", - "bdfbfe93e9cc4d878008d332f1c5860b", - "c2845632b7fb4b71b95b7eff29efb667", - "3b06e84b5b494bfd920ee661392967f5" - ] - }, - "executionInfo": { - "elapsed": 68690, - "status": "ok", - "timestamp": 1640696559216, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "2d42f5ad-db10-44de-b319-75a6309df876" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -244,34 +123,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are 
dealing with:" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1640696559217, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "441fca3b-ab35-4d49-d567-4da91e1ad528" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -279,7 +139,7 @@ "text": [ "total 86768\n", "-rw-r--r-- 1 maziyar staff 844 Dec 13 14:55 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 13 14:55 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 13 14:55 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 44417688 Dec 13 14:55 tf_model.h5\n" ] } @@ -290,36 +150,19 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 215, - "status": "ok", - "timestamp": 1640696559428, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "dad1fb58-d331-491f-a83d-ff002e88d079" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 20592\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 13 14:55 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 13 14:55 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 55 Dec 13 14:55 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 24513 Dec 13 14:55 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 10512223 Dec 13 14:55 saved_model.pb\n", - "drwxr-xr-x 4 
maziyar staff 128 Dec 13 14:55 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 13 14:55 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -329,25 +172,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 202, - "status": "ok", - "timestamp": 1640696559628, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "3b52acdf-5ecf-4582-9a6e-3ddc89bc487e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -366,9 +192,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `spiece.model` from the tokenizer\n", @@ -377,10 +201,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -390,34 +212,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! 
We have our `spiece.model` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 234, - "status": "ok", - "timestamp": 1640696560064, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "db11e138-f83f-4a0d-cab5-6c4dc1eaa4d4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -434,18 +237,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save AlbertForQuestionAnswering in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -453,25 +252,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 44473, - "status": "ok", - "timestamp": 1640696604534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "b956466b-03d6-4f56-88d4-28f920a6d113" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -488,19 +270,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, 
"outputs": [], "source": [ "import sparknlp\n", @@ -510,9 +288,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `AlbertForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `AlbertForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -523,10 +299,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -544,19 +318,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "spanClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -564,19 +334,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -584,9 +350,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -595,25 +359,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 2392, - "status": "ok", - "timestamp": 1640696670840, - "user": { - "displayName": 
"Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "995582ac-5e30-46ed-baef-1ad8a5387f30" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -622,8 +369,8 @@ "total 102416\n", "-rw-r--r-- 1 maziyar staff 51673091 Jun 16 10:16 albert_classification_tensorflow\n", "-rw-r--r-- 1 maziyar staff 760289 Jun 16 10:16 albert_spp\n", - "drwxr-xr-x 3 maziyar staff 96 Jun 16 10:16 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Jun 16 10:16 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 3 maziyar staff 96 Jun 16 10:16 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Jun 16 10:16 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -633,34 +380,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny AlbertForQuestionAnswering model in Spark NLP 🚀 pipeline! " ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11346, - "status": "ok", - "timestamp": 1640696711994, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "b7ffe817-c5ad-41b3-85b6-ad04aef16e65" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -697,9 +425,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of `AlbertForQuestionAnswering` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -712,7 +438,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3.6.10 ('transformers')", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -725,13 +451,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - AlbertForSequenceClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - AlbertForSequenceClassification.ipynb index ebb4ed46789c58..9f55f30dc4b11f 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - AlbertForSequenceClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - AlbertForSequenceClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20AlbertForSequenceClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20AlbertForSequenceClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import 
AlbertForSequenceClassification models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -49,24 +44,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 88323, - "status": "ok", - "timestamp": 1632137198631, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "hHXgqiWpMfCY", - "outputId": "8512d660-2c0f-40f4-da6b-9fda3d5543aa" - }, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -74,9 +52,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [mohsenfayyaz/albert-base-v2-toxicity](https://huggingface.co/mohsenfayyaz/albert-base-v2-toxicity) model from HuggingFace as an example\n", @@ -85,99 +61,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 756, - "referenced_widgets": [ - "e1fc0a4d6ec54a62aae134b855f9bf7a", - "1900a259fd4a4d098f2f1c5d40c525a9", - "4a3f42cc8c3d4ec2aad7d53997bb5ff5", - "bdb8a9f473b84c48a2798fb9124fed03", - "a94a622315e045d8a8028bbd6a5068a2", - "1b8a5def4d194653b3e0bec831eaaabf", - "6acbaf6a0a1140a3a6015a3f61d9898f", - "52af864ec283456989a74f2984660779", - "b0bda760efd449e48f11ba41260fa699", - "a979e306af0341d6899d85d6f5230a19", - "19075a2e10324843b5e8c3a4aa8e9e53", - "1f7865cccbdd43619ea98fd8f5a14f8b", - "b2c46dabf83f489bba962298e2ecb710", - "57b9d3e735c7486d872ea2b0b73a3677", - "c9100fded1ef49779deccb8f9fb24d5e", - "8c894774bdd544b9874f3ebdfd131146", - "ae287fa050b744adb89541976956a551", - "00ae5c6d386744f3b0589b95d8af1b94", - "6686a498f46d4945a77aa8471682c0d1", - "d79e3a5501e8441f82535c964261401b", - "098428e313f34f26a9e2720aa2dbf530", - "4d433874cb614632a9c1e60a805f681f", - "1e5d422af6d64492a8cb794f8de39ecb", - "2088dd75202942678c7e3d3099b0ecda", - "525a0dc4876c43cb9934453e83a071c5", - "0f10d12748dc46e980cea8fa9c810ed6", - "e6aa9ca934f541e0926ff8124dcbc52a", - "37f8a228dc314a0f9e316d4c76408e21", - "cd8a643829ba45639d0ab9c6d8261065", - "f9b622ef455a4678a7b4d04c37eaeaeb", - "205e28c15bfb4562bd1e57e2e38e55ca", - "48919d4cdc4343f083e61980230a3593", - "785c2b642c3d4e1dab9d668e9b265ad2", - "8b5863b19c5d4c1a88432a629d12a54d", - "4f8c267af7db4940bc885321aa1eff32", - "9a79361819774bd5a9ffc66f5d009be4", - "f0f4fe39bb684389898f0e5bb8befdd2", - "df2ac87d416d4e9491b405d87a6843c9", - "67468a34a00d4afda58deb4cd5f7ffb8", - "6234eafaa2854beabf40386e2dd14040", - "a1d0b0feb55947a2902610e4d1cc694a", - "b7b3988d15574c4eb3584272afb66061", - 
"f1fb22fd219f40f68625386b35ee7fde", - "e80e40e02c054ec99c239e366905259b", - "a5ab2ca68dc0459b9e027f113184ba50", - "95bd8ac5c4544ce7a826623d61cddf08", - "bb14c43fc8c04748b24bed0d3872b2aa", - "eb4f935d3ad947c29d2f9a1346238759", - "4cefc208ac634f218d7136c799e9b22c", - "94423182f8c940e3bc2c1f4353eab2f8", - "fe3e2e405c0543dda602cae3ec200cbc", - "9744ef999f49428fa5d43af1180712fb", - "e2be8dbfcdd34899b16f13ee9c5f3586", - "1ecb5f9d496a4e59b814a0fe81082746", - "ea769f21031d495bb46a4d8ade68658c", - "f514faa20bec40acb77e49005d7f8e34", - "ca77a1edc0b8401a83215fc7657acbf7", - "16c80bbca74a44afac6944ee3a5aba81", - "bb695d93b6c54f0fb83763de7270e10e", - "3cd33166f4be45c39257d55ab756b7c8", - "65587021eb3649a799f7d69117045216", - "c20fb03b71aa40e29a77d8f8bdf8043d", - "fa4fdc5ffe924af086161990c22f4f47", - "4d1275aab38546449db6a1eb22979031", - "7b464c4c2a14481ab9d3722b306a5d63", - "6f3582e6d41647898b1b00fef09ffbb4", - "e7b5df71fe094a5cb05e348cd0b8d92c", - "744aac1587df48ffa2999e05d28a7f31", - "7b9e6487f90546a49814d87749dd5956", - "6d8736cdaeee4010a45c554273ae37b1", - "6253c95b8aa9468f89b363cf1d43c6e6" - ] - }, - "executionInfo": { - "elapsed": 63652, - "status": "ok", - "timestamp": 1632137295438, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ZaiirlSKNhVD", - "outputId": "8b8f0fb9-37e5-4893-ada0-3fb15d851281" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -237,34 +122,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 18, - "status": "ok", - "timestamp": 1632137295439, - 
"user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "p2XCole7TTef", - "outputId": "67bd9f17-ba94-4940-9702-a717343a8fee" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -272,7 +138,7 @@ "text": [ "total 91384\n", "-rw-r--r-- 1 maziyar staff 914 Dec 13 15:05 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 13 15:05 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 13 15:05 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 46781688 Dec 13 15:05 tf_model.h5\n" ] } @@ -283,36 +149,19 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 288, - "status": "ok", - "timestamp": 1632137295723, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "r0DOGz8VUR-r", - "outputId": "3de08dd0-c2ae-43bb-d8fd-41b1f3ba9f47" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 20760\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 13 15:05 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 13 15:05 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 56 Dec 13 15:05 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 25976 Dec 13 15:05 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 10595381 Dec 13 15:05 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 13 15:05 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 13 15:05 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -322,25 +171,8 @@ }, { "cell_type": "code", - "execution_count": 
5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 281, - "status": "ok", - "timestamp": 1632137296002, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "2a94bc16-0d09-4cb3-e58d-e7a638b0a579" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -359,9 +191,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `spiece.model` file from the tokenizer\n", @@ -371,10 +201,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -385,10 +213,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -402,34 +228,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! 
We have our `vocab.txt` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 323, - "status": "ok", - "timestamp": 1628497252447, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "OYnT5U8N9dxT", - "outputId": "8d5068a4-0395-401a-fb19-0ed60300be1c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -447,18 +254,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save AlbertForSequenceClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -467,9 +270,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "8tpW5nkMc53m" - }, + "metadata": {}, "outputs": [], "source": [ "! 
wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" @@ -477,19 +278,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -499,9 +296,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `AlbertForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `AlbertForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -512,10 +307,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -530,19 +323,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -550,19 +339,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer 
{MODEL_NAME}" @@ -570,9 +355,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -581,25 +364,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 291, - "status": "ok", - "timestamp": 1632137856170, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ogpxSWxOXj3W", - "outputId": "35dea086-25d2-4029-df4e-663905aafd77" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -608,8 +374,8 @@ "total 113424\n", "-rw-r--r-- 1 maziyar staff 57307636 Dec 13 15:08 albert_classification_tensorflow\n", "-rw-r--r-- 1 maziyar staff 760289 Dec 13 15:08 albert_spp\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 13 15:08 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 13 15:08 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 13 15:08 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 13 15:08 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -619,19 +385,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny AlbertForSequenceClassification model 😊 " ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier_loaded = AlbertForSequenceClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -641,34 +403,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "BDWNWdBlBpHi" - }, + "metadata": {}, "source": [ "You can see 
what labels were used to train this model via `getClasses` function:" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 5, - "status": "ok", - "timestamp": 1632137863887, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "pGRTNISyYlnO", - "outputId": "60046377-bfd4-4c5e-e392-f78841e6bfe8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -676,7 +419,7 @@ "['Toxic', 'Non-Toxic']" ] }, - "execution_count": 15, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -688,20 +431,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "UvRBsP2SBpHi" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "MysnSyi8BpHi", - "outputId": "68177654-411f-4b93-a1fb-e0725b5e1f53" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -749,9 +487,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of `AlbertForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -764,7 +500,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "transformers", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -777,13 +513,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - AlbertForTokenClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - AlbertForTokenClassification.ipynb index ec7d6dcc4d7e38..735d030c2acccd 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - AlbertForTokenClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - AlbertForTokenClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20AlbertForTokenClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20AlbertForTokenClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import AlbertForTokenClassification models from 
HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -49,24 +44,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 88288, - "status": "ok", - "timestamp": 1640695444962, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "be6bb11a-70ce-401e-b4e0-19f6dd796154" - }, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -74,9 +52,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [HooshvareLab/albert-fa-zwnj-base-v2-ner](https://huggingface.co/HooshvareLab/albert-fa-zwnj-base-v2-ner) model from HuggingFace as an example\n", @@ -86,93 +62,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 767, - "referenced_widgets": [ - "4ef5957510234de7ba6cb11c7555863b", - "0e791c34a55d43089739280aa23ed1fa", - "51a76a06fbc14a3b87c3cfd67d33d675", - "cc752413569b4a0cac7c0f925a22f677", - "ddd724d35990407587315bf7070f6518", - "1f2fc5df1ecc4e1d92eb5b96f81012df", - "387d94630b7149188e0eb30d1156a60b", - "c2d84f843aac4da685004409a6777384", - "1e8b0bf57bc84b619502edcb2ea3efba", - "2aab7d79fff74a2baa724b54ce25408d", - "b08fcc61fa314e0ab0b54408943a7651", - "049a14ea49b44df781c2f7e8eca13cdf", - "78105296bbaf4079ababbbe1612cc83f", - "725aaf8a5f3f49edb0cdf528af973084", - "fd84d85ac4f6438abcfe92d878bd894f", - "2420ca2b0f8f419fab150e69d44c1b46", - "8c639bce11094ec4bf0e04d9780fc31e", - "3a555aa913684b5088b91b79960121b8", - "4660e83ce6204e51a57dfe266b6df936", - "654100853cbb467f856bd478ecbe9a12", - "be8d3efe78a44b3fa900ebac03f33184", - "4e510dd267d24ed58c36e59549b2d378", - "d001e623b96844aa82598bbef3537e07", - "67253e027be54a4a992090e9245916f0", - "50cc9f3dea3a44809c0198e22aa4ea44", - "9741da1012a3415dad6b8ae14e2fbc3b", - "d3430589d9764868ba6b959e1066ec0b", - "2873e282261d410ea2f167126fc4218a", - "0c67ab73ea4b49e0854278000babec6d", - "ea72f8b937e64f498b5ccd04d5ad7117", - "d50c0655317e4975b6832a17a9ef64a3", - "2e57ac334e534de2bd42d02c321d664e", - "a6b1b81f6c744de1ba0bb597e696584a", - "a59793e3372d42be8c04586311382af6", - "8d6fea7d91994de68d48eedc42dd40b6", - "f4d14d6f5aac49a8a0218ff55ecff072", - "609a35782d0f4685a04aff89e64bfbe3", - "ee8a5e15af394cb6a87a05b54f305f8b", - "58cab0c757ed4d6ab82baec4646b6f37", - "6d9ac41f841a435b9730c1e611e69f74", - "4b681a4e6ff04b609c01965be6716c75", - "c9b47607f3c947d08d24bb6cc7803d0d", - 
"d8bdf7f27cb54fd28cbb7ccec3448700", - "e2441687ecb74cec8b56cd1eceb09a3e", - "ad497f8cb01944c79a648104c80054ae", - "1dac85b3af134010be9ee9168697d4c0", - "b95effdd4f6a4b1a918323ba8d5e115f", - "d95df1f558a04b0eaf7aaf2f3ec48c67", - "200d7e7c610c4c858f6b3e546cb5fdb1", - "76b2b83e7e0041b09b87447372dfbd1a", - "eb4d95f4faf5499e8d6848c71f3b0b6a", - "c0ea737afa0f4cd9bdfdde96ecd7557d", - "8f4924a858b84a9dad4bcc96aa2578a1", - "3e1ec8813e104de0b908ab88b8c4ea4b", - "1a9f7ae6548a4bce9d42cbbc92ec4f54", - "1c680a7aadfc487eb0854f3daf9bd086", - "041aa85f204042f8a51b25db0bfc30cc", - "14a4326625ae448480adfe4151322f9a", - "652c066d3bda462d82172f02e8e84c91", - "2fae823294414ad0af722e4f470e82af", - "6282cac0414b4f8fb7982f4aa7b80960", - "8e0cf16fadfb4aca89fc866e58bea180", - "2a87325981e942a48dc977ac36e0151d", - "c221b86fc7d748a5b3cdb7b5f34dd26c", - "86ebfc928ea14fd2b809bf607cc51a27", - "040711c2debe4b19908b722f1ae0cf3b" - ] - }, - "executionInfo": { - "elapsed": 74259, - "status": "ok", - "timestamp": 1640695519215, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "ff73d7be-3bde-482c-8d4e-020404235a82" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import TFAlbertForTokenClassification, AlbertTokenizer \n", @@ -210,34 +100,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 16, - "status": "ok", - "timestamp": 1640695519217, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", 
- "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "44c2ccaf-739c-4152-c30b-9e943b75b78f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -245,7 +116,7 @@ "text": [ "total 86880\n", "-rw-r--r-- 1 maziyar staff 1630 Dec 13 15:34 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 13 15:34 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 13 15:34 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 44476132 Dec 13 15:34 tf_model.h5\n" ] } @@ -256,36 +127,19 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1640695519217, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "4e2eb773-00ef-4d38-e95f-2aadc9bd9efc" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 20672\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 13 15:34 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 13 15:34 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 55 Dec 13 15:34 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 26521 Dec 13 15:34 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 10548600 Dec 13 15:34 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 13 15:34 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 13 15:34 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -295,25 +149,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 984, - "status": "ok", - "timestamp": 
1640695520194, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "d632d01b-bcc9-44eb-d50e-5796687d1e63" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -332,9 +169,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `spiece.model` file from the tokenizer\n", @@ -344,10 +179,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -358,10 +191,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -375,34 +206,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! 
We have our `vocab.txt` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 8, - "status": "ok", - "timestamp": 1640695520195, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "e374e99e-1f86-4946-ee64-fd7164ed95ae" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -420,18 +232,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save AlbertForTokenClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -439,25 +247,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 47186, - "status": "ok", - "timestamp": 1640695567377, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "298d8e98-ee5f-4ba0-e6b6-ff7fc939b51a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -474,19 +265,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + 
"metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -496,9 +283,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `AlbertForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `AlbertForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -509,10 +294,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -528,19 +311,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -548,19 +327,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -568,9 +343,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -579,25 +352,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 6, - "status": "ok", - "timestamp": 1640695612442, - "user": { - 
"displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "97cf51ff-4d8f-4cca-e4f8-665999d79795" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -606,8 +362,8 @@ "total 109024\n", "-rw-r--r-- 1 maziyar staff 54957061 Dec 13 15:35 albert_classification_tensorflow\n", "-rw-r--r-- 1 maziyar staff 857476 Dec 13 15:35 albert_spp\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 13 15:35 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 13 15:35 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 13 15:35 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 13 15:35 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -617,19 +373,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny AlbertForTokenClassification model 😊 " ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier_loaded = AlbertForTokenClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -639,19 +391,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "BDWNWdBlBpHi" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "pGRTNISyYlnO" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -679,7 +427,7 @@ " 'B-PER']" ] }, - "execution_count": 15, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -691,34 +439,15 @@ }, { "cell_type": 
"markdown", - "metadata": { - "id": "UvRBsP2SBpHi" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11715, - "status": "ok", - "timestamp": 1640695706867, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "c824f8aa-9e22-4602-eb47-15d94b105487" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -765,9 +494,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `AlbertForTokenClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -781,7 +508,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "transformers", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -794,13 +521,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - BERT Sentence.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - BERT Sentence.ipynb index 04df91bc226d39..5b5e71922cc0cd 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - BERT Sentence.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - BERT Sentence.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - 
"metadata": { - "id": "hxcz5bMVPpY9" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BERT%20Sentence.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BERT%20Sentence.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import BERT models for Sentence Embeddings from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -25,18 +24,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.4.1` version and Transformers on `4.6.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." 
@@ -45,37 +40,20 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 76870, - "status": "ok", - "timestamp": 1622969047782, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "hHXgqiWpMfCY", - "outputId": "815f7900-4404-4d22-ca67-e16daae2d6f6" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[K |████████████████████████████████| 2.3MB 7.6MB/s \n", - "\u001B[K |████████████████████████████████| 394.3MB 40kB/s \n", - "\u001B[K |████████████████████████████████| 901kB 9.4MB/s \n", - "\u001B[K |████████████████████████████████| 3.3MB 35.2MB/s \n", - "\u001B[K |████████████████████████████████| 2.9MB 24.4MB/s \n", - "\u001B[K |████████████████████████████████| 3.8MB 30.9MB/s \n", - "\u001B[K |████████████████████████████████| 471kB 26.3MB/s \n", - "\u001B[?25h" + "\u001b[K |████████████████████████████████| 2.3MB 7.6MB/s \n", + "\u001b[K |████████████████████████████████| 394.3MB 40kB/s \n", + "\u001b[K |████████████████████████████████| 901kB 9.4MB/s \n", + "\u001b[K |████████████████████████████████| 3.3MB 35.2MB/s \n", + "\u001b[K |████████████████████████████████| 2.9MB 24.4MB/s \n", + "\u001b[K |████████████████████████████████| 3.8MB 30.9MB/s \n", + "\u001b[K |████████████████████████████████| 471kB 26.3MB/s \n", + "\u001b[?25h" ] } ], @@ -85,9 +63,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [bert-base-cased](https://huggingface.co/bert-base-cased) model from HuggingFace as an example\n", @@ -97,67 +73,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 945, - "referenced_widgets": [ - "6b4b0af819504253bfa403fbcf886c76", - "0cd68f80c32e43a4b42fdb7c215daba3", - "3dc29b500eb34822b649758c4a1a4812", - "9b6b3220772c41d89ed83ebd814595ff", - "89f2bf742ca34444bc4d2c9725e422b1", - "b362860014a945f4a43c81841ce0cbcf", - "7b67b9505cdb4842b5dcb48d34e893bc", - "d7c23f698b414f8588c93685ee464f16", - "2a9aa644d7bc4b589cc3cc39a2cac2a3", - "28e3c17dfabd4a9a8603100be41201de", - "822157e16ddc4ca0a807e2989b919f75", - "ebe7b573fb74455eb0703865afa2c896", - "6b7af6bc6c274a15b656df2cca539ecd", - "1685275e0cba434ab143fde811976f92", - "ec1719bf857540e7b17dd190fdda2e61", - "90950ba90d76416781ba4ac2924c41d1", - "67fb63939c574454a40a95afad32dbf2", - "4f5584cde0264771aea2cf2c1e86f4dd", - "7a460da85a004f1ea1cc36f253e89d85", - "7ebe916f1f0448aa8bdb3647ed742b4f", - "bfd38a985b8d460ca1ce131b5c84a4ea", - "7653e1a02c1b41fb8baf4c7b20012f10", - "69e8996566864a36a95d0dcc29a12287", - "26fe51cae569491382573a7301214e17", - "9bdfd260e6d245fcbb0f78c18e989fd3", - "d0f4c1a9898246f3a2ce6e3496533c89", - "36ae40c995ab4604b365a40bcde90bcd", - "7c3383c3141340e7baf208fc094f8a71", - "8efde65358a74240935b37e26c1dc41d", - "1547700638314d5eab21a6de053c3fba", - "a4034d6c83cc434b976d3bf0c22bb714", - "7d29703bb1384e0b95005efdf2b5d1db", - "55bb99310d4943589740af2ad5f6b5a5", - "d4c4cc8fa224474bb0147e6b51ffa034", - "e9df5614f1044db6a69c5a453b7f0d88", - "e763035ee50d4f0db40f449a1877349b", - "b6a2b1a6995d47eb80eaf70142745796", - "b721d77e3c0b488b886835a85286f9a7", - "83ffae5117354f49a5f36214e89e3d87", - "b50ad654c51b4c599bf41cf7e8837f1d" - ] - }, - "executionInfo": { - "elapsed": 92238, - "status": "ok", - "timestamp": 1622969140017, - "user": { - "displayName": "Maziyar 
Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ZaiirlSKNhVD", - "outputId": "1605a730-3a83-43f6-8a9b-96642e96a369" - }, + "metadata": {}, "outputs": [ { "data": { @@ -365,9 +281,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] @@ -375,24 +289,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1622969140021, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "p2XCole7TTef", - "outputId": "5e39c715-1077-4c09-fa7e-f2519731f817" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -412,24 +309,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 403, - "status": "ok", - "timestamp": 1622969140419, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "r0DOGz8VUR-r", - "outputId": "d76f4807-2aca-4884-f874-506dc31b6170" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -449,24 +329,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 9, - "status": "ok", - "timestamp": 1622969140420, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": 
"https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "4cb5c032-e0bc-4150-b78a-ff3918e2c8ae" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -485,9 +348,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `vocab.txt` from the tokenizer\n", @@ -497,9 +358,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "metadata": {}, "outputs": [], "source": [ "!cp {MODEL_NAME}_tokenizer/vocab.txt {MODEL_NAME}/saved_model/1/assets" @@ -507,9 +366,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save BERT in Spark NLP for Sentence/Document embeddings\n", "### `BertSentenceEmbeddings` is the equivalent annotator for this task\n", @@ -518,9 +375,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -529,9 +384,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "8tpW5nkMc53m" - }, + "metadata": {}, "outputs": [], "source": [ "! 
wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" @@ -539,9 +392,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] @@ -549,9 +400,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -561,9 +410,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `BertSentenceEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Unlike `BertEmbeddings` which uses `last_hidden_state` with (-1, -1, DIMENSION) shape, `BertSentenceEmbeddings` will use `pooler_output` with (-1, DIMENSION) shape for Sentence/Document embeddings. It will generate 1 vector for the entire sentence/document\n", @@ -578,9 +425,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "8W_almibVRTj" - }, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -598,9 +443,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] @@ -608,9 +451,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "metadata": {}, "outputs": [], "source": [ "sent_bert.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -618,9 +459,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] @@ -628,9 +467,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ 
-638,9 +475,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -650,24 +485,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 515, - "status": "ok", - "timestamp": 1622379928489, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ogpxSWxOXj3W", - "outputId": "27c4c504-dee4-4acd-b1a7-c0bd64623130" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -686,9 +504,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BERT model 😊 " ] @@ -696,9 +512,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "metadata": {}, "outputs": [], "source": [ "sent_bert_loaded = BertSentenceEmbeddings.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -710,25 +524,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "executionInfo": { - "elapsed": 18, - "status": "ok", - "timestamp": 1622969700523, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "pGRTNISyYlnO", - "outputId": "219c97c8-9fd5-4290-d631-e6559726d8c2" - }, + "metadata": {}, "outputs": [ { "data": { @@ -739,7 +535,7 @@ "'sent_bert_base_cased'" ] }, - "execution_count": 14, + "execution_count": null, "metadata": { "tags": [] }, @@ -752,9 +548,7 @@ }, { "cell_type": "markdown", - 
"metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of BERT models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -780,13 +574,13 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "0cd68f80c32e43a4b42fdb7c215daba3": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -834,11 +628,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "1547700638314d5eab21a6de053c3fba": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -886,11 +680,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "1685275e0cba434ab143fde811976f92": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -938,11 +732,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "26fe51cae569491382573a7301214e17": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -990,11 +784,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "28e3c17dfabd4a9a8603100be41201de": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1042,11 +836,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, 
"2a9aa644d7bc4b589cc3cc39a2cac2a3": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], @@ -1063,11 +857,11 @@ "IPY_MODEL_ebe7b573fb74455eb0703865afa2c896" ], "layout": "IPY_MODEL_28e3c17dfabd4a9a8603100be41201de" - }, - "model_module_version": "1.5.0" + } }, "36ae40c995ab4604b365a40bcde90bcd": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], @@ -1087,11 +881,11 @@ "orientation": "horizontal", "style": "IPY_MODEL_8efde65358a74240935b37e26c1dc41d", "value": 570 - }, - "model_module_version": "1.5.0" + } }, "3dc29b500eb34822b649758c4a1a4812": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], @@ -1111,11 +905,11 @@ "orientation": "horizontal", "style": "IPY_MODEL_89f2bf742ca34444bc4d2c9725e422b1", "value": 213450 - }, - "model_module_version": "1.5.0" + } }, "4f5584cde0264771aea2cf2c1e86f4dd": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1163,11 +957,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "55bb99310d4943589740af2ad5f6b5a5": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], @@ -1184,11 +978,11 @@ "IPY_MODEL_e763035ee50d4f0db40f449a1877349b" ], "layout": "IPY_MODEL_d4c4cc8fa224474bb0147e6b51ffa034" - }, - "model_module_version": "1.5.0" + } }, "67fb63939c574454a40a95afad32dbf2": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], @@ -1205,11 +999,11 @@ "IPY_MODEL_7ebe916f1f0448aa8bdb3647ed742b4f" ], "layout": 
"IPY_MODEL_4f5584cde0264771aea2cf2c1e86f4dd" - }, - "model_module_version": "1.5.0" + } }, "69e8996566864a36a95d0dcc29a12287": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1220,11 +1014,11 @@ "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" - }, - "model_module_version": "1.5.0" + } }, "6b4b0af819504253bfa403fbcf886c76": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], @@ -1241,11 +1035,11 @@ "IPY_MODEL_9b6b3220772c41d89ed83ebd814595ff" ], "layout": "IPY_MODEL_0cd68f80c32e43a4b42fdb7c215daba3" - }, - "model_module_version": "1.5.0" + } }, "6b7af6bc6c274a15b656df2cca539ecd": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1257,11 +1051,11 @@ "_view_name": "StyleView", "bar_color": null, "description_width": "initial" - }, - "model_module_version": "1.5.0" + } }, "7653e1a02c1b41fb8baf4c7b20012f10": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1309,11 +1103,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "7a460da85a004f1ea1cc36f253e89d85": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], @@ -1333,11 +1127,11 @@ "orientation": "horizontal", "style": "IPY_MODEL_bfd38a985b8d460ca1ce131b5c84a4ea", "value": 435797 - }, - "model_module_version": "1.5.0" + } }, "7b67b9505cdb4842b5dcb48d34e893bc": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": 
{ "_model_module": "@jupyter-widgets/controls", @@ -1348,11 +1142,11 @@ "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" - }, - "model_module_version": "1.5.0" + } }, "7c3383c3141340e7baf208fc094f8a71": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], @@ -1369,11 +1163,11 @@ "placeholder": "​", "style": "IPY_MODEL_a4034d6c83cc434b976d3bf0c22bb714", "value": " 570/570 [00:01<00:00, 510B/s]" - }, - "model_module_version": "1.5.0" + } }, "7d29703bb1384e0b95005efdf2b5d1db": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1421,11 +1215,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "7ebe916f1f0448aa8bdb3647ed742b4f": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], @@ -1442,11 +1236,11 @@ "placeholder": "​", "style": "IPY_MODEL_69e8996566864a36a95d0dcc29a12287", "value": " 436k/436k [00:00<00:00, 2.34MB/s]" - }, - "model_module_version": "1.5.0" + } }, "822157e16ddc4ca0a807e2989b919f75": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], @@ -1466,11 +1260,11 @@ "orientation": "horizontal", "style": "IPY_MODEL_6b7af6bc6c274a15b656df2cca539ecd", "value": 29 - }, - "model_module_version": "1.5.0" + } }, "83ffae5117354f49a5f36214e89e3d87": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1481,11 +1275,11 @@ "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" - }, - "model_module_version": "1.5.0" + } }, "89f2bf742ca34444bc4d2c9725e422b1": 
{ "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1497,11 +1291,11 @@ "_view_name": "StyleView", "bar_color": null, "description_width": "initial" - }, - "model_module_version": "1.5.0" + } }, "8efde65358a74240935b37e26c1dc41d": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1513,11 +1307,11 @@ "_view_name": "StyleView", "bar_color": null, "description_width": "initial" - }, - "model_module_version": "1.5.0" + } }, "90950ba90d76416781ba4ac2924c41d1": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1565,11 +1359,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "9b6b3220772c41d89ed83ebd814595ff": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], @@ -1586,11 +1380,11 @@ "placeholder": "​", "style": "IPY_MODEL_7b67b9505cdb4842b5dcb48d34e893bc", "value": " 213k/213k [00:20<00:00, 10.4kB/s]" - }, - "model_module_version": "1.5.0" + } }, "9bdfd260e6d245fcbb0f78c18e989fd3": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], @@ -1607,11 +1401,11 @@ "IPY_MODEL_7c3383c3141340e7baf208fc094f8a71" ], "layout": "IPY_MODEL_d0f4c1a9898246f3a2ce6e3496533c89" - }, - "model_module_version": "1.5.0" + } }, "a4034d6c83cc434b976d3bf0c22bb714": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1622,11 +1416,11 @@ "_view_module_version": "1.2.0", "_view_name": 
"StyleView", "description_width": "" - }, - "model_module_version": "1.5.0" + } }, "b362860014a945f4a43c81841ce0cbcf": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1674,11 +1468,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "b50ad654c51b4c599bf41cf7e8837f1d": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1726,11 +1520,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "b6a2b1a6995d47eb80eaf70142745796": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1742,11 +1536,11 @@ "_view_name": "StyleView", "bar_color": null, "description_width": "initial" - }, - "model_module_version": "1.5.0" + } }, "b721d77e3c0b488b886835a85286f9a7": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1794,11 +1588,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "bfd38a985b8d460ca1ce131b5c84a4ea": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1810,11 +1604,11 @@ "_view_name": "StyleView", "bar_color": null, "description_width": "initial" - }, - "model_module_version": "1.5.0" + } }, "d0f4c1a9898246f3a2ce6e3496533c89": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1862,11 +1656,11 @@ "top": null, "visibility": null, "width": 
null - }, - "model_module_version": "1.2.0" + } }, "d4c4cc8fa224474bb0147e6b51ffa034": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1914,11 +1708,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "d7c23f698b414f8588c93685ee464f16": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1966,11 +1760,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "e763035ee50d4f0db40f449a1877349b": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], @@ -1987,11 +1781,11 @@ "placeholder": "​", "style": "IPY_MODEL_83ffae5117354f49a5f36214e89e3d87", "value": " 527M/527M [00:12<00:00, 40.7MB/s]" - }, - "model_module_version": "1.5.0" + } }, "e9df5614f1044db6a69c5a453b7f0d88": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], @@ -2011,11 +1805,11 @@ "orientation": "horizontal", "style": "IPY_MODEL_b6a2b1a6995d47eb80eaf70142745796", "value": 526681800 - }, - "model_module_version": "1.5.0" + } }, "ebe7b573fb74455eb0703865afa2c896": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], @@ -2032,11 +1826,11 @@ "placeholder": "​", "style": "IPY_MODEL_ec1719bf857540e7b17dd190fdda2e61", "value": " 29.0/29.0 [00:00<00:00, 36.0B/s]" - }, - "model_module_version": "1.5.0" + } }, "ec1719bf857540e7b17dd190fdda2e61": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -2047,8 
+1841,7 @@ "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" - }, - "model_module_version": "1.5.0" + } } } } diff --git a/examples/python/transformers/HuggingFace in Spark NLP - BERT.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - BERT.ipynb index dd6a7611772145..bad6cfcf73a803 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - BERT.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - BERT.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BERT.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BERT.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import BERT models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -25,9 +24,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -35,9 +32,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. 
This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." @@ -45,25 +40,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 50264, - "status": "ok", - "timestamp": 1622377416683, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "hHXgqiWpMfCY", - "outputId": "20c8fbd9-caf2-4810-aa7b-b233bea166fd" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0" @@ -71,9 +49,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [bert-base-cased](https://huggingface.co/bert-base-cased) model from HuggingFace as an example\n", @@ -83,24 +59,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 36261, - "status": "ok", - "timestamp": 1622378724253, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ZaiirlSKNhVD", - "outputId": "39a20147-83a6-4877-a6f2-0ccab49518b6" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import TFBertModel, BertTokenizer \n", @@ -137,34 +96,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 207, - "status": "ok", - "timestamp": 1622378877133, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "p2XCole7TTef", - "outputId": "8ea9c871-7096-40f2-df38-3bcc7fe7cd07" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -172,7 +112,7 @@ "text": [ "total 846704\n", "-rw-r--r-- 1 maziyar staff 628 Dec 13 15:57 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 13 15:57 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 13 15:57 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 433508328 Dec 13 15:57 tf_model.h5\n" ] } @@ -183,36 +123,19 @@ }, { "cell_type": "code", - 
"execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 218, - "status": "ok", - "timestamp": 1622378887608, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "r0DOGz8VUR-r", - "outputId": "1b6e7e04-3655-49f9-ed2d-25a6928f19bf" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 17584\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 13 15:57 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 13 15:57 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 54 Dec 13 15:57 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 165091 Dec 13 15:57 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 8827430 Dec 13 15:57 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 13 15:57 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 13 15:57 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -222,25 +145,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 185, - "status": "ok", - "timestamp": 1622378898534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "0c1f5eef-c75a-4f7d-ce44-0cc9b99a1095" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -259,9 +165,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `vocab.txt` 
from the tokenizer\n", @@ -270,10 +174,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!cp {MODEL_NAME}_tokenizer/vocab.txt {MODEL_NAME}/saved_model/1/assets" @@ -281,18 +183,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save BERT in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -300,10 +198,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "8tpW5nkMc53m" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -320,19 +216,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -342,9 +234,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `BertEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `BertEmbeddings` in runtime, so don't worry what you are setting them now\n", @@ -357,10 +247,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -378,9 +266,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- 
Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] @@ -388,9 +274,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "metadata": {}, "outputs": [], "source": [ "bert.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -398,9 +282,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] @@ -408,9 +290,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -418,9 +298,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -430,24 +308,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 515, - "status": "ok", - "timestamp": 1622379928489, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ogpxSWxOXj3W", - "outputId": "27c4c504-dee4-4acd-b1a7-c0bd64623130" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -466,9 +327,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BERT model 😊 " ] @@ -476,9 +335,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "metadata": {}, "outputs": [], "source": [ "bert_loaded = BertEmbeddings.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -490,25 +347,7 @@ { "cell_type": "code", "execution_count": null, - 
"metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "executionInfo": { - "elapsed": 13, - "status": "ok", - "timestamp": 1622380021828, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "pGRTNISyYlnO", - "outputId": "d1b7b9a5-d194-4fda-ba7c-173a163021ad" - }, + "metadata": {}, "outputs": [ { "data": { @@ -519,7 +358,7 @@ "'bert_base_cased'" ] }, - "execution_count": 23, + "execution_count": null, "metadata": { "tags": [] }, @@ -532,9 +371,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of BERT models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -547,7 +384,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "transformers", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -560,13 +397,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1 (default, Jan 8 2020, 16:15:59) \n[Clang 4.0.1 (tags/RELEASE_401/final)]" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/transformers/HuggingFace in Spark NLP - BertForQuestionAnswering.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - BertForQuestionAnswering.ipynb index 5009ff2b54b795..a7dc58b98ad80c 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - BertForQuestionAnswering.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - BertForQuestionAnswering.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - 
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForQuestionAnswering.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForQuestionAnswering.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import BertForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." 
@@ -48,24 +43,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 97075, - "status": "ok", - "timestamp": 1640696490534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3e56840b-f4e1-4391-ce82-3d8136e8990c" - }, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0" @@ -73,9 +51,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. We will use that to save it as TF `SavedModel`.\n", "- We'll use [deepset/bert-large-uncased-whole-word-masking-squad2](https://huggingface.co/deepset/bert-large-uncased-whole-word-masking-squad2) model from HuggingFace as an example\n", @@ -85,104 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 435, - "referenced_widgets": [ - "47dac9ef87fd4c5ca9a61d2cea256596", - "ce38947889204d1eb23c4a414d8e5208", - "2da64fb5519d420783cabae619f3b952", - "0784faf7b3784e2fb5856d8ca6248654", - "f2c8a9d039864796ad4495a3fc748b8a", - "4d41832a7c7f4ff6af11043759050846", - "97d4aab21aea4a30996a2399f7c58b1d", - "b0c3a334fc5c49f19a2911227190e18f", - "68e0a6c49a2d4fea8c81b8b1bfabfcd5", - "8fe11dbcbad6402ebb392316b90fbd4c", - "e6bfed8858df4404a958f9a0c5efdf61", - "b3cba7624d89414581b69a8804cdf5eb", - "6910684eaf584454b1b0b38da1851284", - "4771514aa5b44e5ea05f18aa6ef73008", - "1743adef69ba48b2a78e312121e1ff95", - "cf43d892dc5f45df80e87b77c378074e", - "19df597d10364f94b41991bfc4b0e039", - "1265068d2c4d4ff0b7ab480bd3fe2342", - 
"7ad895b923ad4fcfae33f38485d46690", - "f25af430b7c34f1b9cecb003aba253aa", - "a7d6155372a94ab185aa4d648603a677", - "1cca3cd83e4a48caa4ca67eb84e0d65c", - "85152c67f8424559a5b2334dce66b6c1", - "c03f7b608dbf416bb59626a47f4ec63e", - "a956903ad8194c4a9806f27ea0741773", - "5715e0c21cce4cee91a33e42beb48226", - "34ef44ce578847ca93e1e361ac6c6068", - "ffd12d9337cd4681afd51a74f77503f5", - "38e5d4d80eb1456e96fbaba2836e8030", - "5f4b9df77c6249c9874fb4cd7fc87962", - "d2ebd46bf924436cba4c7cdf8a666731", - "1fd718b370c8454bb4f63cd5d97e4649", - "beca0d66f4e94d8db677761102717623", - "7016f4970cbb46b99ee0b61f91529bc3", - "d04c456268b048ffbe3c00cccbf4390d", - "ebbbb05d599f451cb08a8dc6972a48bd", - "aa680bf2fba94b89819124d1764fd5fe", - "395fbcecbde042419bd7e0e99298b8a2", - "75812a9dedc343a9bacef9cb3ee1d8a0", - "69dc223e5de2449189995b7a116a0cc7", - "200aa3c11c1b4f2294935d5b91e844e3", - "f288ae4807364757b1f727e02c8d76b7", - "028bdbafc40e47c4bc7f1dda920630a7", - "c64ad3e7f7a9403f940367b8ffb4540e", - "cd1df8c0a9e64eab89d894ee0697f330", - "b601ce600b6b4b8a9d609487263f9d58", - "63d534091c114485a89af24ff0c3e574", - "c3c2541de6e34033b5298bd449c177ca", - "4bfda2c0b7fc4e96a7480c639ed2909b", - "983a3c073854484ca0c50ff238149ad7", - "10888dcf7383452e8e78475beed266de", - "edf6984a708b43b5ad25fb6b04f211a7", - "ac44ce9590df4690b1e1337eb5caf623", - "f3633266f7b84a8497936c2ef5b780fd", - "663cce4987904af48951a64093a47108", - "a3d2f9f8f9754f9b8134c52b7cfaca19", - "6637ecfad7594cac96e5bf703b6ab5da", - "0d3442a75c2b4a6082c9581ab0621592", - "86eadc1d973e4f6a9270fe934992d3f6", - "af52df20197b457882647e636171c83a", - "a6e2dfe0ca474d25b8f43506930a3798", - "a81ea939fe4d440cb6dcd2d87557579e", - "c0c856879cff4c29b8d45b0abfb94a22", - "0c8e5c545fa948b5bf26b7f3d2801dc1", - "118ef92501eb4c5f8c29323739516a1a", - "50ac811bc42b474d82eca728897dc596", - "b13f4e9eb777499ab6d5fc0ccaeac074", - "207abaeff8a94953a889804fc5e88b2d", - "6f13c00ef5f44adca80b0d5b9ce8c4d2", - "cae4eda19aed4598b3c97a3633c224d3", - 
"bf22edbb769d46abb23c352dc370f5ad", - "cf45db79df5241b1b579d765cd737953", - "0959fb1f18794a559ae6f1849a3eb5a9", - "620d95c4cdcd4f23ab17377da0485cf8", - "bdfbfe93e9cc4d878008d332f1c5860b", - "c2845632b7fb4b71b95b7eff29efb667", - "3b06e84b5b494bfd920ee661392967f5" - ] - }, - "executionInfo": { - "elapsed": 68690, - "status": "ok", - "timestamp": 1640696559216, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "2d42f5ad-db10-44de-b319-75a6309df876" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import TFBertForQuestionAnswering, BertTokenizer \n", @@ -218,34 +97,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1640696559217, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "441fca3b-ab35-4d49-d567-4da91e1ad528" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -253,7 +113,7 @@ "text": [ "total 2636416\n", "-rw-r--r-- 1 maziyar staff 743 Dec 13 19:01 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 13 18:59 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 13 18:59 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 1336926952 Dec 13 19:01 tf_model.h5\n" ] } @@ -264,36 +124,19 @@ }, { "cell_type": "code", - "execution_count": 3, - 
"metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 215, - "status": "ok", - "timestamp": 1640696559428, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "dad1fb58-d331-491f-a83d-ff002e88d079" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 35984\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 13 18:59 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 13 18:59 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 54 Dec 13 19:01 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 319364 Dec 13 19:01 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 18090076 Dec 13 19:01 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 13 19:01 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 13 19:01 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -303,25 +146,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 202, - "status": "ok", - "timestamp": 1640696559628, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "3b52acdf-5ecf-4582-9a6e-3ddc89bc487e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -340,9 +166,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `vocab.txt` from the tokenizer\n", @@ 
-351,10 +175,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -364,34 +186,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! We have our `vocab.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 234, - "status": "ok", - "timestamp": 1640696560064, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "db11e138-f83f-4a0d-cab5-6c4dc1eaa4d4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -408,18 +211,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save BertForQuestionAnswering in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -427,25 +226,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 44473, - "status": "ok", - "timestamp": 1640696604534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "b956466b-03d6-4f56-88d4-28f920a6d113" - }, + "execution_count": null, + "metadata": {}, 
"outputs": [ { "name": "stdout", @@ -462,19 +244,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -484,9 +262,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `BertForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `BertForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -497,10 +273,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -520,19 +294,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "spanClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -540,19 +310,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -560,9 +326,7 @@ }, { "cell_type": 
"markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -571,25 +335,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 2392, - "status": "ok", - "timestamp": 1640696670840, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "995582ac-5e30-46ed-baef-1ad8a5387f30" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -597,8 +344,8 @@ "text": [ "total 2647936\n", "-rw-r--r-- 1 maziyar staff 1354389475 Dec 13 19:02 bert_classification_tensorflow\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 13 19:01 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 13 19:01 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 13 19:01 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 13 19:01 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -608,34 +355,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BertForQuestionAnswering model in Spark NLP 🚀 pipeline! 
" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11346, - "status": "ok", - "timestamp": 1640696711994, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "b7ffe817-c5ad-41b3-85b6-ad04aef16e65" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -672,9 +400,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `BertForQuestionAnswering` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -700,13 +426,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - BertForSequenceClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - BertForSequenceClassification.ipynb index dcd25155c7d043..d0368bdeeeb5d0 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - BertForSequenceClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - BertForSequenceClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForSequenceClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForSequenceClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import BertForSequenceClassification models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." 
@@ -47,25 +42,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 97075, - "status": "ok", - "timestamp": 1640696490534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3e56840b-f4e1-4391-ce82-3d8136e8990c" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0" @@ -73,9 +51,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. We will use that to save it as TF `SavedModel`.\n", "- We'll use [finiteautomata/beto-sentiment-analysis](https://huggingface.co/finiteautomata/beto-sentiment-analysis) model from HuggingFace as an example\n", @@ -84,105 +60,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 435, - "referenced_widgets": [ - "47dac9ef87fd4c5ca9a61d2cea256596", - "ce38947889204d1eb23c4a414d8e5208", - "2da64fb5519d420783cabae619f3b952", - "0784faf7b3784e2fb5856d8ca6248654", - "f2c8a9d039864796ad4495a3fc748b8a", - "4d41832a7c7f4ff6af11043759050846", - "97d4aab21aea4a30996a2399f7c58b1d", - "b0c3a334fc5c49f19a2911227190e18f", - "68e0a6c49a2d4fea8c81b8b1bfabfcd5", - "8fe11dbcbad6402ebb392316b90fbd4c", - "e6bfed8858df4404a958f9a0c5efdf61", - "b3cba7624d89414581b69a8804cdf5eb", - "6910684eaf584454b1b0b38da1851284", - "4771514aa5b44e5ea05f18aa6ef73008", - "1743adef69ba48b2a78e312121e1ff95", - "cf43d892dc5f45df80e87b77c378074e", - "19df597d10364f94b41991bfc4b0e039", - "1265068d2c4d4ff0b7ab480bd3fe2342", - 
"7ad895b923ad4fcfae33f38485d46690", - "f25af430b7c34f1b9cecb003aba253aa", - "a7d6155372a94ab185aa4d648603a677", - "1cca3cd83e4a48caa4ca67eb84e0d65c", - "85152c67f8424559a5b2334dce66b6c1", - "c03f7b608dbf416bb59626a47f4ec63e", - "a956903ad8194c4a9806f27ea0741773", - "5715e0c21cce4cee91a33e42beb48226", - "34ef44ce578847ca93e1e361ac6c6068", - "ffd12d9337cd4681afd51a74f77503f5", - "38e5d4d80eb1456e96fbaba2836e8030", - "5f4b9df77c6249c9874fb4cd7fc87962", - "d2ebd46bf924436cba4c7cdf8a666731", - "1fd718b370c8454bb4f63cd5d97e4649", - "beca0d66f4e94d8db677761102717623", - "7016f4970cbb46b99ee0b61f91529bc3", - "d04c456268b048ffbe3c00cccbf4390d", - "ebbbb05d599f451cb08a8dc6972a48bd", - "aa680bf2fba94b89819124d1764fd5fe", - "395fbcecbde042419bd7e0e99298b8a2", - "75812a9dedc343a9bacef9cb3ee1d8a0", - "69dc223e5de2449189995b7a116a0cc7", - "200aa3c11c1b4f2294935d5b91e844e3", - "f288ae4807364757b1f727e02c8d76b7", - "028bdbafc40e47c4bc7f1dda920630a7", - "c64ad3e7f7a9403f940367b8ffb4540e", - "cd1df8c0a9e64eab89d894ee0697f330", - "b601ce600b6b4b8a9d609487263f9d58", - "63d534091c114485a89af24ff0c3e574", - "c3c2541de6e34033b5298bd449c177ca", - "4bfda2c0b7fc4e96a7480c639ed2909b", - "983a3c073854484ca0c50ff238149ad7", - "10888dcf7383452e8e78475beed266de", - "edf6984a708b43b5ad25fb6b04f211a7", - "ac44ce9590df4690b1e1337eb5caf623", - "f3633266f7b84a8497936c2ef5b780fd", - "663cce4987904af48951a64093a47108", - "a3d2f9f8f9754f9b8134c52b7cfaca19", - "6637ecfad7594cac96e5bf703b6ab5da", - "0d3442a75c2b4a6082c9581ab0621592", - "86eadc1d973e4f6a9270fe934992d3f6", - "af52df20197b457882647e636171c83a", - "a6e2dfe0ca474d25b8f43506930a3798", - "a81ea939fe4d440cb6dcd2d87557579e", - "c0c856879cff4c29b8d45b0abfb94a22", - "0c8e5c545fa948b5bf26b7f3d2801dc1", - "118ef92501eb4c5f8c29323739516a1a", - "50ac811bc42b474d82eca728897dc596", - "b13f4e9eb777499ab6d5fc0ccaeac074", - "207abaeff8a94953a889804fc5e88b2d", - "6f13c00ef5f44adca80b0d5b9ce8c4d2", - "cae4eda19aed4598b3c97a3633c224d3", - 
"bf22edbb769d46abb23c352dc370f5ad", - "cf45db79df5241b1b579d765cd737953", - "0959fb1f18794a559ae6f1849a3eb5a9", - "620d95c4cdcd4f23ab17377da0485cf8", - "bdfbfe93e9cc4d878008d332f1c5860b", - "c2845632b7fb4b71b95b7eff29efb667", - "3b06e84b5b494bfd920ee661392967f5" - ] - }, - "executionInfo": { - "elapsed": 68690, - "status": "ok", - "timestamp": 1640696559216, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "2d42f5ad-db10-44de-b319-75a6309df876" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -238,34 +117,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1640696559217, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "441fca3b-ab35-4d49-d567-4da91e1ad528" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -273,7 +133,7 @@ "text": [ "total 858824\n", "-rw-r--r-- 1 maziyar staff 873 Dec 14 10:34 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 14 10:34 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 14 10:34 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 439713052 Dec 14 10:34 tf_model.h5\n" ] } @@ -284,36 +144,19 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 215, - "status": "ok", - "timestamp": 1640696559428, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "dad1fb58-d331-491f-a83d-ff002e88d079" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 18400\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 14 10:34 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 14 10:34 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 54 Dec 14 10:34 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 167007 Dec 14 10:34 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 9245668 Dec 14 10:34 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 14 10:34 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 14 10:34 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -323,25 +166,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 202, - "status": "ok", - "timestamp": 1640696559628, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "3b52acdf-5ecf-4582-9a6e-3ddc89bc487e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -361,9 +187,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `vocab.txt` from the tokenizer\n", @@ -373,10 +197,8 @@ }, { "cell_type": "code", - 
"execution_count": 7, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -386,10 +208,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -403,34 +223,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! We have our `vocab.txt` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 234, - "status": "ok", - "timestamp": 1640696560064, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "db11e138-f83f-4a0d-cab5-6c4dc1eaa4d4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -448,18 +249,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save BertForSequenceClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -467,25 +264,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 44473, - "status": "ok", - "timestamp": 1640696604534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", 
- "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "b956466b-03d6-4f56-88d4-28f920a6d113" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -502,19 +282,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -524,9 +300,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `BertForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `BertForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -537,10 +311,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -558,19 +330,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -578,19 +346,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 
14, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -598,9 +362,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -609,25 +371,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 2392, - "status": "ok", - "timestamp": 1640696670840, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "995582ac-5e30-46ed-baef-1ad8a5387f30" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -635,8 +380,8 @@ "text": [ "total 876136\n", "-rw-r--r-- 1 maziyar staff 448581411 Dec 14 11:09 bert_classification_tensorflow\n", - "drwxr-xr-x 5 maziyar staff 160 Dec 14 11:09 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 14 11:09 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 5 maziyar staff 160 Dec 14 11:09 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 14 11:09 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -646,19 +391,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BertForSequenceClassification model 😊 " ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier_loaded = BertForSequenceClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -668,34 +409,15 @@ }, { "cell_type": 
"markdown", - "metadata": { - "id": "BDWNWdBlBpHi" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 5, - "status": "ok", - "timestamp": 1632137863887, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "pGRTNISyYlnO", - "outputId": "60046377-bfd4-4c5e-e392-f78841e6bfe8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -703,7 +425,7 @@ "['POS', 'NEG', 'NEU']" ] }, - "execution_count": 17, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -715,34 +437,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "UvRBsP2SBpHi" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11346, - "status": "ok", - "timestamp": 1640696711994, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "b7ffe817-c5ad-41b3-85b6-ad04aef16e65" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -783,21 +486,10 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of `BertForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y30JdbS-JKWo" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -820,13 +512,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - BertForTokenClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - BertForTokenClassification.ipynb index 6c67800dfa96d0..1bc19de4c5f162 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - BertForTokenClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - BertForTokenClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForTokenClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForTokenClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import BertForTokenClassification models from HuggingFace 🤗 into Spark 
NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." @@ -48,24 +43,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 89517, - "status": "ok", - "timestamp": 1640696301858, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "95bb1068-cdd6-4211-9fb9-ca90df0a0399" - }, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0" @@ -73,9 +51,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [dslim/bert-base-NER](https://huggingface.co/dslim/bert-base-NER) model from HuggingFace as an example\n", @@ -85,93 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 385, - "referenced_widgets": [ - "430bb1b9125048f3930739ecbc7a1e29", - "9d04328e127442c395f5bcc30569272b", - "b3830d5595ce448e898544afa3dc5f3d", - "7baa295f947e4b81850a967e27e7b54d", - "eaf4027fb3a0426e88440ed4a4374ec5", - "de54ce213e0c4f5c8a4925015746726f", - "b3dab46a496647868bfe41713c739795", - "38f390fa8ef34e449720f01b0f5f1843", - "16256a89e804451bae57a4217a1c687c", - "27f9c970297542f1bb4b89aba3c9f923", - "64ca5437df0c46faa95caedc7f8661e8", - "1f8cba4a8a134662b09a9d136647902f", - "085b4ca6809c42f3a9a0553cfe45daf0", - "51ced34a77524defa279f8d235d15704", - "96aafbd204804fc58545abb3232673a3", - "992132c649e54d0899223fc19274cad9", - "80fab83ea3834b888c44167b3d37d86d", - "877ba010ccc8468a9560351cce304687", - "3589316fe3ac403e88c48138c1f071b2", - "ad2d5095596342a58adeedd3f280fb78", - "fb1ff80355b644eeb18f9556047e829e", - "a8feb9b4a13f4509b902b7d986a3cc46", - "57ffaa9adc3740e68d5d548c48c9525d", - "e93eb440fa3c493690705b5c8a53f3c3", - "c50dae8b41624f9caf06562995e70976", - "68997a9d91024e95985813b7335214f3", - "c07d9c69379e4a2cbf0c3efc82d0f842", - "eba8ba787d8c4392bd236191946b9c14", - "516d5f47a24a43b09781a3a4a6d51f10", - "f28672174da14cbe85902acc4ee1fca9", - "7a119840352345409c08a3f0be628068", - "c269bce6e21f47e5b130441aacb2aede", - "dc102ed0f0b74b13a8cc7c83ddde6123", - "a6fc53c9cc6742af91177648cb3477aa", - "3c0279271c804aef85a0300ed943226c", - "f96946dcb7f94dde8e176a8b54017730", - "ee3c5c88b5ad45a683866373db3551e6", - "f29773a8c68b46f68b027c43f3da2b7f", - "ced31adebc88460885b69c4edcd16a9f", - "5fdb2c066adb4017b12fe1c3970cb395", - "426f4cad9b5345e5add7ffa352d3f16d", - "7c72aea845c841e0a94369aaf8971392", - "3c73121e251a480dba58fd32b798af81", - 
"5a9bd734318f47e7ad3975df0c505e84", - "01c3c6321068427c96e43a9437dff261", - "a8a89414bdb34065b2f90935aa2c4405", - "505d2a2a9f264f8e8800145aaf80d375", - "ed989cec1bc84f55a4f4ce1fb863dc33", - "bf215ce615914e17be45f8116addd3a1", - "c69af031582c493c9332a57735e8734c", - "44132cd42f384a7ca6d7aef1cc4cf5b0", - "d57b9c3b443b4f7e9e28e7c54ae9986d", - "d98958b33f0a43b99760c96776953244", - "19ca8edd4e6243c8a424db1ffc9c071e", - "e71ee7678361465ca6d94ed192dfa300", - "0aff9f570b9649a184b4e253a74b6d15", - "5a6cbd800caf4be181f5b14d69a9873d", - "731ce5db12dd4c958ef8fb95c51989ba", - "17efa98390ec4044ad630dca1e885f66", - "ee64aa9968b8459888519e3ca37dd1be", - "4a3e32f080ea4aa5b7b47ab9874966fd", - "6b44a379b35b4d5d960d960f18a422b1", - "dc970598af584b748695e67db8055b07", - "d4299865309e46e184bc6cc628884b6e", - "7486b7e6b6a746de98f433658976c1d7", - "8b3e5e02b18a426b9f68342921d20a5b" - ] - }, - "executionInfo": { - "elapsed": 62215, - "status": "ok", - "timestamp": 1640696364067, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "627c210a-8497-4a4d-fc52-304a55fade44" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import TFBertForTokenClassification, BertTokenizer \n", @@ -209,34 +99,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 17, - "status": "ok", - "timestamp": 1640696364068, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - 
"user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "7dea1ee6-a663-45e3-a769-d8294b2f5466" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -244,7 +115,7 @@ "text": [ "total 842160\n", "-rw-r--r-- 1 maziyar staff 999 Dec 14 20:18 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 14 20:18 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 14 20:18 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 431179756 Dec 14 20:18 tf_model.h5\n" ] } @@ -255,36 +126,19 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 235, - "status": "ok", - "timestamp": 1640696364300, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "ff96f553-5056-4a1e-c77b-7ccd7e0f580a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 18288\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 14 20:18 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 14 20:18 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 53 Dec 14 20:18 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 165837 Dec 14 20:18 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 9190201 Dec 14 20:18 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 14 20:18 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 14 20:18 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -294,25 +148,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 237, - "status": "ok", - "timestamp": 1640696364532, - "user": { - 
"displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "03346f6a-a400-4e09-d145-80c6a6ed0c6b" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -331,9 +168,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `vocab.txt` from the tokenizer\n", @@ -343,10 +178,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -356,10 +189,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -373,34 +204,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! 
We have our `vocab.txt` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 223, - "status": "ok", - "timestamp": 1640696365044, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "086c0864-e804-442f-99ad-212e025c94c5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -418,18 +230,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save BertForTokenClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -437,25 +245,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 43185, - "status": "ok", - "timestamp": 1640696408227, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "66a14c53-77ac-4ed7-91d0-7c5db847f7ae" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -472,19 +263,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + 
"metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -494,9 +281,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `BertForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `BertForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -507,10 +292,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -528,19 +311,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -548,19 +327,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "! 
rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -568,9 +343,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "zeQt3UFv3vVb" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -579,25 +352,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 816, - "status": "ok", - "timestamp": 1640696099014, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "d63fdbee-4240-4986-a263-eba7325f121f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -605,8 +361,8 @@ "text": [ "total 859392\n", "-rw-r--r-- 1 maziyar staff 440007186 Dec 14 20:19 bert_classification_tensorflow\n", - "drwxr-xr-x 5 maziyar staff 160 Dec 14 20:19 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 14 20:19 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 5 maziyar staff 160 Dec 14 20:19 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 14 20:19 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -616,19 +372,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BertForTokenClassification model 😊 " ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier_loaded = BertForTokenClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -638,28 +390,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of `BertForTokenClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "BDWNWdBlBpHi" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "pGRTNISyYlnO" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -667,7 +413,7 @@ "['B-LOC', 'I-ORG', 'I-MISC', 'I-LOC', 'I-PER', 'B-MISC', 'B-ORG', 'O', 'B-PER']" ] }, - "execution_count": 15, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -678,34 +424,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "UvRBsP2SBpHi" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 1937, - "status": "ok", - "timestamp": 1640696146085, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "7be9953a-4e14-4684-e5db-30c8880097c2" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -748,9 +475,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "pKfO-QonGPdQ" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of `BertForTokenClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -777,13 +502,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - CamemBERT.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - CamemBERT.ipynb index 61f8f885f85389..bc3c9ba5858e2d 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - CamemBERT.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - CamemBERT.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "lshuevA3Qv-N" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBERT.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBERT.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import CamemBERT models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -25,9 +24,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -35,9 +32,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": 
"yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -47,13 +42,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hHXgqiWpMfCY", - "outputId": "abac85a3-c938-45b4-97db-db978e1a2d38" - }, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -61,9 +50,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [camembert-base](https://huggingface.co/camembert-base) model from HuggingFace as an example\n", @@ -74,48 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 896, - "referenced_widgets": [ - "7271f65ac8c34370927812c6ebc26680", - "be4ae5e77eba4501b68dd4c168e75a70", - "a381d87b7e8c4664b725819cf9e40b5e", - "6da6c5fe9a4e4d86b91b8ba468a9b8fd", - "3a7d01e35a66472885c75e47118f2a7a", - "13ee7dbdd57f43d6a667b1e118fd7306", - "1b63d0cfa2164ce6959fe55bc3d53292", - "570e6b14d24c4bdb90ac3f6d50879280", - "80cd474ae43144e88275a8e0d25f3dad", - "eb76330eb6fd4a66a9d02d1f90447b35", - "690778e1619f40d681ae5346e9ca8f7b", - "19805c06fa8c4336b0d8d0fd04ed16d6", - "64b1edc02ded48109b0db3df4537e2dc", - "240adb86143a4080ae42e63ff4e1a851", - "ee7fa14eb12e4ebe9f8cc6c16edbba73", - "c1b239ba82554cc6b83a1e72c2df9811", - "664e5d3170fb40f78d4f4d044d6b152b", - "1fd84f303c5e4c7db7041c62c675278b", - "cb2daa67db4f42a89781b52f04dbf921", - "3c881124f6264bfe9ecc89c26354ebe9", - "f7c27a24a0ef4027ad58cc8a4663e091", - "4fd9efce28e249df983c39acac900d51", - "5980407785b1454ab0f7422c77ac5bfc", - "4550fa6e3e4545e49e3eb5ff05cc6e3e", - "e79a5512e1a3490494ac78742ec8fe09", - "1fc6028e0c1c4d3996606926b896b9d2", - "9ffab1dc0b364b4d8f52e9bcf6f320fc", - "fca45b67bfdc4d2ebed539985e91bdc3", - "a850b999845b4897ac5bea7349d88d31", - "8fbb65204a6d4b9893a5e87fdd1d1e76", - "53b235bce90b4e668713bf13baa70907", - "70c1f42b905647a49ce528d9289b82d9" - ] - }, - "id": "ZaiirlSKNhVD", - "outputId": "b3a68a21-512d-45f2-abbc-1aa4e88231a1" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import CamembertTokenizer, TFCamembertModel\n", @@ -152,23 +98,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - 
"execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "p2XCole7TTef", - "outputId": "dc44304c-a042-4230-854c-977024072d36" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -176,7 +114,7 @@ "text": [ "total 864768\n", "-rw-r--r-- 1 maziyar staff 667 Dec 14 20:25 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 14 20:25 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 14 20:25 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 442754752 Dec 14 20:25 tf_model.h5\n" ] } @@ -187,24 +125,18 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "r0DOGz8VUR-r", - "outputId": "d588934e-73c5-492c-dca1-f165ac6a5222" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 12976\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 14 20:25 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 14 20:25 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 143311 Dec 14 20:25 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 6497960 Dec 14 20:25 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 14 20:25 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 14 20:25 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -214,14 +146,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Mcm2UpNxUUQN", - "outputId": "b1c953b5-9550-4fdc-b07a-3c4399cee28d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -240,9 +166,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing 
`sentencepiece.bpe.model` file from the tokenizer\n", @@ -251,10 +175,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# let's copy sentencepiece.bpe.model file to saved_model/1/assets\n", @@ -263,18 +185,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save CamemBERT in Spark NLP" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -282,10 +200,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "8tpW5nkMc53m" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" @@ -293,9 +209,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] @@ -303,9 +217,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "xGXPlbLdBvbm" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -315,9 +227,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `CamemBertEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `CamemBertEmbeddings` in runtime, so don't worry what you are setting them now\n", @@ -331,9 +241,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "8W_almibVRTj" - }, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -351,19 +259,15 @@ }, { "cell_type": "markdown", - "metadata": { - 
"id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "camembert.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -371,19 +275,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "4W2m4JuVDM3D" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "CnUXH76ADSkL" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -391,9 +291,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -402,14 +300,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ogpxSWxOXj3W", - "outputId": "8d8fc13b-427e-44f1-bfe4-2705862f8730" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -418,8 +310,8 @@ "total 878288\n", "-rw-r--r-- 1 maziyar staff 810912 Dec 14 20:31 camembert_spp\n", "-rw-r--r-- 1 maziyar staff 448869922 Dec 14 20:31 camembert_tensorflow\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 14 20:31 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 14 20:31 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 3 maziyar staff 96 Dec 14 20:31 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 14 20:31 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -429,19 +321,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and 
shiny RoBERTa model 😊 " ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "camembert_loaded = CamemBertEmbeddings.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -452,15 +340,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "id": "pGRTNISyYlnO", - "outputId": "fc4d45f1-d870-408a-e16e-bbf6710bf33d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -468,7 +349,7 @@ "'camembert_base'" ] }, - "execution_count": 13, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -479,9 +360,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of CamemBERT models from HuggingFace 🤗 in Spark NLP 🚀" ] @@ -508,13 +387,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1 (default, Jan 8 2020, 16:15:59) \n[Clang 4.0.1 (tags/RELEASE_401/final)]" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - CamemBertForQuestionAnswering.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - CamemBertForQuestionAnswering.ipynb index 78e35a64588b85..af6723f1b41633 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - CamemBertForQuestionAnswering.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - CamemBertForQuestionAnswering.ipynb @@ -3,19 +3,17 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open 
In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBertForQuestionAnswering.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBertForQuestionAnswering.ipynb)" ] }, { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import CamemBertForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -29,9 +27,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -39,9 +35,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. 
This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -50,25 +44,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 97075, - "status": "ok", - "timestamp": 1640696490534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3e56840b-f4e1-4391-ce82-3d8136e8990c" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -77,9 +54,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [etalab-ia/camembert-base-squadFR-fquad-piaf](https://huggingface.co/etalab-ia/camembert-base-squadFR-fquad-piaf) model from HuggingFace as an example\n", @@ -88,105 +63,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 435, - "referenced_widgets": [ - "47dac9ef87fd4c5ca9a61d2cea256596", - "ce38947889204d1eb23c4a414d8e5208", - "2da64fb5519d420783cabae619f3b952", - "0784faf7b3784e2fb5856d8ca6248654", - "f2c8a9d039864796ad4495a3fc748b8a", - "4d41832a7c7f4ff6af11043759050846", - "97d4aab21aea4a30996a2399f7c58b1d", - "b0c3a334fc5c49f19a2911227190e18f", - "68e0a6c49a2d4fea8c81b8b1bfabfcd5", - "8fe11dbcbad6402ebb392316b90fbd4c", - "e6bfed8858df4404a958f9a0c5efdf61", - "b3cba7624d89414581b69a8804cdf5eb", - "6910684eaf584454b1b0b38da1851284", - "4771514aa5b44e5ea05f18aa6ef73008", - "1743adef69ba48b2a78e312121e1ff95", - "cf43d892dc5f45df80e87b77c378074e", - "19df597d10364f94b41991bfc4b0e039", - "1265068d2c4d4ff0b7ab480bd3fe2342", - "7ad895b923ad4fcfae33f38485d46690", - "f25af430b7c34f1b9cecb003aba253aa", - "a7d6155372a94ab185aa4d648603a677", - "1cca3cd83e4a48caa4ca67eb84e0d65c", - "85152c67f8424559a5b2334dce66b6c1", - "c03f7b608dbf416bb59626a47f4ec63e", - "a956903ad8194c4a9806f27ea0741773", - "5715e0c21cce4cee91a33e42beb48226", - "34ef44ce578847ca93e1e361ac6c6068", - "ffd12d9337cd4681afd51a74f77503f5", - "38e5d4d80eb1456e96fbaba2836e8030", - "5f4b9df77c6249c9874fb4cd7fc87962", - "d2ebd46bf924436cba4c7cdf8a666731", - "1fd718b370c8454bb4f63cd5d97e4649", - "beca0d66f4e94d8db677761102717623", - "7016f4970cbb46b99ee0b61f91529bc3", - "d04c456268b048ffbe3c00cccbf4390d", - "ebbbb05d599f451cb08a8dc6972a48bd", - "aa680bf2fba94b89819124d1764fd5fe", - "395fbcecbde042419bd7e0e99298b8a2", - "75812a9dedc343a9bacef9cb3ee1d8a0", - "69dc223e5de2449189995b7a116a0cc7", - "200aa3c11c1b4f2294935d5b91e844e3", - "f288ae4807364757b1f727e02c8d76b7", 
- "028bdbafc40e47c4bc7f1dda920630a7", - "c64ad3e7f7a9403f940367b8ffb4540e", - "cd1df8c0a9e64eab89d894ee0697f330", - "b601ce600b6b4b8a9d609487263f9d58", - "63d534091c114485a89af24ff0c3e574", - "c3c2541de6e34033b5298bd449c177ca", - "4bfda2c0b7fc4e96a7480c639ed2909b", - "983a3c073854484ca0c50ff238149ad7", - "10888dcf7383452e8e78475beed266de", - "edf6984a708b43b5ad25fb6b04f211a7", - "ac44ce9590df4690b1e1337eb5caf623", - "f3633266f7b84a8497936c2ef5b780fd", - "663cce4987904af48951a64093a47108", - "a3d2f9f8f9754f9b8134c52b7cfaca19", - "6637ecfad7594cac96e5bf703b6ab5da", - "0d3442a75c2b4a6082c9581ab0621592", - "86eadc1d973e4f6a9270fe934992d3f6", - "af52df20197b457882647e636171c83a", - "a6e2dfe0ca474d25b8f43506930a3798", - "a81ea939fe4d440cb6dcd2d87557579e", - "c0c856879cff4c29b8d45b0abfb94a22", - "0c8e5c545fa948b5bf26b7f3d2801dc1", - "118ef92501eb4c5f8c29323739516a1a", - "50ac811bc42b474d82eca728897dc596", - "b13f4e9eb777499ab6d5fc0ccaeac074", - "207abaeff8a94953a889804fc5e88b2d", - "6f13c00ef5f44adca80b0d5b9ce8c4d2", - "cae4eda19aed4598b3c97a3633c224d3", - "bf22edbb769d46abb23c352dc370f5ad", - "cf45db79df5241b1b579d765cd737953", - "0959fb1f18794a559ae6f1849a3eb5a9", - "620d95c4cdcd4f23ab17377da0485cf8", - "bdfbfe93e9cc4d878008d332f1c5860b", - "c2845632b7fb4b71b95b7eff29efb667", - "3b06e84b5b494bfd920ee661392967f5" - ] - }, - "executionInfo": { - "elapsed": 68690, - "status": "ok", - "timestamp": 1640696559216, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "2d42f5ad-db10-44de-b319-75a6309df876" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -315,34 +193,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing 
with:" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1640696559217, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "441fca3b-ab35-4d49-d567-4da91e1ad528" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -350,7 +209,7 @@ "text": [ "total 860208\n", "-rw-r--r-- 1 maziyar staff 717 Dec 25 00:49 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 25 00:49 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 25 00:49 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 440418536 Dec 25 00:49 tf_model.h5\n" ] } @@ -361,36 +220,19 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 215, - "status": "ok", - "timestamp": 1640696559428, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "dad1fb58-d331-491f-a83d-ff002e88d079" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 18808\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 25 00:49 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 25 00:49 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 54 Dec 25 00:49 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 165535 Dec 25 00:49 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 9454001 Dec 25 00:49 saved_model.pb\n", - "drwxr-xr-x 4 maziyar 
staff 128 Dec 25 00:49 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 25 00:49 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -400,25 +242,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 202, - "status": "ok", - "timestamp": 1640696559628, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "3b52acdf-5ecf-4582-9a6e-3ddc89bc487e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -437,9 +262,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `sentencepiece.bpe.model` from the tokenizer\n", @@ -448,10 +271,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -462,34 +283,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! 
We have our `sentencepiece.bpe.model` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 234, - "status": "ok", - "timestamp": 1640696560064, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "db11e138-f83f-4a0d-cab5-6c4dc1eaa4d4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -507,18 +309,14 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save CamemBertForQuestionAnswering in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -526,25 +324,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 44473, - "status": "ok", - "timestamp": 1640696604534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "b956466b-03d6-4f56-88d4-28f920a6d113" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -561,19 +342,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "cbNneAVCLU1y" - }, + 
"execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -584,9 +361,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `CamemBertForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `CamemBertForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -597,10 +372,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -618,19 +391,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "spanClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -638,19 +407,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -659,9 +424,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -670,33 +433,16 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 
2392, - "status": "ok", - "timestamp": 1640696670840, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "995582ac-5e30-46ed-baef-1ad8a5387f30" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 2198320\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:30 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 18:30 \u001B[34mmetadata\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:30 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 18:30 \u001b[34mmetadata\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 1119269627 Dec 15 18:31 xlm_roberta_classification_tensorflow\n", "-rw-r--r-- 1 maziyar staff 5069051 Dec 15 18:31 xlmroberta_spp\n" ] @@ -709,34 +455,15 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBertForQuestionAnswering model in Spark NLP 🚀 pipeline! 
" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11346, - "status": "ok", - "timestamp": 1640696711994, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "b7ffe817-c5ad-41b3-85b6-ad04aef16e65" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -783,9 +510,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `CamemBertForQuestionAnswering` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -811,13 +536,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - CamemBertForSequenceClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - CamemBertForSequenceClassification.ipynb index 93201e86458117..f50d0f94d03793 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - CamemBertForSequenceClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - CamemBertForSequenceClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBertForSequenceClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBertForSequenceClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import CamemBertForSequenceClassification models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. 
This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -48,25 +43,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 95771, - "status": "ok", - "timestamp": 1640707909485, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3134cc48-78bc-4e03-a79f-748292f7d0a1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -74,9 +52,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [tblard/tf-allocine](https://huggingface.co/tblard/tf-allocine) model from HuggingFace as an example\n", @@ -85,21 +61,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "executionInfo": { - "elapsed": 352, - "status": "ok", - "timestamp": 1640708841457, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -170,34 +133,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 494, - "status": "ok", - "timestamp": 1640708154100, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "7bd16979-4e59-4f6e-d685-4b0f882b5bcc" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -205,7 +149,7 @@ "text": [ "total 864824\n", "-rw-r--r-- 1 maziyar staff 833 Dec 15 10:17 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 10:17 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 10:17 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 442783552 Dec 15 10:17 tf_model.h5\n" ] } @@ -216,36 +160,19 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 511, - 
"status": "ok", - "timestamp": 1640708154608, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "49b86052-ec5c-4a97-959d-c2aa5c3b8df5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 18928\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 10:17 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 10:17 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 55 Dec 15 10:17 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 167585 Dec 15 10:17 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 9518269 Dec 15 10:17 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 10:17 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 10:17 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -255,25 +182,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 8, - "status": "ok", - "timestamp": 1640708154609, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "5068af51-5a09-4a60-866b-96b4f4bdd083" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -292,9 +202,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `sentencepiece.bpe.model` file from the tokenizer\n", @@ -304,10 +212,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": 
"ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -318,10 +224,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -335,34 +239,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! We have our `vocab.txt` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 228, - "status": "ok", - "timestamp": 1640708155273, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "89764651-6a64-4b11-aaaa-f031a4284e1a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -380,18 +265,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save CamemBertForSequenceClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -400,24 +281,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 7553, - "status": "ok", - "timestamp": 1640708780913, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - 
"user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "2677b2fd-477a-4530-c98b-a8a1ccbd2baa" - }, + "metadata": {}, "outputs": [], "source": [ "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" @@ -425,9 +289,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] @@ -435,20 +297,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "executionInfo": { - "elapsed": 33750, - "status": "ok", - "timestamp": 1640708814657, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "cbNneAVCLU1y" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -458,9 +307,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `CamemBertForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `CamemBertForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -472,20 +319,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "executionInfo": { - "elapsed": 2, - "status": "ok", - "timestamp": 1640708858933, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8W_almibVRTj" - }, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -501,9 +335,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk 
so it is easier to be moved around and also be used later via `.load` function" ] @@ -511,9 +343,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -521,9 +351,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] @@ -531,9 +359,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "metadata": {}, "outputs": [], "source": [ "! rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -541,9 +367,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -553,24 +377,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 16, - "status": "ok", - "timestamp": 1640708814658, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "7fc4e69f-3ab2-4ddc-a3b0-6de95f018c91" - }, + "metadata": {}, "outputs": [], "source": [ "! 
ls -l {MODEL_NAME}_spark_nlp" @@ -578,9 +385,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBertForTokenClassification model 😊 " ] @@ -588,20 +393,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "executionInfo": { - "elapsed": 88864, - "status": "ok", - "timestamp": 1640708950792, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "1mm3CvkwYRgs" - }, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier_loaded = CamemBertForSequenceClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -611,9 +403,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "BDWNWdBlBpHi" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] @@ -621,9 +411,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "pGRTNISyYlnO" - }, + "metadata": {}, "outputs": [], "source": [ "# .getClasses was introduced in spark-nlp==3.4.0\n", @@ -632,9 +420,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "UvRBsP2SBpHi" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] @@ -642,24 +428,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 15729, - "status": "ok", - "timestamp": 1640708966516, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": 
"c13a1827-770f-48a6-bba6-eda25077f8ef" - }, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler() \\\n", @@ -687,9 +456,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `CamemBertForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -715,13 +482,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/transformers/HuggingFace in Spark NLP - CamemBertForTokenClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - CamemBertForTokenClassification.ipynb index 003039e1efbcbf..cad6d91003ffd8 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - CamemBertForTokenClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - CamemBertForTokenClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBertForTokenClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBertForTokenClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": 
"Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import CamemBertForTokenClassification models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -49,24 +44,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 95771, - "status": "ok", - "timestamp": 1640707909485, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3134cc48-78bc-4e03-a79f-748292f7d0a1" - }, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -74,9 +52,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [Jean-Baptiste/camembert-ner](https://huggingface.co/Jean-Baptiste/camembert-ner) model from HuggingFace as an example\n", @@ -86,20 +62,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "executionInfo": { - "elapsed": 352, - "status": "ok", - "timestamp": 1640708841457, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import TFCamembertForTokenClassification, CamembertTokenizer \n", @@ -136,34 +99,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 494, - "status": "ok", - "timestamp": 1640708154100, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "7bd16979-4e59-4f6e-d685-4b0f882b5bcc" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -171,7 +115,7 @@ "text": [ "total 889880\n", "-rw-r--r-- 1 maziyar staff 936 Dec 14 21:01 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 14 21:01 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 14 21:01 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 440428624 Dec 14 21:01 tf_model.h5\n" ] } @@ -182,36 +126,19 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 511, - "status": "ok", - "timestamp": 1640708154608, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "49b86052-ec5c-4a97-959d-c2aa5c3b8df5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 18808\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 14 21:01 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 14 21:01 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 56 Dec 14 21:01 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 166277 Dec 14 21:01 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 9456539 Dec 14 21:01 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 14 21:01 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 14 21:01 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -221,25 +148,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 8, - "status": "ok", - "timestamp": 1640708154609, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "5068af51-5a09-4a60-866b-96b4f4bdd083" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -258,9 +168,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `sentencepiece.bpe.model` file from the tokenizer\n", @@ -270,10 +178,8 @@ }, { 
"cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -284,10 +190,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -301,34 +205,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! We have our `vocab.txt` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 228, - "status": "ok", - "timestamp": 1640708155273, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "89764651-6a64-4b11-aaaa-f031a4284e1a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -346,18 +231,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save CamemBertForTokenClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -365,25 +246,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 7553, - "status": "ok", - "timestamp": 1640708780913, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": 
"https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "2677b2fd-477a-4530-c98b-a8a1ccbd2baa" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -400,30 +264,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "executionInfo": { - "elapsed": 33750, - "status": "ok", - "timestamp": 1640708814657, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -433,9 +282,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `CamemBertForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `CamemBertForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -446,21 +293,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "executionInfo": { - "elapsed": 2, - "status": "ok", - "timestamp": 1640708858933, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -476,19 
+310,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -496,19 +326,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "! rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -516,9 +342,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -527,25 +351,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 16, - "status": "ok", - "timestamp": 1640708814658, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "7fc4e69f-3ab2-4ddc-a3b0-6de95f018c91" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -554,8 +361,8 @@ "total 879536\n", "-rw-r--r-- 1 maziyar staff 449510592 Dec 14 21:03 camembert_classification_tensorflow\n", "-rw-r--r-- 1 maziyar staff 810912 Dec 14 21:03 camembert_spp\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 14 21:03 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 14 21:03 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar 
staff 128 Dec 14 21:03 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 14 21:03 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -565,30 +372,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBertForTokenClassification model 😊 " ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "executionInfo": { - "elapsed": 88864, - "status": "ok", - "timestamp": 1640708950792, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier_loaded = CamemBertForTokenClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -598,19 +390,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "BDWNWdBlBpHi" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "pGRTNISyYlnO" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -618,7 +406,7 @@ "['I-ORG', 'I-MISC', 'I-LOC', 'I-PER', 'O']" ] }, - "execution_count": 15, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -630,9 +418,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "UvRBsP2SBpHi" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] @@ -640,24 +426,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 15729, - "status": "ok", - "timestamp": 1640708966516, - 
"user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "c13a1827-770f-48a6-bba6-eda25077f8ef" - }, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", @@ -687,9 +456,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `CamemBertForTokenClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -715,13 +482,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1 (default, Jan 8 2020, 16:15:59) \n[Clang 4.0.1 (tags/RELEASE_401/final)]" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/transformers/HuggingFace in Spark NLP - DeBERTa.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - DeBERTa.ipynb index b5e9f1486ac8e8..171aef4302074a 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - DeBERTa.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - DeBERTa.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "lshuevA3Qv-N" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DeBERTa.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DeBERTa.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import DeBERTa models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -25,9 +24,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -35,9 +32,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -46,14 +41,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hHXgqiWpMfCY", - "outputId": "c169434e-4452-4658-f6b1-dc888b7a48a9" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -61,9 +50,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [microsoft/deberta-v3-xsmall](https://huggingface.co/microsoft/deberta-v3-xsmall) model from HuggingFace as an example\n", @@ -74,48 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 896, - "referenced_widgets": [ - "f1585b4c4f9f466f8730bce74110b248", - "b840f00b333849afa95375cab548e3b1", - "9ea0caac05284ef79323d5c88deb8d8c", - "6d5db72c01c043089b58fac2b64f3124", - "721903bf15304fcab9f4504e8dda9be3", - "c2aa3877c4d0409d889a7c486158f4a5", - "1ed9068ee045430a8ace1bf18df1a266", - "d79a00e434ff43f7942c34a117879d43", - "7ab0b2bcd5a34c8ea3bab32d8258bd64", - "ce771a6d9b64470aa5e2a500a8df47d3", - "a651c5e6c85242ff889601522ae76d42", - "6a22788a5471416c999b12e5e05abf33", - "46ee38864bae48feb3692578c266ee3f", - "db69f161d69640f8b3e24e6d9f9dbcf0", - "e912db9cdb8b4b5cbeecb955631478e1", - "4dbbca5cb9654d489a26989ae5a71de0", - "3addd8d30dbd4b3aaef0636b22164391", - "3fb78e1dafe44c4da48b17ff3dc0781d", - "c2e86bc083c242f994e797046d17e1e6", - "095c48e984894f38875fa02a2beae17b", - "95ad63ec7a2a4a62a7546dd1796a9f17", - "51f46f77ef6b4176a6b5e57b99dbfada", - "ce1f37f419f94d1c97eee5e5108833d3", - "d952452b82284cd79f588896b0018994", - "e5a44efb86404928b5922390c7a9a364", - "67739382be014038a3a877ccf7c916e0", - "c22f9ad149fb4a419a8c6423ea5f1ef9", - "60df2a80bfdb475884ea64f21c856bc6", - "cbdfa3aa241746a3974aa2e1687a7b55", - "2255bfc07a9e42eeab515ab6f7941213", - "d97a433c437c4a2584e2bbb1d49d5d6e", - "7cd30dde04874a0486c4a465e1805eb5" - ] - }, - "id": "ZaiirlSKNhVD", - "outputId": "c698963f-f62c-40c7-908c-3425b909e928" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import DebertaV2Tokenizer, TFDebertaV2Model\n", @@ -152,23 +98,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, 
{ "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "p2XCole7TTef", - "outputId": "696d74f0-6571-49a4-ca36-3b62d420af0c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -176,7 +114,7 @@ "text": [ "total 552760\n", "-rw-r--r-- 1 maziyar staff 833 Dec 15 14:31 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 14:31 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 14:31 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 283007184 Dec 15 14:31 tf_model.h5\n" ] } @@ -187,25 +125,19 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "r0DOGz8VUR-r", - "outputId": "ffe29d99-16b2-4ca1-9818-c210eb20ebbe" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 47880\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 14:31 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 14:31 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 55 Dec 15 14:31 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 176375 Dec 15 14:31 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 24328024 Dec 15 14:31 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 14:31 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 14:31 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -215,14 +147,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Mcm2UpNxUUQN", - "outputId": "e1d148ce-eced-48e9-80dc-d2332116a30d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -242,9 +168,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need 
the SavedModel from `saved_model/1/` path\n", "- we also be needing `spm.model` file from the tokenizer\n", @@ -253,10 +177,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# let's copy spm.model file to saved_model/1/assets\n", @@ -265,18 +187,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save DeBERTa in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -284,14 +202,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8tpW5nkMc53m", - "outputId": "bef364b4-86d8-4a6c-b552-0e332f5aecf7" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -308,19 +220,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "xGXPlbLdBvbm" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -330,9 +238,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `DeBertaEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `DeBertaEmbeddings` in runtime, so don't worry what you are setting them now\n", @@ -345,10 +251,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, 
"outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -366,19 +270,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "deberta.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -386,19 +286,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "4W2m4JuVDM3D" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "CnUXH76ADSkL" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -406,9 +302,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -417,14 +311,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ogpxSWxOXj3W", - "outputId": "f1cab05f-a9af-4e65-e7ab-7bc8ef74f54a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -433,8 +321,8 @@ "total 604088\n", "-rw-r--r-- 1 maziyar staff 2464616 Dec 15 14:33 deberta_spp\n", "-rw-r--r-- 1 maziyar staff 306826917 Dec 15 14:33 deberta_tensorflow\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 14:33 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 14:33 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 3 maziyar staff 96 Dec 15 14:33 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 14:33 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -444,19 +332,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + 
"metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBERTa model 😊 " ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "deberta_loaded = DeBertaEmbeddings.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -468,14 +352,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "id": "pGRTNISyYlnO", - "outputId": "22ebf476-0d65-42a1-b0b2-9e203532c97f" - }, + "metadata": {}, "outputs": [], "source": [ "deberta_loaded.getStorageRef()" @@ -483,9 +360,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of DeBERTa models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -511,17 +386,11 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1 (default, Jan 8 2020, 16:15:59) \n[Clang 4.0.1 (tags/RELEASE_401/final)]" + "pygments_lexer": "ipython3" }, "nteract": { "version": "0.28.0" }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } - }, "widgets": { "application/vnd.jupyter.widget-state+json": { "095c48e984894f38875fa02a2beae17b": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - DeBertaForQuestionAnswering.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - DeBertaForQuestionAnswering.ipynb index 9f4d967e310ab3..4fda6428b576a7 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - DeBertaForQuestionAnswering.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - DeBertaForQuestionAnswering.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": 
{ - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DeBertaForQuestionAnswering.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DeBertaForQuestionAnswering.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import DeBertaForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. 
This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -49,24 +44,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 97075, - "status": "ok", - "timestamp": 1640696490534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3e56840b-f4e1-4391-ce82-3d8136e8990c" - }, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -74,9 +52,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [nbroad/deberta-v3-xsmall-squad2](https://huggingface.co/nbroad/deberta-v3-xsmall-squad2) model from HuggingFace as an example\n", @@ -86,104 +62,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 435, - "referenced_widgets": [ - "47dac9ef87fd4c5ca9a61d2cea256596", - "ce38947889204d1eb23c4a414d8e5208", - "2da64fb5519d420783cabae619f3b952", - "0784faf7b3784e2fb5856d8ca6248654", - "f2c8a9d039864796ad4495a3fc748b8a", - "4d41832a7c7f4ff6af11043759050846", - "97d4aab21aea4a30996a2399f7c58b1d", - "b0c3a334fc5c49f19a2911227190e18f", - "68e0a6c49a2d4fea8c81b8b1bfabfcd5", - "8fe11dbcbad6402ebb392316b90fbd4c", - "e6bfed8858df4404a958f9a0c5efdf61", - "b3cba7624d89414581b69a8804cdf5eb", - "6910684eaf584454b1b0b38da1851284", - "4771514aa5b44e5ea05f18aa6ef73008", - "1743adef69ba48b2a78e312121e1ff95", - "cf43d892dc5f45df80e87b77c378074e", - "19df597d10364f94b41991bfc4b0e039", - "1265068d2c4d4ff0b7ab480bd3fe2342", - "7ad895b923ad4fcfae33f38485d46690", - "f25af430b7c34f1b9cecb003aba253aa", - "a7d6155372a94ab185aa4d648603a677", - "1cca3cd83e4a48caa4ca67eb84e0d65c", - "85152c67f8424559a5b2334dce66b6c1", - "c03f7b608dbf416bb59626a47f4ec63e", - "a956903ad8194c4a9806f27ea0741773", - "5715e0c21cce4cee91a33e42beb48226", - "34ef44ce578847ca93e1e361ac6c6068", - "ffd12d9337cd4681afd51a74f77503f5", - "38e5d4d80eb1456e96fbaba2836e8030", - "5f4b9df77c6249c9874fb4cd7fc87962", - "d2ebd46bf924436cba4c7cdf8a666731", - "1fd718b370c8454bb4f63cd5d97e4649", - "beca0d66f4e94d8db677761102717623", - "7016f4970cbb46b99ee0b61f91529bc3", - "d04c456268b048ffbe3c00cccbf4390d", - "ebbbb05d599f451cb08a8dc6972a48bd", - "aa680bf2fba94b89819124d1764fd5fe", - "395fbcecbde042419bd7e0e99298b8a2", - "75812a9dedc343a9bacef9cb3ee1d8a0", - "69dc223e5de2449189995b7a116a0cc7", - "200aa3c11c1b4f2294935d5b91e844e3", - "f288ae4807364757b1f727e02c8d76b7", - 
"028bdbafc40e47c4bc7f1dda920630a7", - "c64ad3e7f7a9403f940367b8ffb4540e", - "cd1df8c0a9e64eab89d894ee0697f330", - "b601ce600b6b4b8a9d609487263f9d58", - "63d534091c114485a89af24ff0c3e574", - "c3c2541de6e34033b5298bd449c177ca", - "4bfda2c0b7fc4e96a7480c639ed2909b", - "983a3c073854484ca0c50ff238149ad7", - "10888dcf7383452e8e78475beed266de", - "edf6984a708b43b5ad25fb6b04f211a7", - "ac44ce9590df4690b1e1337eb5caf623", - "f3633266f7b84a8497936c2ef5b780fd", - "663cce4987904af48951a64093a47108", - "a3d2f9f8f9754f9b8134c52b7cfaca19", - "6637ecfad7594cac96e5bf703b6ab5da", - "0d3442a75c2b4a6082c9581ab0621592", - "86eadc1d973e4f6a9270fe934992d3f6", - "af52df20197b457882647e636171c83a", - "a6e2dfe0ca474d25b8f43506930a3798", - "a81ea939fe4d440cb6dcd2d87557579e", - "c0c856879cff4c29b8d45b0abfb94a22", - "0c8e5c545fa948b5bf26b7f3d2801dc1", - "118ef92501eb4c5f8c29323739516a1a", - "50ac811bc42b474d82eca728897dc596", - "b13f4e9eb777499ab6d5fc0ccaeac074", - "207abaeff8a94953a889804fc5e88b2d", - "6f13c00ef5f44adca80b0d5b9ce8c4d2", - "cae4eda19aed4598b3c97a3633c224d3", - "bf22edbb769d46abb23c352dc370f5ad", - "cf45db79df5241b1b579d765cd737953", - "0959fb1f18794a559ae6f1849a3eb5a9", - "620d95c4cdcd4f23ab17377da0485cf8", - "bdfbfe93e9cc4d878008d332f1c5860b", - "c2845632b7fb4b71b95b7eff29efb667", - "3b06e84b5b494bfd920ee661392967f5" - ] - }, - "executionInfo": { - "elapsed": 68690, - "status": "ok", - "timestamp": 1640696559216, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "2d42f5ad-db10-44de-b319-75a6309df876" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import TFDebertaV2ForQuestionAnswering, DebertaV2Tokenizer \n", @@ -217,34 +96,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside 
these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1640696559217, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "441fca3b-ab35-4d49-d567-4da91e1ad528" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -252,7 +112,7 @@ "text": [ "total 552808\n", "-rw-r--r-- 1 maziyar staff 880 Dec 15 14:38 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 14:38 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 14:38 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 283030336 Dec 15 14:38 tf_model.h5\n" ] } @@ -263,36 +123,19 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 215, - "status": "ok", - "timestamp": 1640696559428, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "dad1fb58-d331-491f-a83d-ff002e88d079" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 49384\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 14:38 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 14:38 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 54 Dec 15 14:38 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 177566 Dec 15 14:38 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 25097793 Dec 15 
14:38 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 14:38 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 14:38 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -302,25 +145,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 202, - "status": "ok", - "timestamp": 1640696559628, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "3b52acdf-5ecf-4582-9a6e-3ddc89bc487e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -340,9 +166,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `spm.model` from the tokenizer\n", @@ -351,10 +175,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -364,34 +186,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! 
We have our `spm.model` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 234, - "status": "ok", - "timestamp": 1640696560064, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "db11e138-f83f-4a0d-cab5-6c4dc1eaa4d4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -408,18 +211,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save DeBertaForQuestionAnswering in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -427,25 +226,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 44473, - "status": "ok", - "timestamp": 1640696604534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "b956466b-03d6-4f56-88d4-28f920a6d113" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -462,19 +244,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, 
"outputs": [], "source": [ "import sparknlp\n", @@ -484,9 +262,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `DeBertaForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `DeBertaForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -497,10 +273,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -518,19 +292,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "spanClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -538,19 +308,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -558,9 +324,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -569,25 +333,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 2392, - "status": "ok", - "timestamp": 1640696670840, - "user": { - "displayName": 
"Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "995582ac-5e30-46ed-baef-1ad8a5387f30" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -596,8 +343,8 @@ "total 605592\n", "-rw-r--r-- 1 maziyar staff 307593450 Dec 15 14:38 deberta_classification_tensorflow\n", "-rw-r--r-- 1 maziyar staff 2464616 Dec 15 14:38 deberta_spp\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 14:38 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 14:38 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 3 maziyar staff 96 Dec 15 14:38 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 14:38 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -607,34 +354,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny DeBertaForQuestionAnswering model in Spark NLP 🚀 pipeline! " ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11346, - "status": "ok", - "timestamp": 1640696711994, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "b7ffe817-c5ad-41b3-85b6-ad04aef16e65" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -673,9 +401,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of `DeBertaForQuestionAnswering` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -701,13 +427,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13 (default, Mar 28 2022, 06:16:26) \n[Clang 12.0.0 ]" - }, - "vscode": { - "interpreter": { - "hash": "87a0babc161b66fece47f6635e3fb5206fb435f3149fb144acff789d447c8452" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - DistilBERT.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - DistilBERT.ipynb index f958dd3b00f26f..aeedcfbee2c8b9 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - DistilBERT.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - DistilBERT.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "azqv8xKyQZ6g" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBERT.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBERT.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import DistilBERT models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -25,9 +24,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -35,9 +32,7 @@ { "attachments": 
{}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." @@ -45,10 +40,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "hHXgqiWpMfCY" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0" @@ -56,9 +49,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. We will use that to save it as TF `SavedModel`.\n", "- We'll use [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) model from HuggingFace as an example\n", @@ -68,24 +59,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 57472, - "status": "ok", - "timestamp": 1622474870082, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ZaiirlSKNhVD", - "outputId": "62cc50a2-9b49-491d-ab1e-ba20d1b127ee" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import DistilBertTokenizer, TFDistilBertModel\n", @@ -121,34 +95,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, 
{ "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 191, - "status": "ok", - "timestamp": 1622474465502, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "p2XCole7TTef", - "outputId": "cd8d5f2d-115c-41c6-a392-f31f585c075f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -156,7 +111,7 @@ "text": [ "total 518704\n", "-rw-r--r-- 1 maziyar staff 518 Dec 15 14:46 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 14:46 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 14:46 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 265571968 Dec 15 14:46 tf_model.h5\n" ] } @@ -167,36 +122,19 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 6, - "status": "ok", - "timestamp": 1622474469865, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "r0DOGz8VUR-r", - "outputId": "113ded2e-8c80-4bbe-f6db-6fd6797770fb" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 9472\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 14:46 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 14:46 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 55 Dec 15 14:46 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 77329 Dec 15 14:46 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 4764278 Dec 15 14:46 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 
15 14:46 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 14:46 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -206,25 +144,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 221, - "status": "ok", - "timestamp": 1622474472548, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "9779b7ce-9096-41c4-fa68-910a6cc6ab0b" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -243,9 +164,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `vocab.txt` from the tokenizer\n", @@ -254,10 +173,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!cp {MODEL_NAME}_tokenizer/vocab.txt {MODEL_NAME}/saved_model/1/assets" @@ -265,18 +182,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save DistilBERT in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -284,10 +197,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "8tpW5nkMc53m" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -304,19 +215,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark 
NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "xGXPlbLdBvbm" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -326,9 +233,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `DistilBertEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `DistilBertEmbeddings` in runtime, so don't worry what you are setting them now\n", @@ -341,10 +246,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -362,19 +265,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "distil_bert.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -382,19 +281,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "4W2m4JuVDM3D" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "CnUXH76ADSkL" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -402,9 +297,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -413,25 +306,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 1839, - "status": "ok", - "timestamp": 1622475225792, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ogpxSWxOXj3W", - "outputId": "c869eefd-6b74-48f1-a692-38868549c005" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -439,8 +315,8 @@ "text": [ "total 527720\n", "-rw-r--r-- 1 maziyar staff 270191794 Dec 15 14:53 distilbert_tensorflow\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 14:53 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 14:53 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 14:53 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 14:53 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -450,19 +326,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny DistilBERT model 😊 " ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "distilbert_loaded = DistilBertEmbeddings.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -473,26 +345,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "executionInfo": { - "elapsed": 24, - "status": "ok", - "timestamp": 1622475262800, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "pGRTNISyYlnO", - "outputId": 
"147dfcfe-319c-40f2-bbf1-57b45000e40e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -500,17 +354,9 @@ "'distilbert_base_uncased'" ] }, - "execution_count": 15, + "execution_count": null, "metadata": {}, "output_type": "execute_result" - }, - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001B[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." - ] } ], "source": [ @@ -519,9 +365,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of DistilBERT models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -539,7 +383,7 @@ ] }, "kernelspec": { - "display_name": "transformers", + "display_name": "sparknlp", "language": "python", "name": "python3" }, @@ -552,13 +396,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/transformers/HuggingFace in Spark NLP - DistilBertForQuestionAnswering.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - DistilBertForQuestionAnswering.ipynb index 2ef1f27f985c58..bb1f93ddaa915a 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - DistilBertForQuestionAnswering.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - DistilBertForQuestionAnswering.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForQuestionAnswering.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForQuestionAnswering.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import DistilBertForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." 
@@ -47,25 +42,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 97075, - "status": "ok", - "timestamp": 1640696490534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3e56840b-f4e1-4391-ce82-3d8136e8990c" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0" @@ -73,9 +51,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. We will use that to save it as TF `SavedModel`.\n", "- We'll use [distilbert-base-cased-distilled-squad](https://huggingface.co/distilbert-base-cased-distilled-squad) model from HuggingFace as an example\n", @@ -85,104 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 435, - "referenced_widgets": [ - "47dac9ef87fd4c5ca9a61d2cea256596", - "ce38947889204d1eb23c4a414d8e5208", - "2da64fb5519d420783cabae619f3b952", - "0784faf7b3784e2fb5856d8ca6248654", - "f2c8a9d039864796ad4495a3fc748b8a", - "4d41832a7c7f4ff6af11043759050846", - "97d4aab21aea4a30996a2399f7c58b1d", - "b0c3a334fc5c49f19a2911227190e18f", - "68e0a6c49a2d4fea8c81b8b1bfabfcd5", - "8fe11dbcbad6402ebb392316b90fbd4c", - "e6bfed8858df4404a958f9a0c5efdf61", - "b3cba7624d89414581b69a8804cdf5eb", - "6910684eaf584454b1b0b38da1851284", - "4771514aa5b44e5ea05f18aa6ef73008", - "1743adef69ba48b2a78e312121e1ff95", - "cf43d892dc5f45df80e87b77c378074e", - "19df597d10364f94b41991bfc4b0e039", - "1265068d2c4d4ff0b7ab480bd3fe2342", - 
"7ad895b923ad4fcfae33f38485d46690", - "f25af430b7c34f1b9cecb003aba253aa", - "a7d6155372a94ab185aa4d648603a677", - "1cca3cd83e4a48caa4ca67eb84e0d65c", - "85152c67f8424559a5b2334dce66b6c1", - "c03f7b608dbf416bb59626a47f4ec63e", - "a956903ad8194c4a9806f27ea0741773", - "5715e0c21cce4cee91a33e42beb48226", - "34ef44ce578847ca93e1e361ac6c6068", - "ffd12d9337cd4681afd51a74f77503f5", - "38e5d4d80eb1456e96fbaba2836e8030", - "5f4b9df77c6249c9874fb4cd7fc87962", - "d2ebd46bf924436cba4c7cdf8a666731", - "1fd718b370c8454bb4f63cd5d97e4649", - "beca0d66f4e94d8db677761102717623", - "7016f4970cbb46b99ee0b61f91529bc3", - "d04c456268b048ffbe3c00cccbf4390d", - "ebbbb05d599f451cb08a8dc6972a48bd", - "aa680bf2fba94b89819124d1764fd5fe", - "395fbcecbde042419bd7e0e99298b8a2", - "75812a9dedc343a9bacef9cb3ee1d8a0", - "69dc223e5de2449189995b7a116a0cc7", - "200aa3c11c1b4f2294935d5b91e844e3", - "f288ae4807364757b1f727e02c8d76b7", - "028bdbafc40e47c4bc7f1dda920630a7", - "c64ad3e7f7a9403f940367b8ffb4540e", - "cd1df8c0a9e64eab89d894ee0697f330", - "b601ce600b6b4b8a9d609487263f9d58", - "63d534091c114485a89af24ff0c3e574", - "c3c2541de6e34033b5298bd449c177ca", - "4bfda2c0b7fc4e96a7480c639ed2909b", - "983a3c073854484ca0c50ff238149ad7", - "10888dcf7383452e8e78475beed266de", - "edf6984a708b43b5ad25fb6b04f211a7", - "ac44ce9590df4690b1e1337eb5caf623", - "f3633266f7b84a8497936c2ef5b780fd", - "663cce4987904af48951a64093a47108", - "a3d2f9f8f9754f9b8134c52b7cfaca19", - "6637ecfad7594cac96e5bf703b6ab5da", - "0d3442a75c2b4a6082c9581ab0621592", - "86eadc1d973e4f6a9270fe934992d3f6", - "af52df20197b457882647e636171c83a", - "a6e2dfe0ca474d25b8f43506930a3798", - "a81ea939fe4d440cb6dcd2d87557579e", - "c0c856879cff4c29b8d45b0abfb94a22", - "0c8e5c545fa948b5bf26b7f3d2801dc1", - "118ef92501eb4c5f8c29323739516a1a", - "50ac811bc42b474d82eca728897dc596", - "b13f4e9eb777499ab6d5fc0ccaeac074", - "207abaeff8a94953a889804fc5e88b2d", - "6f13c00ef5f44adca80b0d5b9ce8c4d2", - "cae4eda19aed4598b3c97a3633c224d3", - 
"bf22edbb769d46abb23c352dc370f5ad", - "cf45db79df5241b1b579d765cd737953", - "0959fb1f18794a559ae6f1849a3eb5a9", - "620d95c4cdcd4f23ab17377da0485cf8", - "bdfbfe93e9cc4d878008d332f1c5860b", - "c2845632b7fb4b71b95b7eff29efb667", - "3b06e84b5b494bfd920ee661392967f5" - ] - }, - "executionInfo": { - "elapsed": 68690, - "status": "ok", - "timestamp": 1640696559216, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "2d42f5ad-db10-44de-b319-75a6309df876" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import TFDistilBertForQuestionAnswering, DistilBertTokenizer \n", @@ -215,34 +94,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1640696559217, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "441fca3b-ab35-4d49-d567-4da91e1ad528" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -250,7 +110,7 @@ "text": [ "total 509576\n", "-rw-r--r-- 1 maziyar staff 569 Dec 15 15:47 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 15:47 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 15:47 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 260895720 Dec 15 15:47 tf_model.h5\n" ] } @@ -261,36 +121,19 @@ }, { "cell_type": "code", - 
"execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 215, - "status": "ok", - "timestamp": 1640696559428, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "dad1fb58-d331-491f-a83d-ff002e88d079" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 9928\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 15:47 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 15:47 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 57 Dec 15 15:47 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 79098 Dec 15 15:47 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 4996317 Dec 15 15:47 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 15:47 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 15:47 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -300,25 +143,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 202, - "status": "ok", - "timestamp": 1640696559628, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "3b52acdf-5ecf-4582-9a6e-3ddc89bc487e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -337,9 +163,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `vocab.txt` from the 
tokenizer\n", @@ -348,10 +172,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -361,34 +183,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! We have our `vocab.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 234, - "status": "ok", - "timestamp": 1640696560064, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "db11e138-f83f-4a0d-cab5-6c4dc1eaa4d4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -405,18 +208,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save DistilBertForQuestionAnswering in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -424,25 +223,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 44473, - "status": "ok", - "timestamp": 1640696604534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "b956466b-03d6-4f56-88d4-28f920a6d113" - }, + "execution_count": null, + 
"metadata": {}, "outputs": [ { "name": "stdout", @@ -459,19 +241,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -481,9 +259,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `DistilBertForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `DistilBertForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -493,10 +269,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -514,19 +288,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "spanClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -534,19 +304,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -554,9 
+320,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -565,25 +329,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 2392, - "status": "ok", - "timestamp": 1640696670840, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "995582ac-5e30-46ed-baef-1ad8a5387f30" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -591,8 +338,8 @@ "text": [ "total 519016\n", "-rw-r--r-- 1 maziyar staff 265735555 Dec 15 15:48 distilbert_classification_tensorflow\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 15:48 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 15:48 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 15:48 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 15:48 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -602,34 +349,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny DistilBertForQuestionAnswering model in Spark NLP 🚀 pipeline! 
" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11346, - "status": "ok", - "timestamp": 1640696711994, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "b7ffe817-c5ad-41b3-85b6-ad04aef16e65" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -666,9 +394,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `DistilBertForQuestionAnswering` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -694,13 +420,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - DistilBertForSequenceClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - DistilBertForSequenceClassification.ipynb index 2a3e8dc55bd383..7f7cc1786e4245 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - DistilBertForSequenceClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - DistilBertForSequenceClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForSequenceClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForSequenceClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import DistilBertForSequenceClassification models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." 
@@ -47,25 +42,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 82800, - "status": "ok", - "timestamp": 1640699488847, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "4a4d1bf6-f539-443b-ae6d-d957671b4cd5" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0" @@ -73,9 +51,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. We will use that to save it as TF `SavedModel`.\n", "- We'll use [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) model from HuggingFace as an example\n", @@ -85,71 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 408, - "referenced_widgets": [ - "ebdbb9e88bee4bdcb1df65b54d6caba1", - "4f0433b513ce4e4890998036cf0af513", - "a4fcb7b27e594cb692f81933ef182961", - "2b2221dd83744d3cb86c3e93c500b7e8", - "d796b8bf433b4bdc906d658600850849", - "4985212f63b440a68c656759176fe1e0", - "cdc594fbc9ec4f7f87404b070c511b3c", - "51cb80c1a5164c66bcf20f426f7f3d40", - "fcca4ba0139544fc89a8fe6812ce64b6", - "0ef2ef2558964408a0b184cdc7dd3a51", - "94d60e14a67a469b831e1b1c5514eac3", - "0118be42263540229b0159fdd239d856", - "db8b6d489eb045e9bab4d16e461bc1da", - "6476557662fe4eff8eb26ad6a4d9e44c", - "0ff01b1835954d668161e86bd2eb7644", - "ce28103f68b34d229fde30f142eab7cb", - "60ab9054da034c8d9f42d3f0b2e453ad", - "bcfa6720b32e4f62b00f5b9e4a0ec991", - 
"d17e77f7bf8e4c1dacb3331b813e2355", - "01e08ef43e6240babdd845f922bc674e", - "66d7eb776f3143618b413b8fd5105601", - "483d98eb32d84cd187ae1ae177e4935e", - "a84206f500a64d38971023c1543d84ab", - "d1e0b26e1a4c4c58b26d8f78ec03b3dc", - "322263bb29514f809fc7a9dacdf4ee92", - "d9432a7a0829496db7b570574663782f", - "38567e82761f4aee81baad572f89547a", - "364913c00ff94deba165e70419f9820f", - "a6dd43b3407c4dc197d813865ed42cc1", - "2576cb8630fc433aadd62652151bb5a0", - "a697f861f4304c7a9bd20e225290c55a", - "7e77e7f2d8fd45118c721b1e558163ca", - "08109482c5294dd89587c56ff7e9c090", - "ca30fa646a2b48fdb005c3f147d9bb9f", - "92235f111949479a806054b7be0ff479", - "c2dcac40aae8425cbd99a68f4a405141", - "112b01fe3760439498e0b674df2c4858", - "406d028e89d64f9f92de12ba14b14892", - "d83cd2413b2e4318928374569d381be8", - "194541e90ca44c62b6a8c6ec47272547", - "762b5b3ec8384ed0b2b6cb57fe318f23", - "877e170194cf4056aba0a776c06760ef", - "77bde4d656d545e2ac71f1d4dcd4e89b", - "e3c06b0d62f5475b84c9a2de1a7f319a" - ] - }, - "executionInfo": { - "elapsed": 38913, - "status": "ok", - "timestamp": 1640699527750, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "a98145c7-a39e-43c2-c455-ce5610c58627" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer \n", @@ -182,34 +94,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11, - "status": "ok", - "timestamp": 1640699527750, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": 
"https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "fd8aa6a1-a5dc-4728-c4be-ebfc7ab6ff96" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -217,7 +110,7 @@ "text": [ "total 523352\n", "-rw-r--r-- 1 maziyar staff 735 Dec 15 16:45 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 16:45 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 16:45 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 267951808 Dec 15 16:45 tf_model.h5\n" ] } @@ -228,36 +121,19 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 7, - "status": "ok", - "timestamp": 1640699527751, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "3e0779f1-1d7c-46a4-93e0-8c115f32e121" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 10000\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 16:45 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 16:45 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 53 Dec 15 16:45 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 80253 Dec 15 16:45 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 5032374 Dec 15 16:45 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 16:45 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 16:45 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -267,25 +143,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 441, - "status": "ok", - "timestamp": 1640699528188, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "ff891b27-2e35-4dbf-82b0-9c1f59e39c4f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -304,9 +163,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `vocab.txt` from the tokenizer\n", @@ -316,10 +173,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -329,10 +184,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -346,23 +199,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! 
We have our `vocab.txt` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OYnT5U8N9dxT", - "outputId": "697556a6-2cb2-4439-e37a-60bf34023efb" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -380,18 +225,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save DistilBertForSequenceClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -399,25 +240,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 39101, - "status": "ok", - "timestamp": 1640699567282, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "200d0e7f-acc4-4f2e-f3be-5d160565218c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -434,19 +258,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -456,9 +276,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `DistilBertForSequenceClassification` which allows us to load 
TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `DistilBertForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -469,10 +287,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -490,19 +306,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -510,19 +322,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -530,9 +338,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "5xD9rRK42S4i" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -541,25 +347,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 17, - "status": "ok", - "timestamp": 1640699619610, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": 
"60ae0b5c-e65f-4e6a-9bd3-c332fce9fc30" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -567,8 +356,8 @@ "text": [ "total 532864\n", "-rw-r--r-- 1 maziyar staff 272823081 Dec 15 16:46 distilbert_classification_tensorflow\n", - "drwxr-xr-x 5 maziyar staff 160 Dec 15 16:46 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 16:46 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 5 maziyar staff 160 Dec 15 16:46 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 16:46 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -578,19 +367,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BertForSequenceClassification model 😊 " ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier_loaded = DistilBertForSequenceClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -600,43 +385,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of `DistilBertForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "BDWNWdBlBpHi" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 5, - "status": "ok", - "timestamp": 1632137863887, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "pGRTNISyYlnO", - "outputId": "60046377-bfd4-4c5e-e392-f78841e6bfe8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -644,7 +408,7 @@ "['POSITIVE', 'NEGATIVE']" ] }, - "execution_count": 16, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -656,34 +420,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "UvRBsP2SBpHi" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 9672, - "status": "ok", - "timestamp": 1640700188622, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "1d96eaa8-f6f6-4a5f-b744-96c2d49377ec" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -726,21 +471,10 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "T0kOAJBKTyQb" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of `DistilBertForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀 " ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sC2N5zTy2bLk" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -763,13 +497,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - DistilBertForTokenClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - DistilBertForTokenClassification.ipynb index 8b07757cea50c3..0c5060185de4f8 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - DistilBertForTokenClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - DistilBertForTokenClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForTokenClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForTokenClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import 
DistilBertForTokenClassification models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." @@ -47,25 +42,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 90347, - "status": "ok", - "timestamp": 1640696872768, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "5417592c-58a8-4cf4-b134-1d3c962e5842" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0" @@ -73,9 +51,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [elastic/distilbert-base-cased-finetuned-conll03-english](https://huggingface.co/elastic/distilbert-base-cased-finetuned-conll03-english) model from HuggingFace as an example\n", @@ -85,82 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 492, - "referenced_widgets": [ - "d0a3893e3f2545269436a5fd84b4d17f", - "cbbb9fbb1d2841c7973a7b47d305a4a5", - "75c0467e13ed4feca35010cdaa0c8ab6", - "9f0928c7947f43369b5ef356d7ddda71", - "8b34187311574a8bb4ffbc6d41272fb3", - "d69b0bcd8adc4b79a498e51e79c8835f", - "bce4155ced9a48789ce985a9c296a098", - "984a1d1a7d7c47ca8c5ab4fbe7ebaf3f", - "4b52d7584f4a4788aa488da50c7af7a7", - "8cf6248c6fbc41c191604add924c2489", - "ad17ce03ed224aa9b3dd82acbef37283", - "2c50c60035b645bd8f78ac15f135d8a1", - "b35b1c0e3c6f448ba66fa22bd1c4f4dd", - "ac96aca5f8a043239a06b91eac444a5c", - "f2a9278d5bbf4495bf605e72ffd11978", - "0092e7836fc547c8854d857054ec1eba", - "7562ab03cf944c2fa666dc11b4c72cfe", - "eba6924af6b748da8a66623306d72ce7", - "864ba324ab4441d9bac5d7e7a47ea307", - "67ea3ce64466485a81e3742522bd4cd3", - "d5a08ba258524a1face1364d96380bce", - "e63d7cbe82264f6cad66a964f7305276", - "a08ec98d957f4222a221c7388f7f8742", - "fe2aa0b269764ae98871fedb48ce147d", - "5987cb268ca24ae0873056a44ace2a0c", - "5e5c08c0ebca468a81194a0547e114ee", - "6a8210ec9c4e4885ae20bd4aab9edccc", - "0c71d0c3bee143ab9c4556937cf1bcdf", - "546490d0772147febd5205e86f12d64e", - "9770dceb7c114fa08609972c3e09f25a", - "0b9974769f7f47e2bc52c9c1b98cfa38", - "7d5a1400f99c42d593e602fa7f4fd366", - "26a0af2288074eaf8f37302b37eca75f", - "8843a2e112104b8caae085f7db856433", - "47c073870ed845bc8556accd6b53103d", - "d94e21cc38d34011b9021e276ab0cc75", - "e315759a82224ec085689e66f00abe47", - "9dc476ed14ba473fb944f0d8a031b18d", - "ae7d29c03c1f4cf290a512d96ba03c99", - "ad0fe96e30b4463da2e653d30cf914ab", - "cdb40c130e88401db8f428fe1c45b6f8", - 
"cf36e7759a424986a6b0280113c7c13a", - "7ab11f4777e54b298368c5c0de49d4a8", - "66dc20746b5e46c88f34033fa63a4180", - "b0365023a6de46628ac53cf1a050349e", - "d2bea99ef10e45a082311c49b5f123f6", - "dfff3ee8c6d54bb3b33dd45535073742", - "143a86afbec74ee0b3e1a85ea7efaeb6", - "dc10d4b272be4bbbac2bc7c2b37ace3b", - "1be56d2ef7504930acf3e98b033be329", - "6227a8b4539d4d7581c0c511b4553dbe", - "5deb866b8db84aa29ef84a7288fc6a1a", - "69b0e0f08ffc4cfa924cd06449bb23b7", - "0739cc3378054a1ca429c3cad59f3ba4", - "5d9f7b72bd30469ea6484e4189f5b9a4" - ] - }, - "executionInfo": { - "elapsed": 41221, - "status": "ok", - "timestamp": 1640696913982, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "220d64e2-e491-497d-bed2-60722ff02787" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import TFDistilBertForTokenClassification, DistilBertTokenizer \n", @@ -197,34 +98,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 8, - "status": "ok", - "timestamp": 1640696913982, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "f5855a00-ef33-4946-99d6-676e54442f05" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -232,7 +114,7 @@ "text": [ "total 509616\n", "-rw-r--r-- 1 maziyar staff 960 Dec 15 16:59 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 
16:59 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 16:59 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 260918544 Dec 15 16:59 tf_model.h5\n" ] } @@ -243,36 +125,19 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 352, - "status": "ok", - "timestamp": 1640696914331, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "baf30e37-2d87-42f0-c49c-1b6c4276d677" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 9952\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 16:59 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 16:59 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 54 Dec 15 16:59 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 79680 Dec 15 16:59 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 5006359 Dec 15 16:59 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 16:59 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 16:59 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -282,25 +147,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 6, - "status": "ok", - "timestamp": 1640696914332, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "12d4a8b9-c3d7-426a-d3bf-b5e34ea96b2c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { 
"name": "stdout", @@ -319,9 +167,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `vocab.txt` from the tokenizer\n", @@ -331,10 +177,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -344,10 +188,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -361,34 +203,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! We have our `vocab.txt` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 7, - "status": "ok", - "timestamp": 1640696914579, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "0168f6bd-0eda-49e1-b400-e4c8a211fa6f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -406,18 +229,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save DistilBertForTokenClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -425,25 +244,8 @@ }, { "cell_type": "code", - 
"execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 41578, - "status": "ok", - "timestamp": 1640696956154, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "fb6db2f7-2681-4bfd-87d5-1180a0fd6b20" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -460,19 +262,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -482,9 +280,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `DistilBertForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `DistilBertForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -495,10 +291,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -516,19 +310,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + 
"metadata": {}, "outputs": [], "source": [ "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -536,19 +326,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -556,9 +342,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -567,25 +351,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 8, - "status": "ok", - "timestamp": 1640697002375, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "45f12c45-9d11-49ea-c6f2-3595de01eac5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -593,8 +360,8 @@ "text": [ "total 519080\n", "-rw-r--r-- 1 maziyar staff 265768509 Dec 15 17:00 distilbert_classification_tensorflow\n", - "drwxr-xr-x 5 maziyar staff 160 Dec 15 17:00 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:00 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 5 maziyar staff 160 Dec 15 17:00 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:00 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -604,19 +371,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny 
DistilBertForTokenClassification model 😊 " ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier_loaded = DistilBertForTokenClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -626,19 +389,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "BDWNWdBlBpHi" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "pGRTNISyYlnO" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -646,7 +405,7 @@ "['B-LOC', 'I-ORG', 'I-MISC', 'I-LOC', 'I-PER', 'B-MISC', 'B-ORG', 'O', 'B-PER']" ] }, - "execution_count": 17, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -658,34 +417,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "UvRBsP2SBpHi" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 7015, - "status": "ok", - "timestamp": 1640699337029, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "5f7ea4b3-d669-4dc8-ae66-5ba1c642512f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -727,21 +467,10 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of `DistilBertForTokenClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oWAvdlVA2937" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -764,13 +493,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - Longformer.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - Longformer.ipynb index 6b86977acab38b..5d90c82643f8ab 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - Longformer.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - Longformer.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "O1htkW4UQpwE" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20Longformer.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20Longformer.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import Longformer models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -25,18 +24,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - 
}, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.4.1` version and Transformers on `4.8.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." @@ -45,26 +40,20 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hHXgqiWpMfCY", - "outputId": "3702a838-c057-417c-b6d5-d79a7082f9d8" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[K |████████████████████████████████| 2.5 MB 7.9 MB/s \n", - "\u001B[K |████████████████████████████████| 394.3 MB 8.4 kB/s \n", - "\u001B[K |████████████████████████████████| 895 kB 44.8 MB/s \n", - "\u001B[K |████████████████████████████████| 3.3 MB 31.9 MB/s \n", - "\u001B[K |████████████████████████████████| 2.9 MB 34.3 MB/s \n", - "\u001B[K |████████████████████████████████| 462 kB 67.6 MB/s \n", - "\u001B[K |████████████████████████████████| 3.8 MB 34.1 MB/s \n", - "\u001B[?25h" + "\u001b[K |████████████████████████████████| 2.5 MB 7.9 MB/s \n", + "\u001b[K |████████████████████████████████| 394.3 MB 8.4 kB/s \n", + "\u001b[K |████████████████████████████████| 895 kB 44.8 MB/s \n", + "\u001b[K |████████████████████████████████| 3.3 MB 31.9 MB/s \n", + "\u001b[K |████████████████████████████████| 2.9 MB 34.3 MB/s \n", + "\u001b[K |████████████████████████████████| 462 kB 67.6 MB/s \n", + "\u001b[K |████████████████████████████████| 3.8 MB 34.1 MB/s \n", + "\u001b[?25h" ] } ], @@ -74,9 +63,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- 
HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. We will use that to save it as TF `SavedModel`.\n", "- We'll use [longformer-base-4096](https://huggingface.co/allenai/longformer-base-4096) model from HuggingFace as an example\n", @@ -86,9 +73,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "ZaiirlSKNhVD" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import LongformerTokenizer, TFLongformerModel\n", @@ -114,9 +99,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] @@ -124,9 +107,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "p2XCole7TTef" - }, + "metadata": {}, "outputs": [], "source": [ "!ls -l {MODEL_NAME}" @@ -135,13 +116,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "r0DOGz8VUR-r", - "outputId": "78416243-0f15-4cf7-a1b7-9d41d3565f95" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -161,13 +136,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Mcm2UpNxUUQN", - "outputId": "9406904b-95fd-4d46-f58a-1ac0d73b7d5f" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -187,9 +156,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `vocab.json` and `merges.txt` files from the tokenizer\n", @@ -199,9 +166,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "metadata": {}, "outputs": [], "source": [ "# let's save the vocab as txt file\n", @@ -216,18 +181,14 @@ }, { "cell_type": "markdown", - 
"metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save Longformer in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -236,9 +197,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "8tpW5nkMc53m" - }, + "metadata": {}, "outputs": [], "source": [ "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" @@ -246,9 +205,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] @@ -256,9 +213,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "xGXPlbLdBvbm" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -268,9 +223,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `LongformerEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `LongformerEmbeddings` in runtime, so don't worry what you are setting them now\n", @@ -285,9 +238,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "8W_almibVRTj" - }, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -307,9 +258,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] @@ -317,9 +266,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "metadata": {}, "outputs": [], "source": [ "longformer.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ 
-327,9 +274,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "4W2m4JuVDM3D" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] @@ -337,9 +282,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "CnUXH76ADSkL" - }, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -347,9 +290,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -359,13 +300,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ogpxSWxOXj3W", - "outputId": "54f27777-cf0c-4dba-f59f-8edd8442eefb" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -384,9 +319,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny Longformer model 😊 " ] @@ -394,9 +327,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "metadata": {}, "outputs": [], "source": [ "longformer_loaded = LongformerEmbeddings.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -407,14 +338,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "id": "pGRTNISyYlnO", - "outputId": "c10ed67f-202e-4be6-e583-c4d4009247f9" - }, + "metadata": {}, "outputs": [ { "data": { @@ -425,7 +349,7 @@ "'longformer_base_4096'" ] }, - "execution_count": 5, + "execution_count": null, "metadata": { "tags": [] }, @@ -438,9 +362,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of Longformer models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] diff --git a/examples/python/transformers/HuggingFace in Spark NLP - LongformerForQuestionAnswering.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - LongformerForQuestionAnswering.ipynb index 35205ba74bcfe3..3333cf2507d583 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - LongformerForQuestionAnswering.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - LongformerForQuestionAnswering.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20LongformerForQuestionAnswering.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20LongformerForQuestionAnswering.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import LongformerForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,18 +26,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. 
You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.7.1` version and Transformers on `4.19.2`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -48,24 +43,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 97075, - "status": "ok", - "timestamp": 1640696490534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3e56840b-f4e1-4391-ce82-3d8136e8990c" - }, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.19.2 tensorflow==2.7.1 sentencepiece" @@ -73,9 +51,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [valhalla/longformer-base-4096-finetuned-squadv1](https://huggingface.co/valhalla/longformer-base-4096-finetuned-squadv1) model from HuggingFace as an example\n", @@ -85,104 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 435, - "referenced_widgets": [ - "47dac9ef87fd4c5ca9a61d2cea256596", - "ce38947889204d1eb23c4a414d8e5208", - "2da64fb5519d420783cabae619f3b952", - "0784faf7b3784e2fb5856d8ca6248654", - "f2c8a9d039864796ad4495a3fc748b8a", - "4d41832a7c7f4ff6af11043759050846", - "97d4aab21aea4a30996a2399f7c58b1d", - "b0c3a334fc5c49f19a2911227190e18f", - "68e0a6c49a2d4fea8c81b8b1bfabfcd5", - "8fe11dbcbad6402ebb392316b90fbd4c", - "e6bfed8858df4404a958f9a0c5efdf61", - "b3cba7624d89414581b69a8804cdf5eb", - "6910684eaf584454b1b0b38da1851284", - "4771514aa5b44e5ea05f18aa6ef73008", - "1743adef69ba48b2a78e312121e1ff95", - "cf43d892dc5f45df80e87b77c378074e", - "19df597d10364f94b41991bfc4b0e039", - "1265068d2c4d4ff0b7ab480bd3fe2342", - "7ad895b923ad4fcfae33f38485d46690", - "f25af430b7c34f1b9cecb003aba253aa", - "a7d6155372a94ab185aa4d648603a677", - "1cca3cd83e4a48caa4ca67eb84e0d65c", - "85152c67f8424559a5b2334dce66b6c1", - "c03f7b608dbf416bb59626a47f4ec63e", - "a956903ad8194c4a9806f27ea0741773", - "5715e0c21cce4cee91a33e42beb48226", - "34ef44ce578847ca93e1e361ac6c6068", - "ffd12d9337cd4681afd51a74f77503f5", - "38e5d4d80eb1456e96fbaba2836e8030", - "5f4b9df77c6249c9874fb4cd7fc87962", - "d2ebd46bf924436cba4c7cdf8a666731", - "1fd718b370c8454bb4f63cd5d97e4649", - "beca0d66f4e94d8db677761102717623", - "7016f4970cbb46b99ee0b61f91529bc3", - "d04c456268b048ffbe3c00cccbf4390d", - "ebbbb05d599f451cb08a8dc6972a48bd", - "aa680bf2fba94b89819124d1764fd5fe", - "395fbcecbde042419bd7e0e99298b8a2", - "75812a9dedc343a9bacef9cb3ee1d8a0", - "69dc223e5de2449189995b7a116a0cc7", - "200aa3c11c1b4f2294935d5b91e844e3", - 
"f288ae4807364757b1f727e02c8d76b7", - "028bdbafc40e47c4bc7f1dda920630a7", - "c64ad3e7f7a9403f940367b8ffb4540e", - "cd1df8c0a9e64eab89d894ee0697f330", - "b601ce600b6b4b8a9d609487263f9d58", - "63d534091c114485a89af24ff0c3e574", - "c3c2541de6e34033b5298bd449c177ca", - "4bfda2c0b7fc4e96a7480c639ed2909b", - "983a3c073854484ca0c50ff238149ad7", - "10888dcf7383452e8e78475beed266de", - "edf6984a708b43b5ad25fb6b04f211a7", - "ac44ce9590df4690b1e1337eb5caf623", - "f3633266f7b84a8497936c2ef5b780fd", - "663cce4987904af48951a64093a47108", - "a3d2f9f8f9754f9b8134c52b7cfaca19", - "6637ecfad7594cac96e5bf703b6ab5da", - "0d3442a75c2b4a6082c9581ab0621592", - "86eadc1d973e4f6a9270fe934992d3f6", - "af52df20197b457882647e636171c83a", - "a6e2dfe0ca474d25b8f43506930a3798", - "a81ea939fe4d440cb6dcd2d87557579e", - "c0c856879cff4c29b8d45b0abfb94a22", - "0c8e5c545fa948b5bf26b7f3d2801dc1", - "118ef92501eb4c5f8c29323739516a1a", - "50ac811bc42b474d82eca728897dc596", - "b13f4e9eb777499ab6d5fc0ccaeac074", - "207abaeff8a94953a889804fc5e88b2d", - "6f13c00ef5f44adca80b0d5b9ce8c4d2", - "cae4eda19aed4598b3c97a3633c224d3", - "bf22edbb769d46abb23c352dc370f5ad", - "cf45db79df5241b1b579d765cd737953", - "0959fb1f18794a559ae6f1849a3eb5a9", - "620d95c4cdcd4f23ab17377da0485cf8", - "bdfbfe93e9cc4d878008d332f1c5860b", - "c2845632b7fb4b71b95b7eff29efb667", - "3b06e84b5b494bfd920ee661392967f5" - ] - }, - "executionInfo": { - "elapsed": 68690, - "status": "ok", - "timestamp": 1640696559216, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "2d42f5ad-db10-44de-b319-75a6309df876" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import TFLongformerForQuestionAnswering, LongformerTokenizer \n", @@ -202,9 +81,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, 
"source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] @@ -212,24 +89,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1640696559217, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "441fca3b-ab35-4d49-d567-4da91e1ad528" - }, + "metadata": {}, "outputs": [], "source": [ "!ls -l {MODEL_NAME}" @@ -238,24 +98,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 215, - "status": "ok", - "timestamp": 1640696559428, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "dad1fb58-d331-491f-a83d-ff002e88d079" - }, + "metadata": {}, "outputs": [], "source": [ "!ls -l {MODEL_NAME}/saved_model/1" @@ -264,24 +107,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 202, - "status": "ok", - "timestamp": 1640696559628, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "3b52acdf-5ecf-4582-9a6e-3ddc89bc487e" - }, + "metadata": {}, "outputs": [], "source": [ "!ls -l {MODEL_NAME}_tokenizer" @@ -289,9 +115,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + 
"metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `vocab.json` and `merges.txt` files from the tokenizer\n", @@ -301,9 +125,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -320,9 +142,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! We have our `vocab.txt` and `labels.txt` inside assets directory" ] @@ -330,24 +150,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 234, - "status": "ok", - "timestamp": 1640696560064, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "db11e138-f83f-4a0d-cab5-6c4dc1eaa4d4" - }, + "metadata": {}, "outputs": [], "source": [ "!ls -l {MODEL_NAME}/saved_model/1/assets" @@ -355,18 +158,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save LongformerForQuestionAnswering in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -375,24 +174,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 44473, - "status": "ok", - "timestamp": 1640696604534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": 
"https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "b956466b-03d6-4f56-88d4-28f920a6d113" - }, + "metadata": {}, "outputs": [], "source": [ "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" @@ -400,9 +182,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] @@ -410,9 +190,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -422,9 +200,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `LongformerForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `LongformerForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -436,9 +212,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "8W_almibVRTj" - }, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -456,9 +230,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] @@ -466,9 +238,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "metadata": {}, "outputs": [], "source": [ "spanClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -476,9 +246,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need 
anymore" ] @@ -486,9 +254,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -496,9 +262,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -508,24 +272,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 2392, - "status": "ok", - "timestamp": 1640696670840, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "995582ac-5e30-46ed-baef-1ad8a5387f30" - }, + "metadata": {}, "outputs": [], "source": [ "! ls -l {MODEL_NAME}_spark_nlp" @@ -533,9 +280,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny LongformerForQuestionAnswering model in Spark NLP 🚀 pipeline! 
" ] @@ -543,24 +288,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11346, - "status": "ok", - "timestamp": 1640696711994, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "b7ffe817-c5ad-41b3-85b6-ad04aef16e65" - }, + "metadata": {}, "outputs": [], "source": [ "document_assembler = MultiDocumentAssembler() \\\n", @@ -584,9 +312,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `LongformerForQuestionAnswering` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -607,8 +333,7 @@ "name": "python3" }, "language_info": { - "name": "python", - "version": "3.6.9" + "name": "python" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - RoBERTa.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - RoBERTa.ipynb index d6e49ecc097ed4..9e35167c7ad5bb 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - RoBERTa.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - RoBERTa.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "O1htkW4UQpwE" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBERTa.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBERTa.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import RoBERTa models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -25,9 +24,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -35,9 +32,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n" @@ -45,25 +40,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 79664, - "status": "ok", - "timestamp": 1622476245503, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "hHXgqiWpMfCY", - "outputId": "5290a6f5-417a-4a0e-e6d6-2aba664f030c" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0" @@ -71,9 +49,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [roberta-base](https://huggingface.co/roberta-base) model from HuggingFace as an example\n", @@ -82,68 +58,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 945, - "referenced_widgets": [ - "35e5ee38bc1c437b8df70d2eae183389", - "788e3ef4ec61409caf40cc45e23f2f1f", - "a7b6bc5963af4756ad490e86305f5e24", - "524d70accce44a1296b2bf8cd9044ebf", - "0ab09f97b5114624b03cda86be5814a8", - "943998b3cb204d8489ae0c71081e0390", - "e3866d1d10c74297abd8fe945bfa574e", - "0e1999f605a44ec089d3d27a5e6d1874", - "39c01f1f92ce4ecb89e7bafba714ecb3", - "97e54be966dc4aeebadf2d77161296a8", - "5bdca68c26394bc8857c14ca792c8e7d", - "be5a115938c04631a834020b4426fd11", - "e81d493c5510490da6655865f859d82f", - "bba854c507044daf8136a9b3368317fa", - "9240418ef66b42f4b0e4a4b78520bc9f", - "a646ecf1a46b4d92a03c417fa44bd517", - "a0b574ea85694a09a617848c2d98ed77", - "91dc248081da4e12babe1ac606cc80b1", - "cd4c3c270a0e42b4bae48a161099483d", - "220c79e165df4ea59ae5c61abab6493b", - "dd386f5a79b848458aab9b34179e9351", - "2c57da9a378042ebaa510f6d58eef27c", - "5ed79516254d41cf99ec61e552d52b36", - "44387ff2654445c1b304e9ad0ebdda2b", - "af56cc3ec8044118904cd7e5043e246d", - "73a77396295d481584ba3ff2a5746891", - "52d34ecaf34e4db6b879a076a8a0c918", - "39e24155d4de40cfb9ba3c3d678d9b3b", - "9d5d2c73362543ea8f75959a36dbef31", - "6ed0c9b9a15e4c9196e53deaa6527c26", - "d530a358eb484cbd90d36842abf80728", - "649ffc91bd8d4c85b3d6eb82f065b094", - "ef61a20839a84bb29f4877120eda6b95", - "1f6af1f0db45462da210f0153d092036", - "4c2a3270ee274517b12c173e548dc141", - "71e93f1ef31344998fd0e3382dd71956", - "739d8fb0cc4a4c80bc00dd3402ea2c43", - "4ba64be214f04a4cbb9817725389e99d", - "990b1223d2ca4d15b2b528039646a450", - "154919ace6a24a9c836627d45af4832d" - ] - }, - "executionInfo": { - "elapsed": 102609, - "status": "ok", - "timestamp": 1622476348109, - "user": { - "displayName": "Maziyar 
Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ZaiirlSKNhVD", - "outputId": "6012bcbe-3fc0-415b-f0e8-3ba4f6115ac2" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -216,34 +132,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 303, - "status": "ok", - "timestamp": 1622475893386, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "p2XCole7TTef", - "outputId": "b5eae84c-d956-406c-ac4a-f74c0313820c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -251,7 +148,7 @@ "text": [ "total 974328\n", "-rw-r--r-- 1 maziyar staff 638 Dec 15 17:27 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 17:27 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 17:27 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 498849472 Dec 15 17:27 tf_model.h5\n" ] } @@ -262,36 +159,19 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 9, - "status": "ok", - "timestamp": 1622475893387, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "r0DOGz8VUR-r", - "outputId": 
"49e70965-3ee3-4c91-d05c-603e211597d6" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 18032\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 17:27 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 17:27 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 56 Dec 15 17:27 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 165513 Dec 15 17:27 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 9057879 Dec 15 17:27 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 17:27 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 17:27 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -301,25 +181,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 8, - "status": "ok", - "timestamp": 1622475893388, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "b38036da-afd1-46ce-ef43-f1ec428289ee" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -339,9 +202,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `vocab.json` and `merges.txt` files from the tokenizer\n", @@ -350,10 +211,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "\n", @@ -373,18 +232,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save RoBERTa in Spark NLP\n" ] }, { "cell_type": "markdown", - 
"metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -392,10 +247,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "8tpW5nkMc53m" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -412,19 +265,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "xGXPlbLdBvbm" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -434,9 +283,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `RoBertaEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `RoBertaEmbeddings` in runtime, so don't worry what you are setting them now\n", @@ -449,10 +296,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -470,19 +315,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "roberta.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -490,19 +331,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "4W2m4JuVDM3D" - }, + "metadata": {}, "source": [ "Let's 
clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "CnUXH76ADSkL" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -510,9 +347,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -521,33 +356,16 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 980, - "status": "ok", - "timestamp": 1622477591833, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ogpxSWxOXj3W", - "outputId": "8d8fc13b-427e-44f1-bfe4-2705862f8730" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 991336\n", - "drwxr-xr-x 5 maziyar staff 160 Dec 15 17:27 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:27 \u001B[34mmetadata\u001B[m\u001B[m\n", + "drwxr-xr-x 5 maziyar staff 160 Dec 15 17:27 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:27 \u001b[34mmetadata\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 507563632 Dec 15 17:27 roberta_tensorflow\n" ] } @@ -558,19 +376,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBERTa model 😊 " ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "roberta_loaded = RoBertaEmbeddings.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -581,26 
+395,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "executionInfo": { - "elapsed": 23, - "status": "ok", - "timestamp": 1622477610651, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "pGRTNISyYlnO", - "outputId": "fc4d45f1-d870-408a-e16e-bbf6710bf33d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -608,7 +404,7 @@ "'roberta_base'" ] }, - "execution_count": 14, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -619,19 +415,10 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of RoBERTa models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -664,17 +451,11 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" + "pygments_lexer": "ipython3" }, "nteract": { "version": "0.28.0" }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } - }, "widgets": { "application/vnd.jupyter.widget-state+json": { "0ab09f97b5114624b03cda86be5814a8": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - RoBertaForQuestionAnswering.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - RoBertaForQuestionAnswering.ipynb index fbe3862e45479b..49505273687cb8 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - RoBertaForQuestionAnswering.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - RoBertaForQuestionAnswering.ipynb @@ -1,19 +1,19 @@ 
{ "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBertaForQuestionAnswering.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBertaForQuestionAnswering.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import RobertaForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +27,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +35,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." 
@@ -47,25 +43,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 97075, - "status": "ok", - "timestamp": 1640696490534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3e56840b-f4e1-4391-ce82-3d8136e8990c" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0" @@ -73,9 +52,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. We will use that to save it as TF `SavedModel`.\n", "- We'll use [deepset/roberta-base-squad2](https://huggingface.co/deepset/roberta-base-squad2) model from HuggingFace as an example\n", @@ -84,105 +61,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 435, - "referenced_widgets": [ - "47dac9ef87fd4c5ca9a61d2cea256596", - "ce38947889204d1eb23c4a414d8e5208", - "2da64fb5519d420783cabae619f3b952", - "0784faf7b3784e2fb5856d8ca6248654", - "f2c8a9d039864796ad4495a3fc748b8a", - "4d41832a7c7f4ff6af11043759050846", - "97d4aab21aea4a30996a2399f7c58b1d", - "b0c3a334fc5c49f19a2911227190e18f", - "68e0a6c49a2d4fea8c81b8b1bfabfcd5", - "8fe11dbcbad6402ebb392316b90fbd4c", - "e6bfed8858df4404a958f9a0c5efdf61", - "b3cba7624d89414581b69a8804cdf5eb", - "6910684eaf584454b1b0b38da1851284", - "4771514aa5b44e5ea05f18aa6ef73008", - "1743adef69ba48b2a78e312121e1ff95", - "cf43d892dc5f45df80e87b77c378074e", - "19df597d10364f94b41991bfc4b0e039", - "1265068d2c4d4ff0b7ab480bd3fe2342", - "7ad895b923ad4fcfae33f38485d46690", - 
"f25af430b7c34f1b9cecb003aba253aa", - "a7d6155372a94ab185aa4d648603a677", - "1cca3cd83e4a48caa4ca67eb84e0d65c", - "85152c67f8424559a5b2334dce66b6c1", - "c03f7b608dbf416bb59626a47f4ec63e", - "a956903ad8194c4a9806f27ea0741773", - "5715e0c21cce4cee91a33e42beb48226", - "34ef44ce578847ca93e1e361ac6c6068", - "ffd12d9337cd4681afd51a74f77503f5", - "38e5d4d80eb1456e96fbaba2836e8030", - "5f4b9df77c6249c9874fb4cd7fc87962", - "d2ebd46bf924436cba4c7cdf8a666731", - "1fd718b370c8454bb4f63cd5d97e4649", - "beca0d66f4e94d8db677761102717623", - "7016f4970cbb46b99ee0b61f91529bc3", - "d04c456268b048ffbe3c00cccbf4390d", - "ebbbb05d599f451cb08a8dc6972a48bd", - "aa680bf2fba94b89819124d1764fd5fe", - "395fbcecbde042419bd7e0e99298b8a2", - "75812a9dedc343a9bacef9cb3ee1d8a0", - "69dc223e5de2449189995b7a116a0cc7", - "200aa3c11c1b4f2294935d5b91e844e3", - "f288ae4807364757b1f727e02c8d76b7", - "028bdbafc40e47c4bc7f1dda920630a7", - "c64ad3e7f7a9403f940367b8ffb4540e", - "cd1df8c0a9e64eab89d894ee0697f330", - "b601ce600b6b4b8a9d609487263f9d58", - "63d534091c114485a89af24ff0c3e574", - "c3c2541de6e34033b5298bd449c177ca", - "4bfda2c0b7fc4e96a7480c639ed2909b", - "983a3c073854484ca0c50ff238149ad7", - "10888dcf7383452e8e78475beed266de", - "edf6984a708b43b5ad25fb6b04f211a7", - "ac44ce9590df4690b1e1337eb5caf623", - "f3633266f7b84a8497936c2ef5b780fd", - "663cce4987904af48951a64093a47108", - "a3d2f9f8f9754f9b8134c52b7cfaca19", - "6637ecfad7594cac96e5bf703b6ab5da", - "0d3442a75c2b4a6082c9581ab0621592", - "86eadc1d973e4f6a9270fe934992d3f6", - "af52df20197b457882647e636171c83a", - "a6e2dfe0ca474d25b8f43506930a3798", - "a81ea939fe4d440cb6dcd2d87557579e", - "c0c856879cff4c29b8d45b0abfb94a22", - "0c8e5c545fa948b5bf26b7f3d2801dc1", - "118ef92501eb4c5f8c29323739516a1a", - "50ac811bc42b474d82eca728897dc596", - "b13f4e9eb777499ab6d5fc0ccaeac074", - "207abaeff8a94953a889804fc5e88b2d", - "6f13c00ef5f44adca80b0d5b9ce8c4d2", - "cae4eda19aed4598b3c97a3633c224d3", - "bf22edbb769d46abb23c352dc370f5ad", - 
"cf45db79df5241b1b579d765cd737953", - "0959fb1f18794a559ae6f1849a3eb5a9", - "620d95c4cdcd4f23ab17377da0485cf8", - "bdfbfe93e9cc4d878008d332f1c5860b", - "c2845632b7fb4b71b95b7eff29efb667", - "3b06e84b5b494bfd920ee661392967f5" - ] - }, - "executionInfo": { - "elapsed": 68690, - "status": "ok", - "timestamp": 1640696559216, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "2d42f5ad-db10-44de-b319-75a6309df876" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -325,34 +205,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1640696559217, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "441fca3b-ab35-4d49-d567-4da91e1ad528" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -360,7 +221,7 @@ "text": [ "total 969768\n", "-rw-r--r-- 1 maziyar staff 749 Dec 15 17:36 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 17:36 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 17:36 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 496513256 Dec 15 17:36 tf_model.h5\n" ] } @@ -371,36 +232,19 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - 
"executionInfo": { - "elapsed": 215, - "status": "ok", - "timestamp": 1640696559428, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "dad1fb58-d331-491f-a83d-ff002e88d079" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 18664\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 17:36 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 17:36 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 54 Dec 15 17:36 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 165443 Dec 15 17:36 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 9380692 Dec 15 17:36 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 17:36 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 17:36 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -410,25 +254,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 202, - "status": "ok", - "timestamp": 1640696559628, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "3b52acdf-5ecf-4582-9a6e-3ddc89bc487e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -448,9 +275,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `vocab.json` and `merges.txt` files from the tokenizer\n", @@ -459,10 +284,8 @@ }, { "cell_type": "code", - 
"execution_count": 6, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -479,9 +302,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! We have our `vocab.txt` and `labels.txt` inside assets directory" ] @@ -489,24 +310,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 234, - "status": "ok", - "timestamp": 1640696560064, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "db11e138-f83f-4a0d-cab5-6c4dc1eaa4d4" - }, + "metadata": {}, "outputs": [], "source": [ "!ls -l {MODEL_NAME}/saved_model/1/assets" @@ -514,18 +318,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save RoBertaForQuestionAnswering in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -533,25 +333,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 44473, - "status": "ok", - "timestamp": 1640696604534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "b956466b-03d6-4f56-88d4-28f920a6d113" - }, + "execution_count": null, + "metadata": {}, 
"outputs": [ { "name": "stdout", @@ -568,19 +351,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -590,9 +369,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `RoBertaForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `RoBertaForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -603,10 +380,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -624,19 +399,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "spanClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -644,19 +415,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -664,9 +431,7 @@ }, { 
"cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -675,33 +440,16 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 2392, - "status": "ok", - "timestamp": 1640696670840, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "995582ac-5e30-46ed-baef-1ad8a5387f30" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 987368\n", - "drwxr-xr-x 5 maziyar staff 160 Dec 15 17:37 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:37 \u001B[34mmetadata\u001B[m\u001B[m\n", + "drwxr-xr-x 5 maziyar staff 160 Dec 15 17:37 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:37 \u001b[34mmetadata\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 505530185 Dec 15 17:37 roberta_classification_tensorflow\n" ] } @@ -712,9 +460,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForQuestionAnswering model in Spark NLP 🚀 pipeline! 
" ] @@ -722,24 +468,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11346, - "status": "ok", - "timestamp": 1640696711994, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "b7ffe817-c5ad-41b3-85b6-ad04aef16e65" - }, + "metadata": {}, "outputs": [], "source": [ "document_assembler = MultiDocumentAssembler() \\\n", @@ -763,9 +492,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `RoBertaForQuestionAnswering` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -778,7 +505,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "transformers", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -791,13 +518,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - RoBertaForSequenceClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - RoBertaForSequenceClassification.ipynb index 8ba0144f206a58..1c26c7cdfd6d20 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - RoBertaForSequenceClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - RoBertaForSequenceClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + 
"metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBertaForSequenceClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBertaForSequenceClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import RobertaForSequenceClassification models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. 
This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n" @@ -47,14 +42,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hHXgqiWpMfCY", - "outputId": "e7c8bc81-7048-4add-fb02-914c000bf093" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0" @@ -62,9 +51,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. We will use that to save it as TF `SavedModel`.\n", "- We'll use [arpanghoshal/EmoRoBERTa](https://huggingface.co/arpanghoshal/EmoRoBERTa) model from HuggingFace as an example\n", @@ -74,99 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 975, - "referenced_widgets": [ - "eb8e9fc4b6d04524904af6d70458053c", - "5e28015813ae4a05bbb84939b3229f55", - "e6e059fa86574882969daae24df80791", - "ec74ede43b334a0fac4760684c75f40c", - "6e265c630a80497d985d651381f6828c", - "003d7a6697d843ffa3d375ff58d1e5d2", - "327c953cd5924dc4a961e05a230e705f", - "929f22afa10940b2bab4a80a174b9eb5", - "25cb1113b5924cffa700e55ef6a092ed", - "d51be2931b1248229cc20b8b5c9149e9", - "33293067de9841f0bb73e7e932ac365a", - "c13ca62518a9437490a9b6684391d3b3", - "4978d8ab6e454406ac3960b1fdb95d34", - "690e753835424e478972b13d08667cce", - "b7f9c0b60f6a45e98f93c0e09a97eaef", - "dc229053d9bd4f96968701d383a108de", - "9b5421eaa55f4a448013652b5e6b0d1e", - "59df8429afef46ba82e456918e547c74", - "a7361ec2d9b14708b8b607d48b44ce26", - "b1679b690a014ca6a7e2b44510c3bd65", - "3c78b46f5b894ab0a2bd468aa4e03597", - "75ffcc5495304d60b314cefb6c54cf22", - "dd51c5d2070b41849ab1da5e257c4669", - 
"c9280d2908d6445093f962563b3c89f5", - "2c6945c7e0504ed1916b97c8e181edc0", - "968d2e5fcc824f0ab0ba6b9ddf34e80c", - "218d76e70c70428eb6a10f579d73d609", - "c76844070c094679946f39136258a7ec", - "6f3879af8219429d9cc179531cfec3de", - "1c309ced86184810bc60f62391fc1bc8", - "a25d4c9b9c7a4d378c278bc70586e98a", - "7a8d103587cf44e2a4b260c9e23fec47", - "e95ff41aa97548f2876b173dfc99a8a3", - "4af1f3efb4494d85a2cfb2ae2557dafa", - "d5e5bf8b9b14491e9712d07d40c07073", - "ec0183b454ff4a719f7c4c4bd6ac50c0", - "5db84c9e9deb4b80b456048301c6493c", - "72fa42b31b41487cba48f72056a2e717", - "8a811791bb894999879e36a6bd392455", - "bb066aaf31fd454189cd59c12e4adebb", - "0a2d90c348974f24ace07d4d160a0b1e", - "b4b82d703ad34d0dafe1854504f94e49", - "0225757978554800be6d5175a9808482", - "00a02acc03b44effadb7e3db08b4c33d", - "5ef489a5f0674fcc94791fb2b228e1a6", - "e91bd8bc1aab41caa2f6d6ebb37af6b0", - "78445e2416d040d49f06421889e28f41", - "089c75e6b1e44f4c827a3f1cbb795f78", - "5c077972fab4493a862d04e84ca2df43", - "997e96ce6ed3464d8a91556913304162", - "f9424eb146c64a868f2bdcb05dfa4420", - "e011e1c2116b4fd0bffcfc384eedd7d2", - "247c23993f3443b9aa41d9b34f516136", - "d2e4afd9b8404efdb81a8c7778ce670a", - "afdd174e12b545eca208f7381beb99f2", - "b89bac4d83ab4d4e804d0e5d35248b89", - "941e7a1a133d48218ac1165dbb44adf4", - "09d5fef8f6b448e398821b0aa7cb725d", - "12449e3c621344d2bbb3265008a1bae8", - "4bcf23778f844f4b9068b86083170aef", - "a54ed3f63fea48868c8ad8a2fe4d92f0", - "54f98d6058b441c2b8d04fc2e22b95a8", - "30476b430b32459f8ac219fc502b88dd", - "03ed9f2068d04dc1bfeb0587ad5f5f21", - "f654280648bb42e69f71589c85872531", - "3997131771a846bc9a5deb645f25feb3", - "dc0ac23dc1854125ac544348ff4a2f5b", - "002cc50fea354dcda0bb26c4cd5f70f6", - "6b3826dbcd9444f9b30e4f355af0f7bc", - "64c37cdfefd847a1bb736ef66f3890a4", - "99d755e67df04d15ad57b07c4cbb1853", - "a82a3dc729c0450ab132cbef9c1538f8", - "fe20df3c69654eed888f5cf92a53ebe4", - "f69d1d7a20964909878c3d26c9cf5187", - "79a378c3fb5e42a19595ebffd305d82c", - 
"590f079668ae4b3babb84cc0e37e79cb", - "c97ef12f48ac470aab25553f7e6044ed", - "dc74233639f84ed7b954172fb1a26bfe", - "0e445da3c4cf4cf2bb3a9c68ba09b015", - "55510ece750e40c7ac5e2b276cc967c8", - "02f83766babd4c778672980fa737779d", - "956d47e500e647deadc299e675356c0c", - "9a6515e320b34d00be2a059aa81c7a28" - ] - }, - "id": "ZaiirlSKNhVD", - "outputId": "79d06be5-935b-49c2-cb09-4893b8ee8684" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import TFRobertaForSequenceClassification, RobertaTokenizer \n", @@ -203,23 +98,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "p2XCole7TTef", - "outputId": "cf9ab3cf-172e-436b-d1f2-b2bdb2be5787" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -227,7 +114,7 @@ "text": [ "total 974536\n", "-rw-r--r-- 1 maziyar staff 1894 Dec 15 17:47 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 17:47 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 17:47 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 498958144 Dec 15 17:47 tf_model.h5\n" ] } @@ -238,25 +125,19 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "r0DOGz8VUR-r", - "outputId": "2b1bed54-f3fb-4839-ddce-6331306f9d6c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 18864\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 17:47 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 17:47 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 53 Dec 15 17:47 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 169987 Dec 15 
17:47 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 9481842 Dec 15 17:47 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 17:47 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 17:47 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -266,14 +147,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Mcm2UpNxUUQN", - "outputId": "c5395b47-fd43-4428-eca6-dbad494aece3" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -293,9 +168,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `vocab.json` and `merges.txt` files from the tokenizer\n", @@ -305,10 +178,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -325,10 +196,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -342,23 +211,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! 
We have our `vocab.txt` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OYnT5U8N9dxT", - "outputId": "01b61a25-d261-4731-bc2a-6a935c3e2b7d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -377,18 +238,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save RobertaForSequenceClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -396,10 +253,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "8tpW5nkMc53m" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -416,19 +271,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -438,9 +289,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `RoBertaForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `RoBertaForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -450,10 +299,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from 
sparknlp.annotator import *\n", @@ -468,19 +315,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -488,19 +331,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -508,9 +347,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -519,22 +356,16 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ogpxSWxOXj3W", - "outputId": "c2ac85fa-b376-4db9-b6f0-bebebff3da1b" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 992336\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:48 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:48 \u001B[34mmetadata\u001B[m\u001B[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:48 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:48 \u001b[34mmetadata\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 508072624 Dec 15 17:48 roberta_classification_tensorflow\n" ] } @@ -545,19 +376,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use 
it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForSequenceClassification model 😊 " ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier_loaded = RoBertaForSequenceClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -567,23 +394,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Z8tkO433o795" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pGRTNISyYlnO", - "outputId": "95f9a363-486d-4069-ca55-1a13bf48c91c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -618,7 +437,7 @@ " 'relief']" ] }, - "execution_count": 17, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -629,19 +448,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "I3lOuNRYo795" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "id": "9mYJ8a3evyXp" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline, PipelineModel" @@ -649,11 +464,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "id": "rCtagQxio795", - "outputId": "89c0b63d-8769-42da-f917-0ee82ae51873" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -667,14 +479,6 @@ "+--------------------+------+\n", "\n" ] - }, - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001B[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. 
Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." - ] } ], "source": [ @@ -703,9 +507,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `RoBertaForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -731,13 +533,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - RoBertaForTokenClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - RoBertaForTokenClassification.ipynb index 2e281c153d6031..e09e0cfabe1399 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - RoBertaForTokenClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - RoBertaForTokenClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBertaForTokenClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBertaForTokenClassification.ipynb)" ] }, { 
"cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import RoBertaForTokenClassification models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n" @@ -47,25 +42,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 91416, - "status": "ok", - "timestamp": 1640700825967, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "a068c234-7130-456c-eb15-aada216a7f44" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0" @@ -73,9 +51,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [philschmid/distilroberta-base-ner-wikiann-conll2003-3-class](https://huggingface.co/philschmid/distilroberta-base-ner-wikiann-conll2003-3-class) model from HuggingFace as an example\n", @@ -85,104 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 469, - "referenced_widgets": [ - "198e89eb3e04460e868976b0c1aeb9ba", - "19a1221923a54cb794f0ec8013d16535", - "e631cd1803374c73867a12327262cee9", - "f2a26901cb0b423aa317ef167ad92b42", - "8ac5a0b11a4e4d5d87fd7b5ef8a585e3", - "970e9d9c30ea4c839b2355ef9ff482e3", - "1e88556cf8fb412b85a69fea503f22e8", - "b349cb1de54f46358bdce764d56e2601", - "a7849aac4e95490f9b2f2a6e51006319", - "f4ae6213c5d84e0c878a13a98edb172a", - "5c5e056f14514e3b8106663ef4733299", - "9500f3fc86b84899a74f7581bbdb6ab5", - "02b8acf235974fa6b48309394e4bb4b6", - "8ae4c74038a04973a6d31ac1f3a3c4ed", - "3d954da06f6f4bfdb146b1f43b8476e4", - "58a40de38e9f483ebefeb2f3be93ae57", - "187d3ffcbc734c2e83a81bd154e9fa7b", - "3129798a94be4d92925f47a54e3bb59c", - "a6e9e4621dba44feab499f62abb906ec", - "f0b2d02e4c6a41db9f49cc68772324da", - "21a12dd333e944849dedde6439a78a2d", - "efc894c71cc34b4b8a52aca1b5aaa4da", - "2b5f151603f8481c81a3175f3771c28f", - "ab4c1050a90f4c7e91d2d0d9f6794113", - "4d936549b025465ba92cff643f0f7c78", - "18f5d5b3a31945e6a16c3145b2cd0d6a", - "3b77d93b7cd04b7ab119c566e363a471", - "f7fec485f4bb44f5b6e142fd81e1f2c4", - "68bea901e4f34ce2ad61964cb416e7b4", - "cd6868ce7f7d4dc389ab0b26fbd2fa5d", - "e1e3dc6f07f347b6917c325fd8e4d78a", - "30e63624f1e141cd83ba69a818335b41", - "acc184b324354f23a0a5647afefae706", - "cc01893a0cf0465f97f266c9e4334a3e", - "d8c81f0e489048bd84508291e95ef604", - "15ae4247dfcc4727bbe68786d8046c34", - "ef0f7669810e43fe85de245b88ceed27", - "6903edad9f3c42f9bcf12fafc254f49f", - "1b799d22333e4d0e9fda583dc473ff48", - "ad4e2cbda9a64bf7a4f69ce27f816a30", - "42739073a7bd4fce8d0ef82607dd15b8", - 
"94614bebf689491c9894a016f7feaf5b", - "dcb0d55d0a3a462082efd7adc17b690c", - "b516f3fcfc8f4e00aca132f0dd66593f", - "44fdcb7c0019408e8666a5a8152df218", - "65f51c8958dc4a04bbcb5bbd2f3a432e", - "36d807ce700140b0ac17686c1627b9a3", - "505ded741f9041308508dfce64aefb95", - "a05bf28cb87b4373b83ab9979b7f2163", - "d089dc7e7ef7481cb89e262d61cd0534", - "b2b1f3ef592e4945939c9c4b55c022bb", - "075fec6260444e11babae4f672cb073e", - "8aa2d67b89704f85904d2811a0acbda1", - "4448fa1da1ed43908a6057aeb4e958df", - "ffeac475a50540eebb92bd3b408926c3", - "3123500720f14038be919536539da8c6", - "386de36fb344423e82fe676eef1443b4", - "e82207dbfe9a4277b643f609ad4337eb", - "4404442dd57c4e1d9ab277ef9f53fe51", - "3607a65ef2104e949eebd5dfde6cea14", - "13f403911c4049a5972d71cacd43123c", - "d3eb961d14d74719b8176ed412f7f71f", - "e915e93701844939b9abcce09da15bab", - "e96f93cff59b44638d4aca57da5ccc9a", - "440fb367fd66407782bdbcc631728fe4", - "9f570940b09b48229ea5c89dafbaff6f", - "40b0dbee9005432cb1cfa35e32e9a6f5", - "34f0787eb59742358f5c642e10efc8f6", - "14dec41e88714f08ba42c93e2d4374fd", - "b916d51751904b5488d14c02c74c1013", - "d2b95c5419c54e6cb395b757e9c0b67e", - "f031eaeec5b04617aa784e9df60fc56b", - "3ed5df6d2fae4628a1dff804b650efd6", - "c5c2fc8715ca4cb3a698f5f59e5b7cb9", - "0c4b0291e18c4c208804546b18f1f1f4", - "b24f49154dc648a3a765fa7024f94a1c", - "9b9e18f26fd14af6b1a8ef8eda7046c1" - ] - }, - "executionInfo": { - "elapsed": 47373, - "status": "ok", - "timestamp": 1640700873332, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "83a2e2eb-ad9c-4363-aa8c-a96e0b93a300" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import TFRobertaForTokenClassification, RobertaTokenizer \n", @@ -219,34 +98,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, 
"source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 20, - "status": "ok", - "timestamp": 1640700873332, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "e48981a0-3664-4e51-9656-522f72355f22" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -254,7 +114,7 @@ "text": [ "total 637280\n", "-rw-r--r-- 1 maziyar staff 1034 Dec 15 17:51 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 17:51 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 17:51 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 326280552 Dec 15 17:51 tf_model.h5\n" ] } @@ -265,36 +125,19 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 323, - "status": "ok", - "timestamp": 1640700873652, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "28abfaa0-ea5d-4d0a-eb36-d3aca80e3c2e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 9672\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 17:51 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 17:51 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 52 Dec 15 17:51 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 89042 Dec 15 17:51 keras_metadata.pb\n", 
"-rw-r--r-- 1 maziyar staff 4854186 Dec 15 17:51 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 17:51 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 17:51 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -304,25 +147,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 212, - "status": "ok", - "timestamp": 1640700873860, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "9f340ef5-9ba0-4ea5-88b9-906edcc9d8f0" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -342,9 +168,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `vocab.json` and `merges.txt` files from the tokenizer\n", @@ -354,10 +178,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -374,10 +196,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -391,34 +211,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! 
We have our `vocab.txt` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 232, - "status": "ok", - "timestamp": 1640700874309, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "7278d43b-775a-4405-bdea-d091f52026b5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -437,18 +238,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save RobertaForTokenClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -456,25 +253,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 42989, - "status": "ok", - "timestamp": 1640700917295, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "5c20ad81-62ca-4b74-9182-be8e1779939c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -491,19 +271,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + 
"metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -513,9 +289,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `RoBertaForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `RoBertaForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -526,10 +300,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -545,19 +317,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -565,19 +333,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -585,9 +349,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -596,33 +358,16 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 6, - "status": "ok", - "timestamp": 1640700977744, - "user": { 
- "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "ae9b2aac-4d4d-405d-9a4b-8a3a153c4d12" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 646384\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:51 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:51 \u001B[34mmetadata\u001B[m\u001B[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:51 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 17:51 \u001b[34mmetadata\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 330946755 Dec 15 17:51 roberta_classification_tensorflow\n" ] } @@ -633,19 +378,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForTokenClassification model 😊 " ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier_loaded = RoBertaForTokenClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -655,19 +396,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "BDWNWdBlBpHi" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "pGRTNISyYlnO" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -675,7 +412,7 @@ "['B-LOC', 'I-ORG', 'I-LOC', 'I-PER', 'B-ORG', 'O', 'B-PER']" ] }, - "execution_count": 17, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -687,34 +424,15 @@ }, { 
"cell_type": "markdown", - "metadata": { - "id": "UvRBsP2SBpHi" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 8020, - "status": "ok", - "timestamp": 1640701136272, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "06d9b22c-540b-4c67-b591-e89cd8b60ac3" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -756,9 +474,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `RoBertaForTokenClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -784,13 +500,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - ViTForImageClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - ViTForImageClassification.ipynb index 5fcebe4b5c8fdc..e9bd0726f676c4 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - ViTForImageClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - ViTForImageClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20ViTForImageClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20ViTForImageClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import ViTForImageClassification models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -32,18 +31,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.9.2` version and Transformers on `4.21.3`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." 
@@ -51,25 +46,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 97075, - "status": "ok", - "timestamp": 1640696490534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3e56840b-f4e1-4391-ce82-3d8136e8990c" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "! pip install -q transformers==4.21.3 tensorflow==2.9.2" @@ -77,9 +55,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. We will use that to save it as TF `SavedModel`.\n", "- We'll use [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) model from HuggingFace as an example\n", @@ -88,105 +64,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 435, - "referenced_widgets": [ - "47dac9ef87fd4c5ca9a61d2cea256596", - "ce38947889204d1eb23c4a414d8e5208", - "2da64fb5519d420783cabae619f3b952", - "0784faf7b3784e2fb5856d8ca6248654", - "f2c8a9d039864796ad4495a3fc748b8a", - "4d41832a7c7f4ff6af11043759050846", - "97d4aab21aea4a30996a2399f7c58b1d", - "b0c3a334fc5c49f19a2911227190e18f", - "68e0a6c49a2d4fea8c81b8b1bfabfcd5", - "8fe11dbcbad6402ebb392316b90fbd4c", - "e6bfed8858df4404a958f9a0c5efdf61", - "b3cba7624d89414581b69a8804cdf5eb", - "6910684eaf584454b1b0b38da1851284", - "4771514aa5b44e5ea05f18aa6ef73008", - "1743adef69ba48b2a78e312121e1ff95", - "cf43d892dc5f45df80e87b77c378074e", - "19df597d10364f94b41991bfc4b0e039", - "1265068d2c4d4ff0b7ab480bd3fe2342", - "7ad895b923ad4fcfae33f38485d46690", - 
"f25af430b7c34f1b9cecb003aba253aa", - "a7d6155372a94ab185aa4d648603a677", - "1cca3cd83e4a48caa4ca67eb84e0d65c", - "85152c67f8424559a5b2334dce66b6c1", - "c03f7b608dbf416bb59626a47f4ec63e", - "a956903ad8194c4a9806f27ea0741773", - "5715e0c21cce4cee91a33e42beb48226", - "34ef44ce578847ca93e1e361ac6c6068", - "ffd12d9337cd4681afd51a74f77503f5", - "38e5d4d80eb1456e96fbaba2836e8030", - "5f4b9df77c6249c9874fb4cd7fc87962", - "d2ebd46bf924436cba4c7cdf8a666731", - "1fd718b370c8454bb4f63cd5d97e4649", - "beca0d66f4e94d8db677761102717623", - "7016f4970cbb46b99ee0b61f91529bc3", - "d04c456268b048ffbe3c00cccbf4390d", - "ebbbb05d599f451cb08a8dc6972a48bd", - "aa680bf2fba94b89819124d1764fd5fe", - "395fbcecbde042419bd7e0e99298b8a2", - "75812a9dedc343a9bacef9cb3ee1d8a0", - "69dc223e5de2449189995b7a116a0cc7", - "200aa3c11c1b4f2294935d5b91e844e3", - "f288ae4807364757b1f727e02c8d76b7", - "028bdbafc40e47c4bc7f1dda920630a7", - "c64ad3e7f7a9403f940367b8ffb4540e", - "cd1df8c0a9e64eab89d894ee0697f330", - "b601ce600b6b4b8a9d609487263f9d58", - "63d534091c114485a89af24ff0c3e574", - "c3c2541de6e34033b5298bd449c177ca", - "4bfda2c0b7fc4e96a7480c639ed2909b", - "983a3c073854484ca0c50ff238149ad7", - "10888dcf7383452e8e78475beed266de", - "edf6984a708b43b5ad25fb6b04f211a7", - "ac44ce9590df4690b1e1337eb5caf623", - "f3633266f7b84a8497936c2ef5b780fd", - "663cce4987904af48951a64093a47108", - "a3d2f9f8f9754f9b8134c52b7cfaca19", - "6637ecfad7594cac96e5bf703b6ab5da", - "0d3442a75c2b4a6082c9581ab0621592", - "86eadc1d973e4f6a9270fe934992d3f6", - "af52df20197b457882647e636171c83a", - "a6e2dfe0ca474d25b8f43506930a3798", - "a81ea939fe4d440cb6dcd2d87557579e", - "c0c856879cff4c29b8d45b0abfb94a22", - "0c8e5c545fa948b5bf26b7f3d2801dc1", - "118ef92501eb4c5f8c29323739516a1a", - "50ac811bc42b474d82eca728897dc596", - "b13f4e9eb777499ab6d5fc0ccaeac074", - "207abaeff8a94953a889804fc5e88b2d", - "6f13c00ef5f44adca80b0d5b9ce8c4d2", - "cae4eda19aed4598b3c97a3633c224d3", - "bf22edbb769d46abb23c352dc370f5ad", - 
"cf45db79df5241b1b579d765cd737953", - "0959fb1f18794a559ae6f1849a3eb5a9", - "620d95c4cdcd4f23ab17377da0485cf8", - "bdfbfe93e9cc4d878008d332f1c5860b", - "c2845632b7fb4b71b95b7eff29efb667", - "3b06e84b5b494bfd920ee661392967f5" - ] - }, - "executionInfo": { - "elapsed": 68690, - "status": "ok", - "timestamp": 1640696559216, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "2d42f5ad-db10-44de-b319-75a6309df876" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -285,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -294,7 +173,7 @@ "['google/vit-base-patch16-224/saved_model/1/assets/preprocessor_config.json']" ] }, - "execution_count": 8, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -321,34 +200,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1640696559217, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "441fca3b-ab35-4d49-d567-4da91e1ad528" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -356,7 +216,7 @@ "text": [ "total 676976\n", "-rw-r--r-- 1 maziyar staff 69684 Sep 7 09:51 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Sep 7 09:51 
\u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Sep 7 09:51 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 346537560 Sep 7 09:51 tf_model.h5\n" ] } @@ -367,35 +227,18 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 215, - "status": "ok", - "timestamp": 1640696559428, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "dad1fb58-d331-491f-a83d-ff002e88d079" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 13200\n", - "drwxr-xr-x 4 maziyar staff 128 Sep 7 09:53 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 4 maziyar staff 128 Sep 7 09:53 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 329701 Sep 7 09:51 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 6426590 Sep 7 09:51 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Sep 7 09:51 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Sep 7 09:51 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -405,25 +248,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 202, - "status": "ok", - "timestamp": 1640696559628, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "3b52acdf-5ecf-4582-9a6e-3ddc89bc487e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -441,9 +267,7 @@ }, { "cell_type": "markdown", - 
"metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `lables.json` and `preprocessor_config.json` in our `assets`" @@ -451,18 +275,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save ViTForImageClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -470,25 +290,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 44473, - "status": "ok", - "timestamp": 1640696604534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "b956466b-03d6-4f56-88d4-28f920a6d113" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -525,19 +328,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -547,9 +346,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `ViTForImageClassification` which allows us to load TensorFlow model in SavedModel format\n", "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. 
The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", @@ -559,10 +356,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -578,19 +373,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "imageClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -598,19 +389,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -618,9 +405,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -629,34 +414,17 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 2392, - "status": "ok", - "timestamp": 1640696670840, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "995582ac-5e30-46ed-baef-1ad8a5387f30" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 
688696\n", - "drwxr-xr-x 4 maziyar staff 128 Sep 7 09:58 \u001B[34mfields\u001B[m\u001B[m\n", + "drwxr-xr-x 4 maziyar staff 128 Sep 7 09:58 \u001b[34mfields\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 352611671 Sep 7 09:59 image_classification_tensorflow\n", - "drwxr-xr-x 6 maziyar staff 192 Sep 7 09:58 \u001B[34mmetadata\u001B[m\u001B[m\n" + "drwxr-xr-x 6 maziyar staff 192 Sep 7 09:58 \u001b[34mmetadata\u001b[m\u001b[m\n" ] } ], @@ -666,16 +434,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny ViTForImageClassification model in Spark NLP 🚀 pipeline! " ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -702,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -723,25 +489,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11346, - "status": "ok", - "timestamp": 1640696711994, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "b7ffe817-c5ad-41b3-85b6-ad04aef16e65" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -782,17 +531,10 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of `ViTForImageClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] } ], "metadata": { @@ -815,13 +557,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - XLM-RoBERTa.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - XLM-RoBERTa.ipynb index 1d8c558579676e..918149f4ac05dd 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - XLM-RoBERTa.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - XLM-RoBERTa.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "lshuevA3Qv-N" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XLM-RoBERTa.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XLM-RoBERTa.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import XLM-RoBERTa models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -25,9 +24,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace 
model" ] @@ -35,9 +32,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -46,14 +41,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hHXgqiWpMfCY", - "outputId": "abac85a3-c938-45b4-97db-db978e1a2d38" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -61,9 +50,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [xlm-roberta-base](https://huggingface.co/xlm-roberta-base) model from HuggingFace as an example\n", @@ -74,48 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 896, - "referenced_widgets": [ - "7271f65ac8c34370927812c6ebc26680", - "be4ae5e77eba4501b68dd4c168e75a70", - "a381d87b7e8c4664b725819cf9e40b5e", - "6da6c5fe9a4e4d86b91b8ba468a9b8fd", - "3a7d01e35a66472885c75e47118f2a7a", - "13ee7dbdd57f43d6a667b1e118fd7306", - "1b63d0cfa2164ce6959fe55bc3d53292", - "570e6b14d24c4bdb90ac3f6d50879280", - "80cd474ae43144e88275a8e0d25f3dad", - "eb76330eb6fd4a66a9d02d1f90447b35", - "690778e1619f40d681ae5346e9ca8f7b", - "19805c06fa8c4336b0d8d0fd04ed16d6", - "64b1edc02ded48109b0db3df4537e2dc", - "240adb86143a4080ae42e63ff4e1a851", - "ee7fa14eb12e4ebe9f8cc6c16edbba73", - "c1b239ba82554cc6b83a1e72c2df9811", - "664e5d3170fb40f78d4f4d044d6b152b", - "1fd84f303c5e4c7db7041c62c675278b", - "cb2daa67db4f42a89781b52f04dbf921", - "3c881124f6264bfe9ecc89c26354ebe9", - "f7c27a24a0ef4027ad58cc8a4663e091", - "4fd9efce28e249df983c39acac900d51", - "5980407785b1454ab0f7422c77ac5bfc", - "4550fa6e3e4545e49e3eb5ff05cc6e3e", - "e79a5512e1a3490494ac78742ec8fe09", - "1fc6028e0c1c4d3996606926b896b9d2", - "9ffab1dc0b364b4d8f52e9bcf6f320fc", - "fca45b67bfdc4d2ebed539985e91bdc3", - "a850b999845b4897ac5bea7349d88d31", - "8fbb65204a6d4b9893a5e87fdd1d1e76", - "53b235bce90b4e668713bf13baa70907", - "70c1f42b905647a49ce528d9289b82d9" - ] - }, - "id": "ZaiirlSKNhVD", - "outputId": "b3a68a21-512d-45f2-abbc-1aa4e88231a1" - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import XLMRobertaTokenizer, TFXLMRobertaModel\n", @@ -152,23 +98,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": 
"code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "p2XCole7TTef", - "outputId": "dc44304c-a042-4230-854c-977024072d36" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -176,7 +114,7 @@ "text": [ "total 2202408\n", "-rw-r--r-- 1 maziyar staff 673 Dec 15 18:14 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:14 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:14 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 1112441536 Dec 15 18:14 tf_model.h5\n" ] } @@ -187,25 +125,19 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "r0DOGz8VUR-r", - "outputId": "d588934e-73c5-492c-dca1-f165ac6a5222" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 18136\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 18:14 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 18:14 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 54 Dec 15 18:14 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 165621 Dec 15 18:14 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 9111290 Dec 15 18:14 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 18:14 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 18:14 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -215,14 +147,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Mcm2UpNxUUQN", - "outputId": "b1c953b5-9550-4fdc-b07a-3c4399cee28d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -241,9 +167,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel 
from `saved_model/1/` path\n", "- we also be needing `sentencepiece.bpe.model` file from the tokenizer\n", @@ -252,10 +176,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# let's copy sentencepiece.bpe.model file to saved_model/1/assets\n", @@ -264,18 +186,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save XLM-RoBERTa in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -283,10 +201,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "8tpW5nkMc53m" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -303,19 +219,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "xGXPlbLdBvbm" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -325,9 +237,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `XlmRoBertaEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `XlmRoBertaEmbeddings` in runtime, so don't worry what you are setting them now\n", @@ -340,10 +250,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -361,19 +269,15 @@ }, { 
"cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "xlm_roberta.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -381,19 +285,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "4W2m4JuVDM3D" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "CnUXH76ADSkL" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -401,9 +301,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -412,22 +310,16 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ogpxSWxOXj3W", - "outputId": "8d8fc13b-427e-44f1-bfe4-2705862f8730" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 2229168\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:15 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 18:15 \u001B[34mmetadata\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:15 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 18:15 \u001b[34mmetadata\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 5069051 Dec 15 18:15 xlmroberta_spp\n", "-rw-r--r-- 1 maziyar staff 1121302747 Dec 15 18:15 xlmroberta_tensorflow\n" ] @@ -439,19 +331,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other 
machines, clusters, or any place you wish to use your new and shiny RoBERTa model 😊 " ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "xlm_roberta_loaded = XlmRoBertaEmbeddings.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -462,15 +350,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "id": "pGRTNISyYlnO", - "outputId": "fc4d45f1-d870-408a-e16e-bbf6710bf33d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -478,7 +359,7 @@ "'xlm_roberta_base'" ] }, - "execution_count": 14, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -489,9 +370,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of XLM-RoBERTa models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -518,13 +397,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - XLNet.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - XLNet.ipynb index 6e5d96d8d7a2a5..3d79af75e97179 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - XLNet.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - XLNet.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "lshuevA3Qv-N" - }, + "metadata": {}, "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark_NLP%20-%20XLNet.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XLNet.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import XLNet models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -25,18 +24,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.4.1` version and Transformers on `4.6.1`. 
This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -46,27 +41,21 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hHXgqiWpMfCY", - "outputId": "7cebd8a2-a930-4c11-d03c-7473bcd50765" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[K |████████████████████████████████| 2.3MB 7.8MB/s \n", - "\u001B[K |████████████████████████████████| 394.3MB 39kB/s \n", - "\u001B[K |████████████████████████████████| 1.2MB 44.4MB/s \n", - "\u001B[K |████████████████████████████████| 3.3MB 35.7MB/s \n", - "\u001B[K |████████████████████████████████| 901kB 40.5MB/s \n", - "\u001B[K |████████████████████████████████| 2.9MB 33.1MB/s \n", - "\u001B[K |████████████████████████████████| 471kB 58.8MB/s \n", - "\u001B[K |████████████████████████████████| 3.8MB 26.3MB/s \n", - "\u001B[?25h" + "\u001b[K |████████████████████████████████| 2.3MB 7.8MB/s \n", + "\u001b[K |████████████████████████████████| 394.3MB 39kB/s \n", + "\u001b[K |████████████████████████████████| 1.2MB 44.4MB/s \n", + "\u001b[K |████████████████████████████████| 3.3MB 35.7MB/s \n", + "\u001b[K |████████████████████████████████| 901kB 40.5MB/s \n", + "\u001b[K |████████████████████████████████| 2.9MB 33.1MB/s \n", + "\u001b[K |████████████████████████████████| 471kB 58.8MB/s \n", + "\u001b[K |████████████████████████████████| 3.8MB 26.3MB/s \n", + "\u001b[?25h" ] } ], @@ -76,9 +65,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [xlnet-base-cased](https://huggingface.co/xlnet-base-cased) model from HuggingFace as an example\n", @@ -88,48 +75,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000, - "referenced_widgets": [ - "b4d7753d59f3415cab82fa08622d127a", - "112b6c83e72a40f8b702827554494a87", - "e15f6cd42fd84bf2ab7d79dece2f7c8c", - "c2dc361222924623ac3e406e3ccbd35e", - "c014617a417847ef8bc74b13568d9e92", - "a5d6a0b40023484dbb6bbebf0807d88a", - "b6d6708bdf3342cf80b1aa2d005d8470", - "556e2b36f6894075bfc126b8201cf1a7", - "2ea5c37fd09340578d533b4335cbbb11", - "65abd5525a79466fa2c04552c0f3dbd8", - "e0eecb92526d47f58e659131e58c8820", - "6098e5b8c2874ddfbed513fa5caa97fc", - "6816aac65f234ab8ae2d3d159dbc3de4", - "82e3d960ecb34ab68d92310296e24a1c", - "de79e461015447aab0e63a97d113adfc", - "8c4b1f595de4444a8b1ac95f35757f8f", - "cbe0d6932baf40958c1b02a4de39a446", - "cc5509259b214cd1ac222eaaa16ad5b0", - "16e1f43fde7a472da03db3c41f60fa8c", - "79eebe79a9094230a33c8bfb4b1ca94e", - "2b45858a80214631a4ddfddce2017fb8", - "b9fcce2249cf44b8821b443ca375b87e", - "798c9374c2b4462d987ce16e56f02f42", - "68be27b6a4f148bc94c9d79cbc28d6b0", - "7e8df6a018fb44b1be17e2fdcc92a9f1", - "4464912dcef245829b53089b3f059b34", - "58792379d7b9440ea0561b900b4f09de", - "df27fafb64064e1a89c32c1cb09e6cf7", - "3873f85331ad46ffa6b237d0989b6439", - "4677a59faf074e3daeb63e9b2ee9401a", - "c48df4bcaf394932a1c7654ecd6cbcb9", - "ec158964c7044eb89051e91c01ceb9dd" - ] - }, - "id": "ZaiirlSKNhVD", - "outputId": "ec1c7a15-991b-4dc6-b3d8-d9e710c4117b" - }, + "metadata": {}, "outputs": [ { "data": { @@ -328,9 +274,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] @@ -338,13 +282,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - 
"colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "p2XCole7TTef", - "outputId": "731e86be-6dec-4868-e778-5b7dc969d89d" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -364,13 +302,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "r0DOGz8VUR-r", - "outputId": "2fd66b31-0055-4d4a-a3f0-3aa431556ed5" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -390,13 +322,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Mcm2UpNxUUQN", - "outputId": "203a25a4-27fe-4d83-c3cb-edab29d4446a" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -415,9 +341,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `spiece.model` file from the tokenizer\n", @@ -427,9 +351,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "metadata": {}, "outputs": [], "source": [ "# let's copy spiece.model file to saved_model/1/assets\n", @@ -438,18 +360,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save XLNet in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -458,9 +376,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "8tpW5nkMc53m" - }, + "metadata": {}, "outputs": [], "source": [ "! 
wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" @@ -468,9 +384,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] @@ -478,9 +392,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "xGXPlbLdBvbm" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -490,9 +402,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `XlnetEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `XlnetEmbeddings` in runtime, so don't worry what you are setting them now\n", @@ -506,9 +416,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "8W_almibVRTj" - }, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -526,9 +434,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] @@ -536,9 +442,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "metadata": {}, "outputs": [], "source": [ "xlnet.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -546,9 +450,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "4W2m4JuVDM3D" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] @@ -556,9 +458,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "CnUXH76ADSkL" - }, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -566,9 +466,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 
!\n", "\n", @@ -578,13 +476,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ogpxSWxOXj3W", - "outputId": "12904d23-ee47-4bd7-ec5d-873296973f57" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -604,9 +496,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBERTa model 😊 " ] @@ -614,9 +504,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "metadata": {}, "outputs": [], "source": [ "xlnet_loaded = XlnetEmbeddings.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -628,14 +516,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "id": "pGRTNISyYlnO", - "outputId": "4073de52-0c10-4884-93f6-1527de9935e0" - }, + "metadata": {}, "outputs": [ { "data": { @@ -646,7 +527,7 @@ "'xlnet_base_cased'" ] }, - "execution_count": 14, + "execution_count": null, "metadata": { "tags": [] }, @@ -659,9 +540,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of XLNet models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -689,8 +568,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" }, "nteract": { "version": "0.28.0" @@ -699,6 +577,7 @@ "application/vnd.jupyter.widget-state+json": { "112b6c83e72a40f8b702827554494a87": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -746,11 +625,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "16e1f43fde7a472da03db3c41f60fa8c": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], @@ -770,11 +649,11 @@ "orientation": "horizontal", "style": "IPY_MODEL_2b45858a80214631a4ddfddce2017fb8", "value": 760 - }, - "model_module_version": "1.5.0" + } }, "2b45858a80214631a4ddfddce2017fb8": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -786,11 +665,11 @@ "_view_name": "StyleView", "bar_color": null, "description_width": "initial" - }, - "model_module_version": "1.5.0" + } }, "2ea5c37fd09340578d533b4335cbbb11": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], @@ -807,11 +686,11 @@ "IPY_MODEL_6098e5b8c2874ddfbed513fa5caa97fc" ], "layout": "IPY_MODEL_65abd5525a79466fa2c04552c0f3dbd8" - }, - "model_module_version": "1.5.0" + } }, "3873f85331ad46ffa6b237d0989b6439": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -823,11 +702,11 @@ "_view_name": 
"StyleView", "bar_color": null, "description_width": "initial" - }, - "model_module_version": "1.5.0" + } }, "4464912dcef245829b53089b3f059b34": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -875,11 +754,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "4677a59faf074e3daeb63e9b2ee9401a": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -927,11 +806,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "556e2b36f6894075bfc126b8201cf1a7": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -979,11 +858,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "58792379d7b9440ea0561b900b4f09de": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], @@ -1003,11 +882,11 @@ "orientation": "horizontal", "style": "IPY_MODEL_3873f85331ad46ffa6b237d0989b6439", "value": 565485600 - }, - "model_module_version": "1.5.0" + } }, "6098e5b8c2874ddfbed513fa5caa97fc": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], @@ -1024,11 +903,11 @@ "placeholder": "​", "style": "IPY_MODEL_de79e461015447aab0e63a97d113adfc", "value": " 1.38M/1.38M [00:01<00:00, 1.06MB/s]" - }, - "model_module_version": "1.5.0" + } }, "65abd5525a79466fa2c04552c0f3dbd8": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1076,11 +955,11 @@ "top": null, 
"visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "6816aac65f234ab8ae2d3d159dbc3de4": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1092,11 +971,11 @@ "_view_name": "StyleView", "bar_color": null, "description_width": "initial" - }, - "model_module_version": "1.5.0" + } }, "68be27b6a4f148bc94c9d79cbc28d6b0": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1144,11 +1023,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "798c9374c2b4462d987ce16e56f02f42": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1159,11 +1038,11 @@ "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" - }, - "model_module_version": "1.5.0" + } }, "79eebe79a9094230a33c8bfb4b1ca94e": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], @@ -1180,11 +1059,11 @@ "placeholder": "​", "style": "IPY_MODEL_798c9374c2b4462d987ce16e56f02f42", "value": " 760/760 [00:00<00:00, 2.27kB/s]" - }, - "model_module_version": "1.5.0" + } }, "7e8df6a018fb44b1be17e2fdcc92a9f1": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], @@ -1201,11 +1080,11 @@ "IPY_MODEL_df27fafb64064e1a89c32c1cb09e6cf7" ], "layout": "IPY_MODEL_4464912dcef245829b53089b3f059b34" - }, - "model_module_version": "1.5.0" + } }, "82e3d960ecb34ab68d92310296e24a1c": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { 
"_model_module": "@jupyter-widgets/base", @@ -1253,11 +1132,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "8c4b1f595de4444a8b1ac95f35757f8f": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1305,11 +1184,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "a5d6a0b40023484dbb6bbebf0807d88a": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1357,11 +1236,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "b4d7753d59f3415cab82fa08622d127a": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], @@ -1378,11 +1257,11 @@ "IPY_MODEL_c2dc361222924623ac3e406e3ccbd35e" ], "layout": "IPY_MODEL_112b6c83e72a40f8b702827554494a87" - }, - "model_module_version": "1.5.0" + } }, "b6d6708bdf3342cf80b1aa2d005d8470": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1393,11 +1272,11 @@ "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" - }, - "model_module_version": "1.5.0" + } }, "b9fcce2249cf44b8821b443ca375b87e": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1445,11 +1324,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "c014617a417847ef8bc74b13568d9e92": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": 
"@jupyter-widgets/controls", @@ -1461,11 +1340,11 @@ "_view_name": "StyleView", "bar_color": null, "description_width": "initial" - }, - "model_module_version": "1.5.0" + } }, "c2dc361222924623ac3e406e3ccbd35e": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], @@ -1482,11 +1361,11 @@ "placeholder": "​", "style": "IPY_MODEL_b6d6708bdf3342cf80b1aa2d005d8470", "value": " 798k/798k [00:03<00:00, 251kB/s]" - }, - "model_module_version": "1.5.0" + } }, "c48df4bcaf394932a1c7654ecd6cbcb9": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1497,11 +1376,11 @@ "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" - }, - "model_module_version": "1.5.0" + } }, "cbe0d6932baf40958c1b02a4de39a446": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], @@ -1518,11 +1397,11 @@ "IPY_MODEL_79eebe79a9094230a33c8bfb4b1ca94e" ], "layout": "IPY_MODEL_cc5509259b214cd1ac222eaaa16ad5b0" - }, - "model_module_version": "1.5.0" + } }, "cc5509259b214cd1ac222eaaa16ad5b0": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1570,11 +1449,11 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } }, "de79e461015447aab0e63a97d113adfc": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", @@ -1585,11 +1464,11 @@ "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" - }, - "model_module_version": "1.5.0" + } }, "df27fafb64064e1a89c32c1cb09e6cf7": { "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], @@ -1606,11 +1485,11 @@ "placeholder": "​", "style": "IPY_MODEL_c48df4bcaf394932a1c7654ecd6cbcb9", "value": " 565M/565M [00:14<00:00, 38.0MB/s]" - }, - "model_module_version": "1.5.0" + } }, "e0eecb92526d47f58e659131e58c8820": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], @@ -1630,11 +1509,11 @@ "orientation": "horizontal", "style": "IPY_MODEL_6816aac65f234ab8ae2d3d159dbc3de4", "value": 1382015 - }, - "model_module_version": "1.5.0" + } }, "e15f6cd42fd84bf2ab7d79dece2f7c8c": { "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], @@ -1654,11 +1533,11 @@ "orientation": "horizontal", "style": "IPY_MODEL_c014617a417847ef8bc74b13568d9e92", "value": 798011 - }, - "model_module_version": "1.5.0" + } }, "ec158964c7044eb89051e91c01ceb9dd": { "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", @@ -1706,8 +1585,7 @@ "top": null, "visibility": null, "width": null - }, - "model_module_version": "1.2.0" + } } } } diff --git a/examples/python/transformers/HuggingFace in Spark NLP - XlmRoBertaForSequenceClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - XlmRoBertaForSequenceClassification.ipynb index aab3e520d470f8..e9a89d40a664e9 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - XlmRoBertaForSequenceClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - XlmRoBertaForSequenceClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlmRBertaForSequenceClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlmRoBertaForSequenceClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import XlmRoBertaForSequenceClassification models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. 
This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -48,14 +43,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hHXgqiWpMfCY", - "outputId": "66f8c987-e55a-41fc-bbe9-946afe2b1974" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -63,9 +52,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. We will use that to save it as TF `SavedModel`.\n", "- We'll use [cardiffnlp/twitter-xlm-roberta-base-sentiment](https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment) model from HuggingFace as an example\n", @@ -74,61 +61,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 321, - "referenced_widgets": [ - "0dfec3fc2e48468da2c85978a6839ed2", - "b440e3352310400aafe110cf59edd3d8", - "4143e0ef0efb409bbab93ff6c65df55f", - "ef9e5fc46dbc4b459bf57d0efd3e0570", - "6c70202517d54b0daa4d1b86f16d6d50", - "9ae66f85eec94b998c5b0297b6ec48cd", - "d6e441f5a874452daa284566a22f69b4", - "73a872c58cdc4e3cbcd7cf4a97c610ce", - "2ca8c7607dfb4ebc977f749e07485b73", - "77704c04cfed4f9592314635adf22861", - "fd5cd2cc8cf840cca1f77b7b596e704d", - "45f3f69c773b47f9b604453dd47c96a3", - "b78538d42bce49b8893b03d06d092478", - "8bd229b4337f4d829c5eaa61f365bb42", - "c905675a8a5545dcbdd67467294755dc", - "2c0bcec26bc7411faf197122966581bc", - "06dc05f1f4ee4fd7962a7a644011f094", - "e4fd69a7568a432d87096a5f3dfe954c", - "095192c21301490baf48796fed1e19d0", - "bc49be0b269648338137a5e7108cd742", - "ebe63c4a2d04488293aa47f1438652fd", - "b2f6e2f73cb5423c8fc9f7d210158e79", - 
"5a5cc12ec2004af497d68101e1b00052", - "e435d6df9aa54f88899e5cc7d52d1c75", - "ffab44017fc94958a232e4bc37efad0b", - "b40c930fd22b4075a0e929f8b2abfcd3", - "0e2cf169d8c64619a0910f40bcc7cd0b", - "dd1755b8ce294c33a817ea2ebac1868f", - "0c894efe0a91494f909cde332d6447fa", - "1b6023939d8442ef8824b8299a212659", - "3679e979fc9443c5819bb5951a41666f", - "1ff5391d7fc042edb2d4a7e85585158b", - "73b53f223e4c45568d065f63d44a2e95", - "86d6f003523f44f398c388b45434f214", - "b2e8e3a080124f8c96537d13f287f590", - "b18cbf0ddcce48609dc4a321b296a817", - "a7571d0cc2984063a4056ccad28d5903", - "fd5ec6acd2de42fdb7d9b40312b80ce7", - "21c2ec038b444514af2caefbd09df224", - "640f4fd8c89d45b48f0791f50546c036", - "08f3e20dd9444d81bdbebd1d88037fef", - "51c9b138088a4447988e4af39b036467", - "fd188f14aa97479399ce8c336bc553f5", - "12fbfbfccd4e4dbaa870de2c9fd81471" - ] - }, - "id": "ZaiirlSKNhVD", - "outputId": "6e58ccd4-babd-42ab-d402-6cc4021bccdb" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -187,23 +121,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "p2XCole7TTef", - "outputId": "aca78d35-6d4f-46bf-d0d2-6513dfd3d890" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -211,7 +137,7 @@ "text": [ "total 2202472\n", "-rw-r--r-- 1 maziyar staff 915 Dec 15 18:34 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:34 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:34 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 1112473408 Dec 15 18:34 tf_model.h5\n" ] } @@ -222,25 +148,19 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, 
- "id": "r0DOGz8VUR-r", - "outputId": "7860ceff-0778-48ec-d31f-1c28521a9ec8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 18968\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 18:34 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 18:34 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 55 Dec 15 18:34 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 167652 Dec 15 18:34 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 9535557 Dec 15 18:34 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 18:34 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 18:34 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -250,14 +170,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Mcm2UpNxUUQN", - "outputId": "3e4c2f48-a173-4e33-bad5-244cf2e40a00" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -276,9 +190,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `sentencepiece.bpe.model` file from the tokenizer\n", @@ -288,10 +200,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -302,10 +212,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -319,23 +227,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! 
We have our `vocab.txt` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OYnT5U8N9dxT", - "outputId": "aac822ce-acbc-4df6-e55c-69aed03708c5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -353,18 +253,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save XlmRoBertaForSequenceClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -372,10 +268,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "8tpW5nkMc53m" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -392,19 +286,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -414,9 +304,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `XlmRoBertaForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `XlmRoBertaForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -427,15 +315,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 364 - }, - "id": 
"8W_almibVRTj", - "outputId": "fbe3588e-602c-46b4-f439-27e74a96ebc2" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -450,19 +331,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -470,19 +347,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -490,9 +363,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -501,22 +372,16 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ogpxSWxOXj3W", - "outputId": "b5e25591-0f93-4500-8e52-a4f76ef3e91e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 2231984\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 18:35 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 18:35 \u001B[34mmetadata\u001B[m\u001B[m\n", + "drwxr-xr-x 4 maziyar staff 128 Dec 15 18:35 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 18:35 \u001b[34mmetadata\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 1121735053 Dec 15 18:35 xlm_roberta_classification_tensorflow\n", 
"-rw-r--r-- 1 maziyar staff 5069051 Dec 15 18:35 xlmroberta_spp\n" ] @@ -528,19 +393,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny XlmRoBertaForSequenceClassification model 😊 " ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier_loaded = XlmRoBertaForSequenceClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -557,14 +418,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pGRTNISyYlnO", - "outputId": "e92e3289-5fb5-4809-f507-76fd5235d0fd" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -572,7 +427,7 @@ "['positive', 'negative', 'neutral']" ] }, - "execution_count": 16, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -590,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -637,9 +492,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of `XlmRoBertaForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -665,13 +518,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - XlmRoBertaForTokenClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - XlmRoBertaForTokenClassification.ipynb index 2b29e425f1b562..d0ee9315d696d9 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - XlmRoBertaForTokenClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - XlmRoBertaForTokenClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlmRoBertaForTokenClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlmRoBertaForTokenClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import XlmRoBertaForTokenClassification models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - 
}, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -48,25 +43,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 95771, - "status": "ok", - "timestamp": 1640707909485, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3134cc48-78bc-4e03-a79f-748292f7d0a1" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -74,9 +52,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [xlm-roberta-large-finetuned-conll03-english](https://huggingface.co/xlm-roberta-large-finetuned-conll03-english) model from HuggingFace as an example\n", @@ -85,21 +61,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "executionInfo": { - "elapsed": 352, - "status": "ok", - "timestamp": 1640708841457, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -151,34 +114,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 494, - "status": "ok", - "timestamp": 1640708154100, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "7bd16979-4e59-4f6e-d685-4b0f882b5bcc" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -186,7 +130,7 @@ "text": [ "total 4395720\n", "-rw-r--r-- 1 maziyar staff 1046 Dec 15 18:44 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:44 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:44 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 2235947880 Dec 15 18:44 tf_model.h5\n" ] } @@ -197,36 +141,19 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 511, - "status": "ok", - "timestamp": 1640708154608, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "49b86052-ec5c-4a97-959d-c2aa5c3b8df5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 37200\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 18:44 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 18:44 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 54 Dec 15 18:44 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 321614 Dec 15 18:44 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 18717362 Dec 15 18:44 saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 18:44 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 18:44 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -236,25 +163,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 8, - "status": "ok", - "timestamp": 1640708154609, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "5068af51-5a09-4a60-866b-96b4f4bdd083" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -273,9 +183,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `sentencepiece.bpe.model` file from the tokenizer\n", @@ -285,10 +193,8 @@ }, { 
"cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -299,10 +205,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -316,34 +220,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! We have our `vocab.txt` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 228, - "status": "ok", - "timestamp": 1640708155273, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "89764651-6a64-4b11-aaaa-f031a4284e1a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -361,18 +246,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save XlmRoBertaForTokenClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -380,25 +261,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 7553, - "status": "ok", - "timestamp": 1640708780913, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": 
"https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "2677b2fd-477a-4530-c98b-a8a1ccbd2baa" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -415,30 +279,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "executionInfo": { - "elapsed": 33750, - "status": "ok", - "timestamp": 1640708814657, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -448,9 +297,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `XlmRoBertaForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `XlmRoBertaForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -461,21 +308,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "executionInfo": { - "elapsed": 2, - "status": "ok", - "timestamp": 1640708858933, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ 
-491,19 +325,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -511,19 +341,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "! rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -531,9 +357,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -542,33 +366,16 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 16, - "status": "ok", - "timestamp": 1640708814658, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "7fc4e69f-3ab2-4ddc-a3b0-6de95f018c91" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 4432048\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 18:45 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 18:45 \u001B[34mmetadata\u001B[m\u001B[m\n", + "drwxr-xr-x 4 maziyar staff 128 Dec 15 18:45 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 18:45 
\u001b[34mmetadata\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 2254172695 Dec 15 18:45 xlm_roberta_classification_tensorflow\n", "-rw-r--r-- 1 maziyar staff 5069051 Dec 15 18:45 xlmroberta_spp\n" ] @@ -580,30 +387,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny XlmRoBertaForTokenClassification model 😊 " ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "executionInfo": { - "elapsed": 88864, - "status": "ok", - "timestamp": 1640708950792, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier_loaded = XlmRoBertaForTokenClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -613,19 +405,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "BDWNWdBlBpHi" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "pGRTNISyYlnO" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -633,7 +421,7 @@ "['B-LOC', 'I-ORG', 'I-MISC', 'I-LOC', 'I-PER', 'B-MISC', 'B-ORG', 'O']" ] }, - "execution_count": 16, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -645,34 +433,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "UvRBsP2SBpHi" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - 
"elapsed": 15729, - "status": "ok", - "timestamp": 1640708966516, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "c13a1827-770f-48a6-bba6-eda25077f8ef" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -714,9 +483,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `XlmRoBertaForTokenClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -742,13 +509,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/transformers/HuggingFace in Spark NLP - XlmRobertaForQuestionAnswering.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - XlmRobertaForQuestionAnswering.ipynb index f57619e39c9b90..825353027d67eb 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - XlmRobertaForQuestionAnswering.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - XlmRobertaForQuestionAnswering.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlmRobertaForQuestionAnswering.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlmRobertaForQuestionAnswering.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import XlmRoBertaForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,9 +26,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] @@ -37,9 +34,7 @@ { "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.11.0` version and Transformers on `4.25.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -48,25 +43,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 97075, - "status": "ok", - "timestamp": 1640696490534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "hHXgqiWpMfCY", - "outputId": "3e56840b-f4e1-4391-ce82-3d8136e8990c" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.25.1 tensorflow==2.11.0 sentencepiece" @@ -74,9 +52,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside 
`save_pretrained` function for TensorFlow based models. We will use that to save it as TF `SavedModel`.\n", "- We'll use [deepset/xlm-roberta-base-squad2](https://huggingface.co/deepset/xlm-roberta-base-squad2) model from HuggingFace as an example\n", @@ -85,105 +61,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 435, - "referenced_widgets": [ - "47dac9ef87fd4c5ca9a61d2cea256596", - "ce38947889204d1eb23c4a414d8e5208", - "2da64fb5519d420783cabae619f3b952", - "0784faf7b3784e2fb5856d8ca6248654", - "f2c8a9d039864796ad4495a3fc748b8a", - "4d41832a7c7f4ff6af11043759050846", - "97d4aab21aea4a30996a2399f7c58b1d", - "b0c3a334fc5c49f19a2911227190e18f", - "68e0a6c49a2d4fea8c81b8b1bfabfcd5", - "8fe11dbcbad6402ebb392316b90fbd4c", - "e6bfed8858df4404a958f9a0c5efdf61", - "b3cba7624d89414581b69a8804cdf5eb", - "6910684eaf584454b1b0b38da1851284", - "4771514aa5b44e5ea05f18aa6ef73008", - "1743adef69ba48b2a78e312121e1ff95", - "cf43d892dc5f45df80e87b77c378074e", - "19df597d10364f94b41991bfc4b0e039", - "1265068d2c4d4ff0b7ab480bd3fe2342", - "7ad895b923ad4fcfae33f38485d46690", - "f25af430b7c34f1b9cecb003aba253aa", - "a7d6155372a94ab185aa4d648603a677", - "1cca3cd83e4a48caa4ca67eb84e0d65c", - "85152c67f8424559a5b2334dce66b6c1", - "c03f7b608dbf416bb59626a47f4ec63e", - "a956903ad8194c4a9806f27ea0741773", - "5715e0c21cce4cee91a33e42beb48226", - "34ef44ce578847ca93e1e361ac6c6068", - "ffd12d9337cd4681afd51a74f77503f5", - "38e5d4d80eb1456e96fbaba2836e8030", - "5f4b9df77c6249c9874fb4cd7fc87962", - "d2ebd46bf924436cba4c7cdf8a666731", - "1fd718b370c8454bb4f63cd5d97e4649", - "beca0d66f4e94d8db677761102717623", - "7016f4970cbb46b99ee0b61f91529bc3", - "d04c456268b048ffbe3c00cccbf4390d", - "ebbbb05d599f451cb08a8dc6972a48bd", - "aa680bf2fba94b89819124d1764fd5fe", - "395fbcecbde042419bd7e0e99298b8a2", - "75812a9dedc343a9bacef9cb3ee1d8a0", - "69dc223e5de2449189995b7a116a0cc7", - "200aa3c11c1b4f2294935d5b91e844e3", - 
"f288ae4807364757b1f727e02c8d76b7", - "028bdbafc40e47c4bc7f1dda920630a7", - "c64ad3e7f7a9403f940367b8ffb4540e", - "cd1df8c0a9e64eab89d894ee0697f330", - "b601ce600b6b4b8a9d609487263f9d58", - "63d534091c114485a89af24ff0c3e574", - "c3c2541de6e34033b5298bd449c177ca", - "4bfda2c0b7fc4e96a7480c639ed2909b", - "983a3c073854484ca0c50ff238149ad7", - "10888dcf7383452e8e78475beed266de", - "edf6984a708b43b5ad25fb6b04f211a7", - "ac44ce9590df4690b1e1337eb5caf623", - "f3633266f7b84a8497936c2ef5b780fd", - "663cce4987904af48951a64093a47108", - "a3d2f9f8f9754f9b8134c52b7cfaca19", - "6637ecfad7594cac96e5bf703b6ab5da", - "0d3442a75c2b4a6082c9581ab0621592", - "86eadc1d973e4f6a9270fe934992d3f6", - "af52df20197b457882647e636171c83a", - "a6e2dfe0ca474d25b8f43506930a3798", - "a81ea939fe4d440cb6dcd2d87557579e", - "c0c856879cff4c29b8d45b0abfb94a22", - "0c8e5c545fa948b5bf26b7f3d2801dc1", - "118ef92501eb4c5f8c29323739516a1a", - "50ac811bc42b474d82eca728897dc596", - "b13f4e9eb777499ab6d5fc0ccaeac074", - "207abaeff8a94953a889804fc5e88b2d", - "6f13c00ef5f44adca80b0d5b9ce8c4d2", - "cae4eda19aed4598b3c97a3633c224d3", - "bf22edbb769d46abb23c352dc370f5ad", - "cf45db79df5241b1b579d765cd737953", - "0959fb1f18794a559ae6f1849a3eb5a9", - "620d95c4cdcd4f23ab17377da0485cf8", - "bdfbfe93e9cc4d878008d332f1c5860b", - "c2845632b7fb4b71b95b7eff29efb667", - "3b06e84b5b494bfd920ee661392967f5" - ] - }, - "executionInfo": { - "elapsed": 68690, - "status": "ok", - "timestamp": 1640696559216, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ZaiirlSKNhVD", - "outputId": "2d42f5ad-db10-44de-b319-75a6309df876" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -313,34 +192,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two 
directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1640696559217, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "p2XCole7TTef", - "outputId": "441fca3b-ab35-4d49-d567-4da91e1ad528" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -348,7 +208,7 @@ "text": [ "total 2197848\n", "-rw-r--r-- 1 maziyar staff 787 Dec 15 18:30 config.json\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:29 \u001B[34msaved_model\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:29 \u001b[34msaved_model\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 1110105320 Dec 15 18:30 tf_model.h5\n" ] } @@ -359,36 +219,19 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 215, - "status": "ok", - "timestamp": 1640696559428, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "r0DOGz8VUR-r", - "outputId": "dad1fb58-d331-491f-a83d-ff002e88d079" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 18768\n", - "drwxr-xr-x 2 maziyar staff 64 Dec 15 18:29 \u001B[34massets\u001B[m\u001B[m\n", + "drwxr-xr-x 2 maziyar staff 64 Dec 15 18:29 \u001b[34massets\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 55 Dec 15 18:29 fingerprint.pb\n", "-rw-r--r-- 1 maziyar staff 165560 Dec 15 18:30 keras_metadata.pb\n", "-rw-r--r-- 1 maziyar staff 9434510 Dec 15 18:30 
saved_model.pb\n", - "drwxr-xr-x 4 maziyar staff 128 Dec 15 18:29 \u001B[34mvariables\u001B[m\u001B[m\n" + "drwxr-xr-x 4 maziyar staff 128 Dec 15 18:29 \u001b[34mvariables\u001b[m\u001b[m\n" ] } ], @@ -398,25 +241,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 202, - "status": "ok", - "timestamp": 1640696559628, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "3b52acdf-5ecf-4582-9a6e-3ddc89bc487e" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -435,9 +261,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- As you can see, we need the SavedModel from `saved_model/1/` path\n", "- We also be needing `sentencepiece.bpe.model` from the tokenizer\n", @@ -446,10 +270,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -459,34 +281,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! 
We have our `sentencepiece.bpe.model` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 234, - "status": "ok", - "timestamp": 1640696560064, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "OYnT5U8N9dxT", - "outputId": "db11e138-f83f-4a0d-cab5-6c4dc1eaa4d4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -503,18 +306,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save XlmRoBertaForQuestionAnswering in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -522,25 +321,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 44473, - "status": "ok", - "timestamp": 1640696604534, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "8tpW5nkMc53m", - "outputId": "b956466b-03d6-4f56-88d4-28f920a6d113" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -557,19 +339,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + 
"metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -579,9 +357,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `XlmRoBertaForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `XlmRoBertaForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -592,10 +368,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -613,19 +387,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "spanClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -633,19 +403,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -653,9 +419,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -664,33 +428,16 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 2392, - "status": "ok", - "timestamp": 1640696670840, - 
"user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "ogpxSWxOXj3W", - "outputId": "995582ac-5e30-46ed-baef-1ad8a5387f30" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 2198320\n", - "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:30 \u001B[34mfields\u001B[m\u001B[m\n", - "drwxr-xr-x 6 maziyar staff 192 Dec 15 18:30 \u001B[34mmetadata\u001B[m\u001B[m\n", + "drwxr-xr-x 3 maziyar staff 96 Dec 15 18:30 \u001b[34mfields\u001b[m\u001b[m\n", + "drwxr-xr-x 6 maziyar staff 192 Dec 15 18:30 \u001b[34mmetadata\u001b[m\u001b[m\n", "-rw-r--r-- 1 maziyar staff 1119269627 Dec 15 18:31 xlm_roberta_classification_tensorflow\n", "-rw-r--r-- 1 maziyar staff 5069051 Dec 15 18:31 xlmroberta_spp\n" ] @@ -702,34 +449,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny XlmRoBertaForQuestionAnswering model in Spark NLP 🚀 pipeline! " ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11346, - "status": "ok", - "timestamp": 1640696711994, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -60 - }, - "id": "MysnSyi8BpHi", - "outputId": "b7ffe817-c5ad-41b3-85b6-ad04aef16e65" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -766,9 +494,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! 
You can now go wild and use hundreds of `XlmRoBertaForQuestionAnswering` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -794,13 +520,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - }, - "vscode": { - "interpreter": { - "hash": "59794f394f79a45d9851d6706177d59b9a5e9d735b0369dbae4b76bccf016251" - } + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/HuggingFace in Spark NLP - XlnetForSequenceClassification.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - XlnetForSequenceClassification.ipynb index a95c73bc4a0c7e..1e1195f02b246c 100644 --- a/examples/python/transformers/HuggingFace in Spark NLP - XlnetForSequenceClassification.ipynb +++ b/examples/python/transformers/HuggingFace in Spark NLP - XlnetForSequenceClassification.ipynb @@ -1,19 +1,18 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "2vXYNX2lQROB" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlnetForSequenceClassification.ipynb)" + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlnetForSequenceClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ "## Import XLNetForSequenceClassification models from HuggingFace 🤗 into Spark NLP 🚀 \n", "\n", @@ -27,18 +26,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": 
{}, "source": [ "## Export and Save HuggingFace model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- Let's install `HuggingFace` and `TensorFlow`. You don't need `TensorFlow` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", "- We lock TensorFlow on `2.4.4` version and Transformers on `4.15.0`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", @@ -48,9 +43,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "hHXgqiWpMfCY" - }, + "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers==4.15.0 tensorflow==2.4.4 sentencepiece" @@ -58,9 +51,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Y3AM6bj4P3NS" - }, + "metadata": {}, "source": [ "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", "- We'll use [mohsenfayyaz/xlnet-base-cased-toxicity](https://huggingface.co/mohsenfayyaz/xlnet-base-cased-toxicity) model from HuggingFace as an example\n", @@ -70,93 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 756, - "referenced_widgets": [ - "e1fc0a4d6ec54a62aae134b855f9bf7a", - "1900a259fd4a4d098f2f1c5d40c525a9", - "4a3f42cc8c3d4ec2aad7d53997bb5ff5", - "bdb8a9f473b84c48a2798fb9124fed03", - "a94a622315e045d8a8028bbd6a5068a2", - "1b8a5def4d194653b3e0bec831eaaabf", - "6acbaf6a0a1140a3a6015a3f61d9898f", - "52af864ec283456989a74f2984660779", - "b0bda760efd449e48f11ba41260fa699", - "a979e306af0341d6899d85d6f5230a19", - "19075a2e10324843b5e8c3a4aa8e9e53", - "1f7865cccbdd43619ea98fd8f5a14f8b", - "b2c46dabf83f489bba962298e2ecb710", - "57b9d3e735c7486d872ea2b0b73a3677", - "c9100fded1ef49779deccb8f9fb24d5e", - "8c894774bdd544b9874f3ebdfd131146", - "ae287fa050b744adb89541976956a551", - "00ae5c6d386744f3b0589b95d8af1b94", - "6686a498f46d4945a77aa8471682c0d1", - "d79e3a5501e8441f82535c964261401b", - "098428e313f34f26a9e2720aa2dbf530", - "4d433874cb614632a9c1e60a805f681f", - "1e5d422af6d64492a8cb794f8de39ecb", - "2088dd75202942678c7e3d3099b0ecda", - "525a0dc4876c43cb9934453e83a071c5", - "0f10d12748dc46e980cea8fa9c810ed6", - "e6aa9ca934f541e0926ff8124dcbc52a", - "37f8a228dc314a0f9e316d4c76408e21", - "cd8a643829ba45639d0ab9c6d8261065", - "f9b622ef455a4678a7b4d04c37eaeaeb", - "205e28c15bfb4562bd1e57e2e38e55ca", - "48919d4cdc4343f083e61980230a3593", - "785c2b642c3d4e1dab9d668e9b265ad2", - "8b5863b19c5d4c1a88432a629d12a54d", - "4f8c267af7db4940bc885321aa1eff32", - "9a79361819774bd5a9ffc66f5d009be4", - "f0f4fe39bb684389898f0e5bb8befdd2", - "df2ac87d416d4e9491b405d87a6843c9", - "67468a34a00d4afda58deb4cd5f7ffb8", - "6234eafaa2854beabf40386e2dd14040", - "a1d0b0feb55947a2902610e4d1cc694a", - "b7b3988d15574c4eb3584272afb66061", - 
"f1fb22fd219f40f68625386b35ee7fde", - "e80e40e02c054ec99c239e366905259b", - "a5ab2ca68dc0459b9e027f113184ba50", - "95bd8ac5c4544ce7a826623d61cddf08", - "bb14c43fc8c04748b24bed0d3872b2aa", - "eb4f935d3ad947c29d2f9a1346238759", - "4cefc208ac634f218d7136c799e9b22c", - "94423182f8c940e3bc2c1f4353eab2f8", - "fe3e2e405c0543dda602cae3ec200cbc", - "9744ef999f49428fa5d43af1180712fb", - "e2be8dbfcdd34899b16f13ee9c5f3586", - "1ecb5f9d496a4e59b814a0fe81082746", - "ea769f21031d495bb46a4d8ade68658c", - "f514faa20bec40acb77e49005d7f8e34", - "ca77a1edc0b8401a83215fc7657acbf7", - "16c80bbca74a44afac6944ee3a5aba81", - "bb695d93b6c54f0fb83763de7270e10e", - "3cd33166f4be45c39257d55ab756b7c8", - "65587021eb3649a799f7d69117045216", - "c20fb03b71aa40e29a77d8f8bdf8043d", - "fa4fdc5ffe924af086161990c22f4f47", - "4d1275aab38546449db6a1eb22979031", - "7b464c4c2a14481ab9d3722b306a5d63", - "6f3582e6d41647898b1b00fef09ffbb4" - ] - }, - "executionInfo": { - "elapsed": 63652, - "status": "ok", - "timestamp": 1632137295438, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ZaiirlSKNhVD", - "outputId": "8b8f0fb9-37e5-4893-ada0-3fb15d851281" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -252,9 +157,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] @@ -262,24 +165,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 18, - "status": "ok", - "timestamp": 1632137295439, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": 
-120 - }, - "id": "p2XCole7TTef", - "outputId": "67bd9f17-ba94-4940-9702-a717343a8fee" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -299,24 +185,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 288, - "status": "ok", - "timestamp": 1632137295723, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "r0DOGz8VUR-r", - "outputId": "3de08dd0-c2ae-43bb-d8fd-41b1f3ba9f47" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -336,24 +205,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 281, - "status": "ok", - "timestamp": 1632137296002, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "Mcm2UpNxUUQN", - "outputId": "2a94bc16-0d09-4cb3-e58d-e7a638b0a579" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -372,9 +224,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, we need the SavedModel from `saved_model/1/` path\n", "- we also be needing `spiece.model` file from the tokenizer\n", @@ -385,9 +235,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "ez6MT-RTT7ss" - }, + "metadata": {}, "outputs": [], "source": [ "asset_path = '{}/saved_model/1/assets'.format(MODEL_NAME)\n", @@ -399,9 +247,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "vcg_5YP1-vfC" - }, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary \n", @@ -415,9 +261,7 @@ }, { 
"cell_type": "markdown", - "metadata": { - "id": "mBq7ztzlACYO" - }, + "metadata": {}, "source": [ "Voila! We have our `vocab.txt` and `labels.txt` inside assets directory" ] @@ -425,24 +269,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 323, - "status": "ok", - "timestamp": 1628497252447, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "OYnT5U8N9dxT", - "outputId": "8d5068a4-0395-401a-fb19-0ed60300be1c" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -460,18 +287,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save XlnetForSequenceClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -480,9 +303,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "8tpW5nkMc53m" - }, + "metadata": {}, "outputs": [], "source": [ "! 
wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" @@ -490,9 +311,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] @@ -500,9 +319,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -512,9 +329,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `XlnetForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `XlnetForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -525,9 +340,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "9Bviq68HBlQM" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -555,9 +368,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "8W_almibVRTj" - }, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -572,9 +383,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] @@ -582,9 +391,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" @@ -592,9 +399,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] @@ -602,9 +407,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - 
"id": "ZgkVIJshDtLx" - }, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {MODEL_NAME}_tokenizer {MODEL_NAME}" @@ -612,9 +415,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -624,24 +425,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 291, - "status": "ok", - "timestamp": 1632137856170, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ogpxSWxOXj3W", - "outputId": "35dea086-25d2-4029-df4e-663905aafd77" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -661,9 +445,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny XlnetForSequenceClassification model 😊 " ] @@ -671,9 +453,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier_loaded = XlnetForSequenceClassification.load(\"./{}_spark_nlp\".format(MODEL_NAME))\\\n", @@ -683,9 +463,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QukRkXhQBlQO" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] @@ -693,24 +471,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 5, - "status": "ok", - "timestamp": 1632137863887, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": 
"06037986691777662786" - }, - "user_tz": -120 - }, - "id": "pGRTNISyYlnO", - "outputId": "60046377-bfd4-4c5e-e392-f78841e6bfe8" - }, + "metadata": {}, "outputs": [ { "data": { @@ -718,7 +479,7 @@ "['Non-Toxic', 'Toxic']" ] }, - "execution_count": 14, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -729,9 +490,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "KAiPa3yyBlQO" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark pipeline:" ] @@ -739,10 +498,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "scK1OqhABlQO", - "outputId": "936665a5-de85-456c-b7a1-f82d72f3bf70" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -785,9 +541,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `XlnetForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀 \n" ] @@ -813,8 +567,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/Import External SavedModel From Remote Storage.ipynb b/examples/python/transformers/Import External SavedModel From Remote Storage.ipynb index bee84e3738a9ee..b4f5be47288ed3 100644 --- a/examples/python/transformers/Import External SavedModel From Remote Storage.ipynb +++ b/examples/python/transformers/Import External SavedModel From Remote Storage.ipynb @@ -1,40 +1,28 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", + "metadata": {}, "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/Import%20External%20SavedModel%20From%20Remote%20Storage.ipynb)" - ], - "metadata": { - "id": "lshuevA3Qv-N", - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "d9be2182-2a6c-4971-b524-7d6900906d63" - } - } + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/Import%20External%20SavedModel%20From%20Remote%20Storage.ipynb)" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# This is only needed to setup PySpark and Spark NLP on Colab\n", "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "f1de6429-2d6e-47e6-85ba-3a76e0b3958f" - }, - "id": "Nt0jHURxzPTY" - }, - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "# Import External SavedModel From Remote Storage\n", "\n", @@ -43,37 +31,23 @@ "This feature allows you to load external models (for example exported models from the transfomers library) from various remote locations. These include dbfs, hdfs and s3.\n", "\n", "For this example we will load an ALBERT model from the transformers library. On how to prepare the model and to export it properly, see the tutorials for the respective transformer at the [following discussion](https://github.com/JohnSnowLabs/spark-nlp/discussions/5669)!" 
- ], - "metadata": { - "id": "Zva6MvJyLeWi", - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "c9ac9309-e601-4215-8db2-fc5305c34705" - } - } + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Loading Models from the Databricks File System (DBFS)\n", "First, make sure you have Spark NLP installed on your cluster.\n", "\n", "You can load models from a directory on DBFS by providing a path with the `dbfs:/` protocol." - ], - "metadata": { - "id": "MzxB-Nq6cxOA", - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "87ff4cde-67b8-4704-90a7-15718d8314a7" - } - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import sparknlp\n", "from sparknlp.annotator import *\n", @@ -89,30 +63,20 @@ " .setCaseSensitive(False)\\\n", " .setDimension(768)\\\n", " .setStorageRef('albert_base_uncased') \n" - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "6cfc21e6-0bbc-4877-b3e7-66273238d9ae" - }, - "id": "66MYkxENzPTb" - }, - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "If the file is on local file storage, it is asvisable to append the `file:/` protocol so that the correct path is resolved." 
- ], - "metadata": { - "id": "X2227WQ70npi" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import sparknlp\n", "from sparknlp.annotator import *\n", @@ -128,33 +92,23 @@ " .setCaseSensitive(False)\\\n", " .setDimension(768)\\\n", " .setStorageRef('albert_base_uncased') \n" - ], - "metadata": { - "id": "dBMhszZi0xZl" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Loading Models from the Hadoop File System (HDFS)\n", "You can load models from a directory on HDFS by providing a path with the `hdfs:/` protocol. \n", "\n", "Here, the hdfs endpoint is reachable under `localhost:9000`." - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "c2666104-9516-4e84-81a1-66416a969120" - }, - "id": "5hQWu39NzPTb" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import sparknlp\n", "from sparknlp.annotator import *\n", @@ -170,57 +124,33 @@ " .setCaseSensitive(False)\\\n", " .setDimension(768)\\\n", " .setStorageRef('albert_base_uncased') \n" - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "5dfdc55c-d2fc-422b-b549-38b78aa21b09" - }, - "id": "rPpd3fyEzPTc" - }, - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Loading Models from S3\n", "You can load models from a directory on S3 by providing a path with the `s3:/` protocol. \n", "\n", - "You will need to create a custom Spark session with the proper credentials and permissions to access a directory on the s3 bucket. 
To see an example on how to set up access with temporary credentials see [Load Model From S3 from the SparkNLP Workshop](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/Load_Model_From_S3.ipynb).\n", + "You will need to create a custom Spark session with the proper credentials and permissions to access a directory on the s3 bucket. To see an example on how to set up access with temporary credentials see [Load Model From S3 from the SparkNLP Workshop](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/examples/prediction/english/Load_Model_From_S3.ipynb).\n", "\n", "In this example, the bucket that will be used is called `johnsnow` and its region is `us-east-1`." - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "5bbc6544-aacb-4a52-86b1-37d3794ff118" - }, - "id": "mdpBKnGTzPTc" - } + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Anonymous Access\n", "If the bucket is publicly accesible, then a Spark session with s3 support can be created like this to load the model from the bucket:" - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "f13861fc-649c-4883-9c79-28c3e4016a50" - }, - "id": "qPcL_PguzPTd" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from pyspark.sql import SparkSession\n", "from sparknlp.annotator import *\n", @@ -246,93 +176,53 @@ " .setCaseSensitive(False)\\\n", " .setDimension(768)\\\n", " .setStorageRef('albert_base_uncased') \n" - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "c3612e0c-5fea-4565-a255-30a0cb6e87b7" - }, - "id": "xAuLqKq8zPTd" - }, - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Restricted 
Access\n", "If the bucket needs credentials, then a Spark session with s3 support can be created like this to load the model from the bucket (taken from the workshop example).\n", "\n", "Note that `MY_ACCESS_KEY`, `MY_SECRET_KEY`, `MY_SESSION_KEY` need to be set for this example to work." - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "0d44a995-a96e-45c9-a2fc-4d387c667e80" - }, - "id": "gsap1D7uzPTe" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(\"Enter your AWS Access Key:\")\n", "MY_ACCESS_KEY = input()" - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "8dd30b65-3f31-4b1a-9c7d-daca88d9ee37" - }, - "id": "qjjL_Ez0zPTe" - }, - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(\"Enter your AWS Secret Key:\")\n", "MY_SECRET_KEY = input()" - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "05dce6bc-9820-488f-a735-27a786d48253" - }, - "id": "tjojm4vczPTf" - }, - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(\"Enter your AWS Session Key:\")\n", "MY_SESSION_KEY = input()" - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "1c707c44-b63b-442d-8869-4280a11ef94b" - }, - "id": "8pjzIQ_tzPTf" - }, - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from pyspark.sql import SparkSession\n", "from sparknlp.annotator import *\n", @@ -363,52 +253,40 @@ " .setCaseSensitive(False)\\\n", " 
.setDimension(768)\\\n", " .setStorageRef('albert_base_uncased') \n" - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "title": "", - "showTitle": false, - "inputWidgets": {}, - "nuid": "5c181b23-d184-47b0-ab21-0d5ca1ff68f7" - }, - "id": "MHPagZILzPTf" - }, - "outputs": [], - "execution_count": null + ] } ], "metadata": { - "language_info": { - "mimetype": "text/x-python", - "name": "python", - "pygments_lexer": "ipython3", - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "version": "3.8.10", - "nbconvert_exporter": "python", - "file_extension": ".py" - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "application/vnd.databricks.v1+notebook": { - "notebookName": "Import External SavedModel From Remote Storage", "dashboards": [], + "language": "python", "notebookMetadata": { "pythonIndentUnit": 2 }, - "language": "python", - "widgets": {}, - "notebookOrigID": 3917489032437656 + "notebookName": "Import External SavedModel From Remote Storage", + "notebookOrigID": 3917489032437656, + "widgets": {} }, "colab": { "collapsed_sections": [], "provenance": [] }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, "nteract": { "version": "0.28.0" } diff --git a/examples/python/transformers/TF Hub in Spark NLP - ALBERT.ipynb b/examples/python/transformers/TF Hub in Spark NLP - ALBERT.ipynb index 0aab19fb373a23..25c104d8cdfccb 100644 --- a/examples/python/transformers/TF Hub in Spark NLP - ALBERT.ipynb +++ b/examples/python/transformers/TF Hub in Spark NLP - ALBERT.ipynb @@ -1,16 +1,17 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + 
"metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/TF%20Hub%20in%20Spark%20NLP%20-%20ALBERT.ipynb)\n", + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/TF%20Hub%20in%20Spark%20NLP%20-%20ALBERT.ipynb)\n", "\n", - "## Import ALBERT models from TF Hub into Spark NLP 🚀 \n", + "## Import ALBERT models from TF Hub into Spark NLP 🚀\n", "\n", - "Let's keep in mind a few things before we start 😊 \n", + "Let's keep in mind a few things before we start 😊\n", "\n", "- This feature is only in `Spark NLP 3.1.x` and after. So please make sure you have upgraded to the latest Spark NLP release\n", "- You can import any ALBERT models from TF Hub but they have to be `TF2.0 Saved Model` models. 
Meaning, you cannot use `ALBERT models for TF1` which are `DEPRECATED`" @@ -18,18 +19,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Save TF Hub model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- We do not need to install `tensorflow` nor `tensorflow-hub`\n", "- We can simple download the model and extract it\n", @@ -39,9 +36,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "YTKqt3fnkaXH" - }, + "metadata": {}, "outputs": [], "source": [ "!rm -rf /content/*" @@ -50,34 +45,17 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 85099, - "status": "ok", - "timestamp": 1626180515415, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "SP-Op9Kirtxp", - "outputId": "d51288fb-a28c-4d69-c0b8-ae54447f392f" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[K |████████████████████████████████| 394.3MB 42kB/s \n", - "\u001B[K |████████████████████████████████| 2.9MB 30.2MB/s \n", - "\u001B[K |████████████████████████████████| 3.8MB 19.6MB/s \n", - "\u001B[K |████████████████████████████████| 471kB 34.8MB/s \n", - "\u001B[?25h" + "\u001b[K |████████████████████████████████| 394.3MB 42kB/s \n", + "\u001b[K |████████████████████████████████| 2.9MB 30.2MB/s \n", + "\u001b[K |████████████████████████████████| 3.8MB 19.6MB/s \n", + "\u001b[K |████████████████████████████████| 471kB 34.8MB/s \n", + "\u001b[?25h" ] } ], @@ -88,9 +66,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "0oVoy6nrRMmk" - }, + "metadata": {}, "outputs": [], "source": [ "EXPORTED_MODEL = 
'albert_en_base'\n", @@ -100,24 +76,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10965, - "status": "ok", - "timestamp": 1626181255547, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "hHXgqiWpMfCY", - "outputId": "6fb69d81-c41a-443c-838f-7fbf9aacd757" - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -174,9 +133,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] @@ -184,24 +141,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 165, - "status": "ok", - "timestamp": 1626181313281, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "p2XCole7TTef", - "outputId": "07907b1a-56da-4d29-fb7f-aa4b440368b8" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -221,24 +161,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 296, - "status": "ok", - "timestamp": 1626181314521, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "r0DOGz8VUR-r", - "outputId": "e076997a-f34a-46fa-ad81-80e3c3b27415" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -255,9 +178,7 @@ }, { 
"cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- The `SentencePiece` model is already in the `assets` directory, but let's rename it to something Spark NLP recognize it\n", "- we all set! We can got to Spark NLP 😊 " @@ -266,24 +187,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 156, - "status": "ok", - "timestamp": 1626181440131, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "3-67YHqt9sj3", - "outputId": "f12db64d-58b2-4a59-80ef-10f517296550" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -301,18 +205,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save ALBERT in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -320,25 +220,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 114600, - "status": "ok", - "timestamp": 1626182423554, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "8tpW5nkMc53m", - "outputId": "18701fe4-9905-4852-ae26-25d19ec30e92" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -387,10 +270,10 @@ "Get:24 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic/main amd64 Packages [40.9 kB]\n", "Fetched 12.2 MB in 4s (3,322 kB/s)\n", 
"Reading package lists... Done\n", - "\u001B[K |████████████████████████████████| 209.1MB 68kB/s \n", - "\u001B[K |████████████████████████████████| 51kB 5.7MB/s \n", - "\u001B[K |████████████████████████████████| 204kB 52.5MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" + "\u001b[K |████████████████████████████████| 209.1MB 68kB/s \n", + "\u001b[K |████████████████████████████████| 51kB 5.7MB/s \n", + "\u001b[K |████████████████████████████████| 204kB 52.5MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" ] } ], @@ -400,30 +283,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "executionInfo": { - "elapsed": 24988, - "status": "ok", - "timestamp": 1626182448537, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -433,9 +301,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `AlbertEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `AlbertEmbeddings` in runtime, so don't worry what you are setting them now\n", @@ -448,21 +314,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "executionInfo": { - "elapsed": 5703, - "status": "ok", - "timestamp": 1626182471448, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": 
"https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -480,30 +333,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "executionInfo": { - "elapsed": 27466, - "status": "ok", - "timestamp": 1626182499692, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "albert.write().overwrite().save(\"./{}_spark_nlp\".format(EXPORTED_MODEL))" @@ -511,30 +349,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "executionInfo": { - "elapsed": 15, - "status": "ok", - "timestamp": 1626182499693, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {EXPORTED_MODEL}" @@ -542,9 +365,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -553,25 +374,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - 
"base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 294, - "status": "ok", - "timestamp": 1626182499984, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ogpxSWxOXj3W", - "outputId": "b14c2b1a-315c-4917-8a55-08983ac66470" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -591,30 +395,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BERT model 😊 " ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "executionInfo": { - "elapsed": 3811, - "status": "ok", - "timestamp": 1626182503794, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "albert_loaded = AlbertEmbeddings.load(\"./{}_spark_nlp\".format(EXPORTED_MODEL))\\\n", @@ -625,26 +414,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "executionInfo": { - "elapsed": 23, - "status": "ok", - "timestamp": 1626182503801, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "pGRTNISyYlnO", - "outputId": "d89b04d4-5065-47af-f1a0-9cc436f29a82" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -655,7 +426,7 @@ "'albert_en_base'" ] }, - "execution_count": 19, 
+ "execution_count": null, "metadata": { "tags": [] }, @@ -668,9 +439,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and import ALBERT models from TF Hub in Spark NLP 🚀 \n" ] @@ -697,8 +466,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/transformers/TF Hub in Spark NLP - BERT Sentence.ipynb b/examples/python/transformers/TF Hub in Spark NLP - BERT Sentence.ipynb index 17af8f20c9c02f..f08314af7ffc6e 100644 --- a/examples/python/transformers/TF Hub in Spark NLP - BERT Sentence.ipynb +++ b/examples/python/transformers/TF Hub in Spark NLP - BERT Sentence.ipynb @@ -1,16 +1,17 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/TF%20Hub%20in%20Spark%20NLP%20-%20BERT%20Sentence.ipynb)\n", + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/TF%20Hub%20in%20Spark%20NLP%20-%20BERT%20Sentence.ipynb)\n", "\n", - "## Import BERT models for Sentenc Embeddings from TF Hub into Spark NLP 🚀 \n", + "## Import BERT models for Sentenc Embeddings from TF Hub into Spark NLP 🚀\n", "\n", - "Let's keep in mind a few things before we start 😊 \n", + "Let's keep in mind a few things before we start 😊\n", "\n", "- This feature is only in `Spark NLP 3.1.x` and after. 
So please make sure you have upgraded to the latest Spark NLP release\n", "- You can import any BERT models from TF Hub but they have to be `TF2.0 Saved Model` models. Meaning, you cannot use `BERT models for TF1` which are `DEPRECATED`" @@ -18,18 +19,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Save TF Hub model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- We do not need to install `tensorflow` nor `tensorflow-hub`\n", "- We can simple download the model and extract it\n", @@ -38,21 +35,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "executionInfo": { - "elapsed": 364, - "status": "ok", - "timestamp": 1626534112218, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "YTKqt3fnkaXH" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf /content/*" @@ -60,35 +44,18 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 88044, - "status": "ok", - "timestamp": 1626534200259, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "SP-Op9Kirtxp", - "outputId": "79408445-17c5-41a6-9faf-8e8241e08239" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[K |████████████████████████████████| 394.3MB 36kB/s \n", - "\u001B[K |████████████████████████████████| 2.9MB 35.4MB/s \n", - "\u001B[K |████████████████████████████████| 471kB 34.7MB/s \n", - "\u001B[K 
|████████████████████████████████| 3.8MB 36.5MB/s \n", - "\u001B[?25h" + "\u001b[K |████████████████████████████████| 394.3MB 36kB/s \n", + "\u001b[K |████████████████████████████████| 2.9MB 35.4MB/s \n", + "\u001b[K |████████████████████████████████| 471kB 34.7MB/s \n", + "\u001b[K |████████████████████████████████| 3.8MB 36.5MB/s \n", + "\u001b[?25h" ] } ], @@ -98,21 +65,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "executionInfo": { - "elapsed": 8, - "status": "ok", - "timestamp": 1626534200260, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "0oVoy6nrRMmk" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "EXPORTED_MODEL = 'bert_en_uncased_L-2_H-128_A-2'\n", @@ -121,25 +75,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11757, - "status": "ok", - "timestamp": 1626534431104, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "hHXgqiWpMfCY", - "outputId": "edcf0a15-2b18-4c19-dcce-5c3482f0046f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -196,34 +133,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 329, - "status": "ok", - "timestamp": 1626534438516, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": 
"https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "p2XCole7TTef", - "outputId": "67234cb1-b3ae-4999-ce1b-949d2a5d0235" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -242,25 +160,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 417, - "status": "ok", - "timestamp": 1626534522474, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "r0DOGz8VUR-r", - "outputId": "d2068cd4-6c29-40d4-ebb3-e35abd900d25" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -277,9 +178,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, everything needed in Spark NLP is already here, including `vocab.txt` in `assets` directory\n", "- we all set! 
We can got to Spark NLP 😊 " @@ -287,18 +186,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save BERT in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -306,25 +201,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 140351, - "status": "ok", - "timestamp": 1626534693515, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "8tpW5nkMc53m", - "outputId": "a2eb25b8-531f-4bde-c9f7-cabc8ddab485" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -368,10 +246,10 @@ "Get:21 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic/main amd64 Packages [911 kB]\n", "Fetched 11.4 MB in 6s (1,875 kB/s)\n", "Reading package lists... Done\n", - "\u001B[K |████████████████████████████████| 209.1MB 65kB/s \n", - "\u001B[K |████████████████████████████████| 51kB 5.8MB/s \n", - "\u001B[K |████████████████████████████████| 204kB 53.3MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" + "\u001b[K |████████████████████████████████| 209.1MB 65kB/s \n", + "\u001b[K |████████████████████████████████| 51kB 5.8MB/s \n", + "\u001b[K |████████████████████████████████| 204kB 53.3MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" ] } ], @@ -381,30 +259,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "executionInfo": { - "elapsed": 67892, - "status": "ok", - "timestamp": 1626534761404, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "cbNneAVCLU1y" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -414,9 +277,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `BertSentenceEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Unlike `BertEmbeddings` which uses `last_hidden_state` with (-1, -1, DIMENSION) shape, `BertSentenceEmbeddings` will use `pooler_output` with (-1, DIMENSION) shape for Sentence/Document embeddings. 
It will generate 1 vector for the entire sentence/document\n", @@ -430,21 +291,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "executionInfo": { - "elapsed": 6038, - "status": "ok", - "timestamp": 1626534767439, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "8W_almibVRTj" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -462,30 +310,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "executionInfo": { - "elapsed": 13160, - "status": "ok", - "timestamp": 1626534780950, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "iWu5HfbnXAlM" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sent_bert.write().overwrite().save(\"./{}_spark_nlp\".format(EXPORTED_MODEL))" @@ -493,30 +326,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "executionInfo": { - "elapsed": 21, - "status": "ok", - "timestamp": 1626534780951, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ZgkVIJshDtLx" - }, + "execution_count": null, + "metadata": {}, 
"outputs": [], "source": [ "!rm -rf {EXPORTED_MODEL}" @@ -524,9 +342,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -535,25 +351,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1626534780951, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ogpxSWxOXj3W", - "outputId": "bfb88092-53e7-4b1e-8382-8815fef0aba4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -572,30 +371,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BERT model 😊 " ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "executionInfo": { - "elapsed": 3363, - "status": "ok", - "timestamp": 1626534834771, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "1mm3CvkwYRgs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sent_bert_loaded = BertSentenceEmbeddings.load(\"./{}_spark_nlp\".format(EXPORTED_MODEL))\\\n", @@ -606,26 +390,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "executionInfo": { - "elapsed": 15, - "status": "ok", - "timestamp": 1626534834779, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": 
"https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "pGRTNISyYlnO", - "outputId": "a7911a3f-9168-44b0-fa5b-3f71985cabe6" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -636,7 +402,7 @@ "'sent_bert_en_uncased_L-2_H-128_A-2'" ] }, - "execution_count": 18, + "execution_count": null, "metadata": { "tags": [] }, @@ -649,9 +415,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and import BERT models from TF Hub in Spark NLP 🚀 \n" ] @@ -678,8 +442,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/transformers/TF Hub in Spark NLP - BERT.ipynb b/examples/python/transformers/TF Hub in Spark NLP - BERT.ipynb index 2af8f7d8496feb..e26290028a7d0b 100644 --- a/examples/python/transformers/TF Hub in Spark NLP - BERT.ipynb +++ b/examples/python/transformers/TF Hub in Spark NLP - BERT.ipynb @@ -1,16 +1,17 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", - "metadata": { - "id": "Zva6MvJyLeWi" - }, + "metadata": {}, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/TF%20Hub%20in%20Spark%20NLP%20-%20BERT.ipynb)\n", + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/TF%20Hub%20in%20Spark%20NLP%20-%20BERT.ipynb)\n", "\n", - "## Import BERT models from TF Hub into Spark NLP 🚀 \n", + "## Import BERT models from TF Hub into 
Spark NLP 🚀\n", "\n", - "Let's keep in mind a few things before we start 😊 \n", + "Let's keep in mind a few things before we start 😊\n", "\n", "- This feature is only in `Spark NLP 3.1.x` and after. So please make sure you have upgraded to the latest Spark NLP release\n", "- You can import any BERT models from TF Hub but they have to be `TF2.0 Saved Model` models. Meaning, you cannot use `BERT models for TF1` which are `DEPRECATED`" @@ -18,18 +19,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MzxB-Nq6cxOA" - }, + "metadata": {}, "source": [ "## Save TF Hub model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "yNQkhyMHMgkE" - }, + "metadata": {}, "source": [ "- We do not need to install `tensorflow` nor `tensorflow-hub`\n", "- We can simple download the model and extract it\n", @@ -39,9 +36,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "YTKqt3fnkaXH" - }, + "metadata": {}, "outputs": [], "source": [ "!rm -rf /content/*" @@ -50,34 +45,17 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 85899, - "status": "ok", - "timestamp": 1626085480126, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "SP-Op9Kirtxp", - "outputId": "762ae186-2ecf-4a02-a30f-75bb419a2f1b" - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[K |████████████████████████████████| 394.3MB 39kB/s \n", - "\u001B[K |████████████████████████████████| 3.8MB 32.3MB/s \n", - "\u001B[K |████████████████████████████████| 2.9MB 33.9MB/s \n", - "\u001B[K |████████████████████████████████| 471kB 42.4MB/s \n", - "\u001B[?25h" + "\u001b[K |████████████████████████████████| 394.3MB 39kB/s \n", + "\u001b[K |████████████████████████████████| 
3.8MB 32.3MB/s \n", + "\u001b[K |████████████████████████████████| 2.9MB 33.9MB/s \n", + "\u001b[K |████████████████████████████████| 471kB 42.4MB/s \n", + "\u001b[?25h" ] } ], @@ -88,9 +66,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "0oVoy6nrRMmk" - }, + "metadata": {}, "outputs": [], "source": [ "EXPORTED_MODEL = 'bert_en_uncased_L-2_H-128_A-2'\n", @@ -100,24 +76,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 7629, - "status": "ok", - "timestamp": 1626085895517, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "hHXgqiWpMfCY", - "outputId": "08236c61-064b-4791-de54-55bec0156ff9" - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -174,9 +133,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nlgyZuJfS5IB" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] @@ -184,24 +141,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 187, - "status": "ok", - "timestamp": 1626085937009, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "p2XCole7TTef", - "outputId": "ccce0989-d69f-43f7-bef9-f4f20d4ee37e" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -221,24 +161,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 184, - "status": "ok", - "timestamp": 1626085939176, - "user": { - 
"displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "r0DOGz8VUR-r", - "outputId": "cad831b8-90d0-4d0b-a1d8-e0e6a5503130" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -255,9 +178,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "gZegMvuGTmHt" - }, + "metadata": {}, "source": [ "- as you can see, everything needed in Spark NLP is already here, including `vocab.txt` in `assets` directory\n", "- we all set! We can got to Spark NLP 😊 " @@ -265,18 +186,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NlJKd2tIU0PD" - }, + "metadata": {}, "source": [ "## Import and Save BERT in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "A0FXoxHJc5CU" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -285,24 +202,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 114196, - "status": "ok", - "timestamp": 1626086169582, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "8tpW5nkMc53m", - "outputId": "c21abc9e-38d2-443b-e90d-07a0f63b8640" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -351,10 +251,10 @@ "Get:24 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic/main amd64 Packages [40.8 kB]\n", "Fetched 12.2 MB in 4s (3,186 kB/s)\n", "Reading package lists... 
Done\n", - "\u001B[K |████████████████████████████████| 209.1MB 70kB/s \n", - "\u001B[K |████████████████████████████████| 51kB 6.0MB/s \n", - "\u001B[K |████████████████████████████████| 204kB 38.2MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" + "\u001b[K |████████████████████████████████| 209.1MB 70kB/s \n", + "\u001b[K |████████████████████████████████| 51kB 6.0MB/s \n", + "\u001b[K |████████████████████████████████| 204kB 38.2MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" ] } ], @@ -364,9 +264,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "m_NAgx4hdCGP" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] @@ -374,9 +272,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "cbNneAVCLU1y" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -386,9 +282,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ABTu9MrdVafM" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `BertEmbeddings` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `BertEmbeddings` in runtime, so don't worry what you are setting them now\n", @@ -402,9 +296,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "8W_almibVRTj" - }, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -422,9 +314,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "PjGiq4KnXWuy" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] @@ -432,9 +322,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "iWu5HfbnXAlM" - }, + "metadata": {}, "outputs": [], "source": [ 
"bert.write().overwrite().save(\"./{}_spark_nlp\".format(EXPORTED_MODEL))" @@ -442,9 +330,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "QCrjxPhzDplN" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] @@ -452,9 +338,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "ZgkVIJshDtLx" - }, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {EXPORTED_MODEL}" @@ -462,9 +346,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "-TSeTRZpXqWO" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -474,24 +356,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 192, - "status": "ok", - "timestamp": 1626086889325, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "ogpxSWxOXj3W", - "outputId": "1dac3744-f117-4065-a921-b33d114cc0df" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -510,9 +375,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Fbehje7fYTDj" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BERT model 😊 " ] @@ -520,9 +383,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "1mm3CvkwYRgs" - }, + "metadata": {}, "outputs": [], "source": [ "bert_loaded = BertEmbeddings.load(\"./{}_spark_nlp\".format(EXPORTED_MODEL))\\\n", @@ -534,25 +395,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "executionInfo": { - "elapsed": 199, - "status": "ok", - "timestamp": 1626087285333, - "user": { - "displayName": "Maziyar Panahi", - "photoUrl": 
"https://lh3.googleusercontent.com/a-/AOh14GhTmm4Srbdy-IOOALumHToD8y9PvjupF566HEz1zA=s64", - "userId": "06037986691777662786" - }, - "user_tz": -120 - }, - "id": "pGRTNISyYlnO", - "outputId": "ac2511bb-f1de-4619-c60c-30674c58b40a" - }, + "metadata": {}, "outputs": [ { "data": { @@ -563,7 +406,7 @@ "'bert_en_uncased_L-2_H-128_A-2'" ] }, - "execution_count": 24, + "execution_count": null, "metadata": { "tags": [] }, @@ -576,9 +419,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "_he2LDtBYo1h" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and import BERT models from TF Hub in Spark NLP 🚀 \n" ] @@ -605,8 +446,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/scala/annotation/SpellCheckersPerfTest.scala b/examples/scala/annotation/SpellCheckersPerfTest.scala deleted file mode 100644 index 76bb80bdc27152..00000000000000 --- a/examples/scala/annotation/SpellCheckersPerfTest.scala +++ /dev/null @@ -1,98 +0,0 @@ -import com.johnsnowlabs.nlp.annotator._ -import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel -import com.johnsnowlabs.nlp.base._ -import com.johnsnowlabs.nlp.util.io.ResourceHelper -import com.johnsnowlabs.util.Benchmark -import org.apache.spark.sql.functions.rand -import org.apache.spark.ml.Pipeline - -class NorvigSweetingTest extends App { - - ResourceHelper.spark - - import ResourceHelper.spark.implicits._ - - val documentAssembler = new DocumentAssembler(). - setInputCol("text"). - setOutputCol("document") - - val tokenizer = new Tokenizer(). - setInputCols(Array("document")). - setOutputCol("token") - - val spell = NorvigSweetingModel.pretrained(). - setInputCols("token"). - setOutputCol("spell"). - setDoubleVariants(true) - - val finisher = new Finisher(). - setInputCols("spell") - - val pipeline = new Pipeline(). 
- setStages(Array( - documentAssembler, - tokenizer, - spell, - finisher - )) - - val spellmodel = pipeline.fit(Seq.empty[String].toDF("text")) - val spellplight = new LightPipeline(spellmodel) - - val n = 50 - - val parquet = ResourceHelper.spark.read - .text("data/vivekn/training_negative") - .toDF("text").sort(rand()) - val data = parquet.as[String].take(n) - data.length - - Benchmark.time("Light annotate norvig spell") { - spellplight.annotate(data) - } -} - -class SymmetricDeleteTest extends App { - - ResourceHelper.spark - - import ResourceHelper.spark.implicits._ - - val documentAssembler = new DocumentAssembler(). - setInputCol("text"). - setOutputCol("document") - - val tokenizer = new Tokenizer(). - setInputCols(Array("document")). - setOutputCol("token") - - val spell = SymmetricDeleteModel.pretrained(). - setInputCols("token"). - setOutputCol("spell") - - val finisher = new Finisher(). - setInputCols("spell") - - val pipeline = new Pipeline(). - setStages(Array( - documentAssembler, - tokenizer, - spell, - finisher - )) - - val spellmodel = pipeline.fit(Seq.empty[String].toDF("text")) - val spellplight = new LightPipeline(spellmodel) - - val n = 50000 - - val parquet = ResourceHelper.spark.read - .text("data/vivekn/training_negative") - .toDF("text").sort(rand()) - val data = parquet.as[String].take(n) - data.length - - Benchmark.time("Light annotate symmetric spell") { - spellplight.annotate(data) - } -} diff --git a/examples/scala/annotation/TokenizerWithNGram.scala b/examples/scala/annotation/TokenizerWithNGram.scala index a6c2aba9ae883f..0691eb3317f334 100644 --- a/examples/scala/annotation/TokenizerWithNGram.scala +++ b/examples/scala/annotation/TokenizerWithNGram.scala @@ -39,11 +39,7 @@ object TokenizerWithNGram extends App { .setInputCol("finished_normal") .setOutputCol("3-gram") - val gramAssembler = new DocumentAssembler() - .setInputCol("3-gram") - .setOutputCol("3-grams") - - val pipeline = new Pipeline().setStages(Array(document, token, 
normalizer, finisher, ngram, gramAssembler)) + val pipeline = new Pipeline().setStages(Array(document, token, normalizer, finisher, ngram)) val testing = Seq( (1, "Google is a famous company"), diff --git a/examples/scala/training/Train Multi-Class Text Classification on News Articles.scala b/examples/scala/training/Train Multi-Class Text Classification on News Articles.scala index 472264089fc323..f60e12feb236f5 100644 --- a/examples/scala/training/Train Multi-Class Text Classification on News Articles.scala +++ b/examples/scala/training/Train Multi-Class Text Classification on News Articles.scala @@ -23,7 +23,7 @@ import org.apache.spark.ml.Pipeline // COMMAND ---------- -// MAGIC %sh +// MAGIC %sh // MAGIC curl -O 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_train.csv' // COMMAND ---------- @@ -67,7 +67,7 @@ val sentenceEmbeddings = new SentenceEmbeddings() .setOutputCol("sentence_embeddings") .setStorageRef("glove_100d") -//ClassifierDL accepts SENTENCE_EMBEDDINGS +//ClassifierDL accepts SENTENCE_EMBEDDINGS //UniversalSentenceEncoder or SentenceEmbeddings can produce SENTECE_EMBEDDINGS val docClassifier = new ClassifierDLApproach() .setInputCols("sentence_embeddings") diff --git a/examples/util/Load_Model_From_S3.ipynb b/examples/util/Load_Model_From_S3.ipynb index aea362688f7421..8e1604bc818609 100644 --- a/examples/util/Load_Model_From_S3.ipynb +++ b/examples/util/Load_Model_From_S3.ipynb @@ -1,12 +1,13 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/Load_Model_From_S3.ipynb)" + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/util/Load_Model_From_S3.ipynb)" ] }, { @@ -18,14 +19,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "-eUrx5szYw9u", - "outputId": "500e41f0-bcf3-49ff-df59-f1a7a398566c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -49,14 +44,15 @@ "\n", "2022-09-08 14:43:45 (37.8 MB/s) - written to stdout [1191/1191]\n", "\n", - "\u001B[K |████████████████████████████████| 281.4 MB 32 kB/s \n", - "\u001B[K |████████████████████████████████| 616 kB 33.3 MB/s \n", - "\u001B[K |████████████████████████████████| 198 kB 54.1 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n" + "\u001b[K |████████████████████████████████| 281.4 MB 32 kB/s \n", + "\u001b[K |████████████████████████████████| 616 kB 33.3 MB/s \n", + "\u001b[K |████████████████████████████████| 198 kB 54.1 MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" ] } ], "source": [ + "# Only run this cell when you are using Spark NLP on Google Colab\n", "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, @@ -171,15 +167,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 219 - }, - "id": "XSCAf1NOe7rC", - "outputId": "12014be5-e174-42c1-ad37-9f97f64652aa" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -210,7 +199,7 @@ "" ] }, - "execution_count": 4, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -251,14 +240,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "FQ8jfnOR39DQ", - "outputId": "6800b159-2ada-4eb0-f8f2-06aaae482435" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -274,10 +257,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "oz4bRCvRnPWz" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -289,14 +270,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_qiC18IvnhIA", - "outputId": "2206db7e-2012-4041-b23e-96e04f59c89f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -318,10 +293,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "iCFm_eIwoA0P" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline(stages=[\n", @@ -332,10 +305,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "F_Vin105oH2W" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "test_df = spark.createDataFrame([[\"This is a simple example. 
This is another sentence\"]]).toDF(\"text\")" @@ -343,10 +314,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "7wPFZJadoD-N" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "model = pipeline.fit(test_df)" @@ -354,14 +323,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "S-jN9LtwolmW", - "outputId": "0d676204-78b6-4460-fde3-0a0dfdcb8d5d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -382,14 +345,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "XvOWCR6EXrss", - "outputId": "96cda5f0-55e4-442d-a4d3-780201647331" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -409,14 +366,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tz1Y8DKRX4sS", - "outputId": "7bf91165-7912-4028-ad23-a229942572d5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -454,8 +405,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/util/Load_Model_from_GCP_Storage.ipynb b/examples/util/Load_Model_from_GCP_Storage.ipynb index 2d766c6a523dcb..c6d7b18ae1c8b1 100644 --- a/examples/util/Load_Model_from_GCP_Storage.ipynb +++ b/examples/util/Load_Model_from_GCP_Storage.ipynb @@ -6,14 +6,12 @@ "source": [ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/Load_Model_From_GCP_Storage.ipynb)" + "[![Open 
In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/util/Load_Model_from_GCP_Storage.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "DfdkWg6LThJP" - }, + "metadata": {}, "source": [ "## Loading Pretrained Models from S3" ] @@ -21,28 +19,23 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "-GmZvE5oTku4" - }, + "metadata": {}, "outputs": [], "source": [ + "# Only run this Cell when you are using Spark NLP on Google Colab\n", "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", - "metadata": { - "id": "r44X4OKlToLC" - }, + "metadata": {}, "source": [ "## Defining GCP Storage URI in cache_pretrained" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Cy14aeXATt0S" - }, + "metadata": {}, "source": [ "In this notebook, we are going to see the steps required to use an external GCP Storage URI as cache_pretrained folder\n", "\n", @@ -51,18 +44,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "CKVkaiTaULve" - }, + "metadata": {}, "source": [ "### Spark NLP Settings" ] }, { "cell_type": "markdown", - "metadata": { - "id": "G0DgEpU7UhBw" - }, + "metadata": {}, "source": [ "\n", "\n", @@ -75,18 +64,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NdluR0wzVVM_" - }, + "metadata": {}, "source": [ "### Spark ML Settings" ] }, { "cell_type": "markdown", - "metadata": { - "id": "gUeUonSiVkQj" - }, + "metadata": {}, "source": [ "Spark ML requires the following configuration to load a model from GCP using ADC:\n", "\n", @@ -101,9 +86,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "FEQKV1IRYhg0" - }, + "metadata": {}, "source": [ "Now, let's take a look at a simple ecxample the spark session creation below to see how to define each of the configurations with its values:" ] @@ -111,13 +94,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - 
"colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4JfeD8Rj-as2", - "outputId": "437ae866-f63e-43e0-b898-0860e3b19b7d" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -140,7 +117,7 @@ " .config(\"spark.kryoserializer.buffer.max\", \"2000M\") \\\n", " .config(\"spark.driver.maxResultSize\", \"0\") \\\n", " .config(\"spark.jars\", \"./sparknlp.jar\") \\\n", - " .config(\"spark.jars.packages\", \"com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.4,com.google.cloud.bigdataoss:gcs-connector:hadoop3-2.2.8\") \\\n", + " .config(\"spark.jars.packages\", \"com.johnsnowlabs.nlp:spark-nlp_2.12:4.3.1,com.google.cloud.bigdataoss:gcs-connector:hadoop3-2.2.8\") \\\n", " .config(\"spark.hadoop.fs.gs.impl\", \"com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem\") \\\n", " .config(\"spark.driver.userClassPathFirst\", \"true\") \\\n", " .config(\"spark.hadoop.google.cloud.auth.service.account.json.keyfile\", \"/content/.config/application_default_credentials.json\") \\\n", @@ -154,9 +131,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "XLNO3Z9r6HgR" - }, + "metadata": {}, "outputs": [], "source": [ "import sparknlp\n", @@ -167,9 +142,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "_eB72Yzg8_Jx" - }, + "metadata": {}, "outputs": [], "source": [ "sample_text = \"This is a sentence. 
This is another sentence\"\n", @@ -181,9 +154,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "tRyju8D-6XJ1" - }, + "metadata": {}, "outputs": [], "source": [ "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", @@ -193,13 +164,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "X5G4_BXwOYtC", - "outputId": "7f15118f-6c8e-46c0-c432-48de09bd72b0" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -221,9 +186,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "FhKPEMb09w6a" - }, + "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline(stages=[document_assembler, sentence_detector_dl, tokenizer])\n", @@ -233,13 +196,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0CAp_AtrssPj", - "outputId": "4d579436-d3e5-429d-dabb-0d321dca1f0a" - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -278,8 +235,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "pygments_lexer": "ipython3" } }, "nbformat": 4,