From d7f017cc4263c9c4216afae4d792c5a37d7b2b98 Mon Sep 17 00:00:00 2001 From: aleksei Date: Thu, 14 Dec 2017 19:19:28 +0300 Subject: [PATCH 1/4] Rewrite EntityExtractor --- .../example/entities-extractor/entities.txt | 2 + .../entities-extractor/extractor.ipynb | 250 ++++++++++++++++++ .../ner_model/metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../ner_model/metadata/.part-00000.crc | Bin 0 -> 12 bytes .../ner_model/metadata/_SUCCESS | 0 .../ner_model/metadata/part-00000 | 1 + .../metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../metadata/.part-00000.crc | Bin 0 -> 12 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 + .../metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../metadata/.part-00000.crc | Bin 0 -> 12 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 + .../metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../metadata/.part-00000.crc | Bin 0 -> 12 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 + .../metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../metadata/.part-00000.crc | Bin 0 -> 12 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 + .../metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../metadata/.part-00000.crc | Bin 0 -> 12 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 + .../ner_pipeline/metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../ner_pipeline/metadata/.part-00000.crc | Bin 0 -> 12 bytes .../ner_pipeline/metadata/_SUCCESS | 0 .../ner_pipeline/metadata/part-00000 | 1 + .../metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../metadata/.part-00000.crc | Bin 0 -> 12 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 + .../metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../metadata/.part-00000.crc | Bin 0 -> 12 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 + .../metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../metadata/.part-00000.crc | Bin 0 -> 12 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 + .../metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../metadata/.part-00000.crc | Bin 0 -> 12 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 + .../metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../metadata/.part-00000.crc | Bin 0 -> 12 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 + .../johnsnowlabs/collections/SearchTrie.scala | 8 + .../collections/SearchTrieSpec.scala | 8 + 52 files changed, 280 insertions(+) create mode 100644 python/example/entities-extractor/entities.txt create mode 100644 python/example/entities-extractor/extractor.ipynb create mode 100644 python/example/entities-extractor/ner_model/metadata/._SUCCESS.crc create mode 100644 python/example/entities-extractor/ner_model/metadata/.part-00000.crc create mode 100644 python/example/entities-extractor/ner_model/metadata/_SUCCESS create mode 100644 python/example/entities-extractor/ner_model/metadata/part-00000 create mode 100644 python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/._SUCCESS.crc create mode 100644 python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/.part-00000.crc create mode 100644 python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/_SUCCESS create mode 100644 python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 create mode 100644 python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/._SUCCESS.crc create mode 100644 python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/.part-00000.crc create mode 100644 python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/_SUCCESS create mode 100644 python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 create mode 100644 python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/._SUCCESS.crc create mode 100644 python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/.part-00000.crc create mode 100644 python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/_SUCCESS create mode 100644 python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 create mode 100644 python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/._SUCCESS.crc create mode 100644 python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/.part-00000.crc create mode 100644 python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/_SUCCESS create mode 100644 python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 create mode 100644 python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/._SUCCESS.crc create mode 100644 python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/.part-00000.crc create mode 100644 python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/_SUCCESS create mode 100644 python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 create mode 100644 python/example/entities-extractor/ner_pipeline/metadata/._SUCCESS.crc create mode 100644 python/example/entities-extractor/ner_pipeline/metadata/.part-00000.crc create mode 100644 python/example/entities-extractor/ner_pipeline/metadata/_SUCCESS create mode 100644 python/example/entities-extractor/ner_pipeline/metadata/part-00000 create mode 100644 python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/._SUCCESS.crc create mode 100644 python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/.part-00000.crc create mode 100644 python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/_SUCCESS create mode 100644 python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 create mode 100644 python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/._SUCCESS.crc create mode 100644 python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/.part-00000.crc create mode 100644 python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/_SUCCESS create mode 100644 python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 create mode 100644 python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/._SUCCESS.crc create mode 100644 python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/.part-00000.crc create mode 100644 python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/_SUCCESS create mode 100644 python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 create mode 100644 python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/._SUCCESS.crc create mode 100644 python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/.part-00000.crc create mode 100644 python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/_SUCCESS create mode 100644 python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 create mode 100644 python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/._SUCCESS.crc create mode 100644 python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/.part-00000.crc create mode 100644 python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/_SUCCESS create mode 100644 python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 create mode 100644 src/main/scala/com/johnsnowlabs/collections/SearchTrie.scala create mode 100644 src/test/scala/com/johnsnowlabs/collections/SearchTrieSpec.scala diff --git a/python/example/entities-extractor/entities.txt b/python/example/entities-extractor/entities.txt new file mode 100644 index 00000000000000..cd321331b2695f --- /dev/null +++ b/python/example/entities-extractor/entities.txt @@ -0,0 +1,2 @@ +I think +Feeling strange diff --git a/python/example/entities-extractor/extractor.ipynb b/python/example/entities-extractor/extractor.ipynb new file mode 100644 index 00000000000000..88e46c3100eea1 --- /dev/null +++ b/python/example/entities-extractor/extractor.ipynb @@ -0,0 +1,250 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('../../')\n", + "\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.ml import Pipeline\n", + "\n", + "from sparknlp.annotator import *\n", + "from sparknlp.common import *\n", + "from sparknlp.base import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "spark = SparkSession.builder \\\n", + " .appName(\"ner\")\\\n", + " .master(\"local[1]\")\\\n", + " .config(\"spark.driver.memory\",\"4G\")\\\n", + " .config(\"spark.driver.maxResultSize\", \"2G\")\\\n", + " .config(\"spark.jar\", \"lib/sparknlp.jar\")\\\n", + " .config(\"spark.kryoserializer.buffer.max\", \"500m\")\\\n", + " .getOrCreate()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import time\n", + "\n", + "documentAssembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "sentenceDetector = SentenceDetectorModel()\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"sentence\")\n", + "\n", + "tokenizer = RegexTokenizer()\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"token\")\n", + "\n", + "extractor = EntityExtractor()\\\n", + " .setEntitiesPath(\"entities.txt\")\\\n", + " .setInputCols([\"token\", \"sentence\"])\\\n", + " .setOutputCol(\"entites\")\n", + "\n", + "finisher = Finisher() \\\n", + " .setInputCols([\"entites\"]) \\\n", + " .setIncludeKeys(True)\n", + "\n", + "pipeline = Pipeline(\n", + " stages = [\n", + " documentAssembler,\n", + " sentenceDetector,\n", + " tokenizer,\n", + " extractor,\n", + " finisher\n", + " ])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+---------+--------------------+\n", + "|itemid|sentiment| text|\n", + "+------+---------+--------------------+\n", + "| 1| 0| ...|\n", + "| 2| 0| ...|\n", + "| 3| 1| omg...|\n", + "| 4| 0| .. Omga...|\n", + "| 5| 0| i think ...|\n", + "| 6| 0| or i jus...|\n", + "| 7| 1| Juuuuuuuuu...|\n", + "| 8| 0| Sunny Agai...|\n", + "| 9| 1| handed in m...|\n", + "| 10| 1| hmmmm.... i...|\n", + "| 11| 0| I must thin...|\n", + "| 12| 1| thanks to a...|\n", + "| 13| 0| this weeken...|\n", + "| 14| 0| jb isnt show...|\n", + "| 15| 0| ok thats it ...|\n", + "| 16| 0| <-------- ...|\n", + "| 17| 0| awhhe man.......|\n", + "| 18| 1| Feeling stran...|\n", + "| 19| 0| HUGE roll of ...|\n", + "| 20| 0| I just cut my...|\n", + "+------+---------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "#Load the input data to be annotated\n", + "data = spark. \\\n", + " read. \\\n", + " parquet(\"../../../src/test/resources/sentiment.parquet\"). \\\n", + " limit(1000)\n", + "data.cache()\n", + "data.count()\n", + "data.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start fitting\n", + "Fitting is ended\n" + ] + } + ], + "source": [ + "print(\"Start fitting\")\n", + "model = pipeline.fit(data)\n", + "print(\"Fitting is ended\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+---------+--------------------+-------------------+\n", + "|itemid|sentiment| text| finished_entites|\n", + "+------+---------+--------------------+-------------------+\n", + "| 1| 0| ...| |\n", + "| 2| 0| ...| |\n", + "| 3| 1| omg...| |\n", + "| 4| 0| .. Omga...| |\n", + "| 5| 0| i think ...| result->i think|\n", + "| 6| 0| or i jus...| |\n", + "| 7| 1| Juuuuuuuuu...| |\n", + "| 8| 0| Sunny Agai...| |\n", + "| 9| 1| handed in m...| |\n", + "| 10| 1| hmmmm.... i...| |\n", + "| 11| 0| I must thin...| |\n", + "| 12| 1| thanks to a...| |\n", + "| 13| 0| this weeken...| |\n", + "| 14| 0| jb isnt show...| |\n", + "| 15| 0| ok thats it ...| |\n", + "| 16| 0| <-------- ...| |\n", + "| 17| 0| awhhe man.......| |\n", + "| 18| 1| Feeling stran...|result->feel strang|\n", + "| 19| 0| HUGE roll of ...| |\n", + "| 20| 0| I just cut my...| |\n", + "+------+---------+--------------------+-------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "extracted = model.transform(data)\n", + "extracted.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pipeline.write().overwrite().save(\"./ner_pipeline\")\n", + "model.write().overwrite().save(\"./ner_model\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "from pyspark.ml import PipelineModel, Pipeline\n", + "\n", + "Pipeline.read().load(\"./ner_pipeline\")\n", + "sameModel = PipelineModel.read().load(\"./ner_model\")" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/python/example/entities-extractor/ner_model/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_model/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_model/metadata/.part-00000.crc b/python/example/entities-extractor/ner_model/metadata/.part-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..4a68940a54abe100c0cf1b39e9c598c51d5b528b GIT binary patch literal 12 TcmYc;N@ieSU}8wWH{BZm6E6c| literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_model/metadata/_SUCCESS b/python/example/entities-extractor/ner_model/metadata/_SUCCESS new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/example/entities-extractor/ner_model/metadata/part-00000 b/python/example/entities-extractor/ner_model/metadata/part-00000 new file mode 100644 index 00000000000000..d80fc473f1dc07 --- /dev/null +++ b/python/example/entities-extractor/ner_model/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.PipelineModel","timestamp":1513268235824,"sparkVersion":"2.1.1","uid":"PipelineModel_449885197a0f0499a0e3","paramMap":{"stageUids":["DocumentAssembler_412ba9f96832acf4cb75","SentenceDetectorModel_4c7481f204d5778ca290","RegexTokenizer_493292f82ccf7e666226","EntityExtractor_41ce96978797578b1f70","Finisher_48cf85e812c32dbcb468"]}} diff --git a/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/.part-00000.crc b/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/.part-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..1a19cb36f866631aabe1cdb99c672a63b6eb3cc3 GIT binary patch literal 12 TcmYc;N@ieSU}CU1|Iq^g688gp literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/_SUCCESS b/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/_SUCCESS new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 b/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 new file mode 100644 index 00000000000000..7028681f2adf91 --- /dev/null +++ b/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"com.johnsnowlabs.nlp.DocumentAssembler","timestamp":1513268235868,"sparkVersion":"2.1.1","uid":"DocumentAssembler_412ba9f96832acf4cb75","paramMap":{"inputCol":"text","outputCol":"document"}} diff --git a/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/.part-00000.crc b/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/.part-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..35c361627b95a9e5cf987436da1887a9f0408ea6 GIT binary patch literal 12 TcmYc;N@ieSU}CWGDl!HD5M=^p literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/_SUCCESS b/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/_SUCCESS new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 b/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 new file mode 100644 index 00000000000000..6eb80c1707539f --- /dev/null +++ b/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel","timestamp":1513268235914,"sparkVersion":"2.1.1","uid":"SentenceDetectorModel_4c7481f204d5778ca290","paramMap":{"inputCols":["document"],"useAbbreviations":false,"outputCol":"sentence"}} diff --git a/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/.part-00000.crc b/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/.part-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..ce437735e78a0f7bcc6a2b097a97b3f3a9b17fa4 GIT binary patch literal 12 TcmYc;N@ieSU}8|rpPL8(5bOfU literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/_SUCCESS b/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/_SUCCESS new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 b/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 new file mode 100644 index 00000000000000..24571f68adb3d3 --- /dev/null +++ b/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"com.johnsnowlabs.nlp.annotators.RegexTokenizer","timestamp":1513268235959,"sparkVersion":"2.1.1","uid":"RegexTokenizer_493292f82ccf7e666226","paramMap":{"inputCols":["document"],"pattern":"\\S+","outputCol":"token"}} diff --git a/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/.part-00000.crc b/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/.part-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..b0b8bb15f4922f62041d662e4bb530167322e23c GIT binary patch literal 12 TcmYc;N@ieSU}CUnxL*qZ5*q_g literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/_SUCCESS b/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/_SUCCESS new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 b/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 new file mode 100644 index 00000000000000..9208913f960926 --- /dev/null +++ b/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"com.johnsnowlabs.nlp.annotators.EntityExtractor","timestamp":1513268236038,"sparkVersion":"2.1.1","uid":"EntityExtractor_41ce96978797578b1f70","paramMap":{"insideSentences":true,"entitiesPath":"entities.txt","inputCols":["token","sentence"],"outputCol":"entites"}} diff --git a/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/.part-00000.crc b/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/.part-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..03d3a6a240cd7a2d0e0fc6c5d216dd799573f212 GIT binary patch literal 12 TcmYc;N@ieSU}A{m>i672&& literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/_SUCCESS b/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/_SUCCESS new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 b/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 new file mode 100644 index 00000000000000..57e8942147cdaf --- /dev/null +++ b/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"com.johnsnowlabs.nlp.Finisher","timestamp":1513268236089,"sparkVersion":"2.1.1","uid":"Finisher_48cf85e812c32dbcb468","paramMap":{"cleanAnnotations":true,"inputCols":["entites"],"includeKeys":true,"outputAsArray":false,"valueSplitSymbol":"#","annotationSplitSymbol":"@"}} diff --git a/python/example/entities-extractor/ner_pipeline/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_pipeline/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_pipeline/metadata/.part-00000.crc b/python/example/entities-extractor/ner_pipeline/metadata/.part-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..0b75c925f240882684abd2d514c3920c01895125 GIT binary patch literal 12 TcmYc;N@ieSU}E5Jh+zf*5BCBo literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_pipeline/metadata/_SUCCESS b/python/example/entities-extractor/ner_pipeline/metadata/_SUCCESS new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/example/entities-extractor/ner_pipeline/metadata/part-00000 b/python/example/entities-extractor/ner_pipeline/metadata/part-00000 new file mode 100644 index 00000000000000..fc47471064ef95 --- /dev/null +++ b/python/example/entities-extractor/ner_pipeline/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.Pipeline","timestamp":1513268234728,"sparkVersion":"2.1.1","uid":"Pipeline_4adb9432e07425015b1d","paramMap":{"stageUids":["DocumentAssembler_412ba9f96832acf4cb75","SentenceDetectorModel_4c7481f204d5778ca290","RegexTokenizer_493292f82ccf7e666226","EntityExtractor_41ce96978797578b1f70","Finisher_48cf85e812c32dbcb468"]}} diff --git a/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/.part-00000.crc b/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/.part-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..d61306999c78a9f7b8e3aaf1ff23a5bd2859024d GIT binary patch literal 12 TcmYc;N@ieSU}7*_^Wi4|6Cnfv literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/_SUCCESS b/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/_SUCCESS new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 b/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 new file mode 100644 index 00000000000000..5bb387bbf392a9 --- /dev/null +++ b/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"com.johnsnowlabs.nlp.DocumentAssembler","timestamp":1513268235340,"sparkVersion":"2.1.1","uid":"DocumentAssembler_412ba9f96832acf4cb75","paramMap":{"inputCol":"text","outputCol":"document"}} diff --git a/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/.part-00000.crc b/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/.part-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..5b9ffb70cf51489827ac73d9399263a47866a7e8 GIT binary patch literal 12 TcmYc;N@ieSU}9K(bFmx%6dMD8 literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/_SUCCESS b/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/_SUCCESS new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 b/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 new file mode 100644 index 00000000000000..111e7ed5f59c68 --- /dev/null +++ b/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel","timestamp":1513268235400,"sparkVersion":"2.1.1","uid":"SentenceDetectorModel_4c7481f204d5778ca290","paramMap":{"inputCols":["document"],"useAbbreviations":false,"outputCol":"sentence"}} diff --git a/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/.part-00000.crc b/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/.part-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..de8fccc9dbaddbc58bac6a0b0df14eb3a400c515 GIT binary patch literal 12 TcmYc;N@ieSU}ESKYE=LL5f}o1 literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/_SUCCESS b/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/_SUCCESS new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 b/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 new file mode 100644 index 00000000000000..c3d302e3f44253 --- /dev/null +++ b/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"com.johnsnowlabs.nlp.annotators.RegexTokenizer","timestamp":1513268235483,"sparkVersion":"2.1.1","uid":"RegexTokenizer_493292f82ccf7e666226","paramMap":{"inputCols":["document"],"pattern":"\\S+","outputCol":"token"}} diff --git a/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/.part-00000.crc b/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/.part-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..db7dfb44cc89bd3cc5c553b200cc2d90936ffab1 GIT binary patch literal 12 TcmYc;N@ieSU}DIt`#c>06E6d? literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/_SUCCESS b/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/_SUCCESS new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 b/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 new file mode 100644 index 00000000000000..9e20a0eb6abe2c --- /dev/null +++ b/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"com.johnsnowlabs.nlp.annotators.EntityExtractor","timestamp":1513268235660,"sparkVersion":"2.1.1","uid":"EntityExtractor_41ce96978797578b1f70","paramMap":{"insideSentences":true,"entitiesPath":"entities.txt","inputCols":["token","sentence"],"outputCol":"entites"}} diff --git a/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/.part-00000.crc b/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/.part-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..b030a2acb6ffda036c802d26d0c80244e7374c08 GIT binary patch literal 12 TcmYc;N@ieSU}AVL-?0<`6HWtm literal 0 HcmV?d00001 diff --git a/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/_SUCCESS b/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/_SUCCESS new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 b/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 new file mode 100644 index 00000000000000..47c98cac1960f6 --- /dev/null +++ b/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"com.johnsnowlabs.nlp.Finisher","timestamp":1513268235747,"sparkVersion":"2.1.1","uid":"Finisher_48cf85e812c32dbcb468","paramMap":{"cleanAnnotations":true,"inputCols":["entites"],"includeKeys":true,"outputAsArray":false,"valueSplitSymbol":"#","annotationSplitSymbol":"@"}} diff --git a/src/main/scala/com/johnsnowlabs/collections/SearchTrie.scala b/src/main/scala/com/johnsnowlabs/collections/SearchTrie.scala new file mode 100644 index 00000000000000..ec76dcc37ec848 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/collections/SearchTrie.scala @@ -0,0 +1,8 @@ +package com.johnsnowlabs.collections + +/** + * Created by aleksei on 14.12.17. + */ +class SearchTrie { + +} diff --git a/src/test/scala/com/johnsnowlabs/collections/SearchTrieSpec.scala b/src/test/scala/com/johnsnowlabs/collections/SearchTrieSpec.scala new file mode 100644 index 00000000000000..c5b98b3e85e568 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/collections/SearchTrieSpec.scala @@ -0,0 +1,8 @@ +package com.johnsnowlabs.collections + +/** + * Created by aleksei on 14.12.17. + */ +class SearchTrieSpec { + +} From f36c084b83f665087b56176ebdf6d67f7663f2a7 Mon Sep 17 00:00:00 2001 From: aleksei Date: Thu, 14 Dec 2017 19:19:34 +0300 Subject: [PATCH 2/4] Rewrite EntityExtractor --- python/sparknlp/annotator.py | 31 ++-- .../johnsnowlabs/collections/SearchTrie.scala | 145 ++++++++++++++++- .../nlp/annotators/EntityExtractor.scala | 150 +++++++++++------- .../collections/SearchTrieSpec.scala | 83 +++++++++- .../johnsnowlabs/nlp/AnnotatorBuilder.scala | 8 +- .../annotators/EntityExtractorTestSpec.scala | 41 ++++- 6 files changed, 367 insertions(+), 91 deletions(-) diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py index d358168695e114..d42e2b247a63cc 100755 --- a/python/sparknlp/annotator.py +++ b/python/sparknlp/annotator.py @@ -167,32 +167,27 @@ def setDateFormat(self, value): class EntityExtractor(AnnotatorTransformer): - maxLen = Param(Params._dummy(), - "maxLen", - "max amounts of words in a phrase", - typeConverter=TypeConverters.toInt) - requireSentences = Param(Params._dummy(), - "requireSentences", - "whether to require sbd in pipeline or not. Might improve performance on accuracy hit", + + entitiesPath = Param(Params._dummy(), + "entitiesPath", + "Path to entities (phrases) to extract", + typeConverter=TypeConverters.toString) + + insideSentences = Param(Params._dummy(), + "insideSentences", + "Should extractor search only within sentences borders?", typeConverter=TypeConverters.toBoolean) - entities = Param(Params._dummy(), - "entities", - "file path overrides config", - typeConverter=TypeConverters.toString) @keyword_only def __init__(self): super(EntityExtractor, self).__init__() self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.EntityExtractor", self.uid) - def setMaxLen(self, value): - return self._set(maxLen=value) - - def setRequireSentences(self, value): - return self._set(requireSentences=value) + def setInsideSentences(self, value): + return self._set(insideSentences=value) - def setEntities(self, value): - return self._set(entities=value) + def setEntitiesPath(self, value): + return self._set(entitiesPath=value) class PerceptronApproach(JavaEstimator, JavaMLWritable, JavaMLReadable, AnnotatorProperties): diff --git a/src/main/scala/com/johnsnowlabs/collections/SearchTrie.scala b/src/main/scala/com/johnsnowlabs/collections/SearchTrie.scala index ec76dcc37ec848..a831a20a9b38a6 100644 --- a/src/main/scala/com/johnsnowlabs/collections/SearchTrie.scala +++ b/src/main/scala/com/johnsnowlabs/collections/SearchTrie.scala @@ -1,8 +1,149 @@ package com.johnsnowlabs.collections +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + /** - * Created by aleksei on 14.12.17. + * Immutable Collection that used for fast substring search + * Implementation of Aho-Corasick algorithm https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm */ -class SearchTrie { +case class SearchTrie +( + vocabulary: Map[String, Int], + edges: Map[(Int, Int), Int], + + // In order to optimize 4 values are stored in the same Vector + // Pi - prefix function + // Is Leaf - whether node is leaf? + // Length - length from Root to node (in words) + // Previous Leaf - Link to leaf that suffix of current path from root + nodes: Vector[(Int, Boolean, Int, Int)] +) +{ + /** + * Searchs phrases in the text + * @param text test to search in + * @return Iterator with pairs of (begin, end) + */ + def search(text: Seq[String]): Seq[(Int, Int)] = { + var nodeId = 0 + val result = new ArrayBuffer[(Int, Int)]() + + def addResultIfNeed(nodeId: Int, index: Int): Unit = { + var currentId = nodeId + + while (currentId >= 0) { + if (isLeaf(currentId)) + result.append((index - length(currentId) + 1, index)) + + currentId = lastLeaf(currentId) + } + } + + for ((word, index) <- text.zipWithIndex) { + val wordId = vocabulary.getOrElse(word, -1) + if (wordId < 0) { + nodeId = 0 + } else { + var found = false + + while (nodeId > 0 && !found) { + val newId = edges.getOrElse((nodeId, wordId), -1) + if (newId < 0) { + nodeId = pi(nodeId) + } + else { + nodeId = newId + addResultIfNeed(nodeId, index) + found = true + } + } + + if (!found) { + nodeId = edges.getOrElse((nodeId, wordId), 0) + addResultIfNeed(nodeId, index) + } + } + } + + result + } + + def pi(nodeId: Int): Int = nodes(nodeId)._1 + + def isLeaf(nodeId: Int): Boolean = nodes(nodeId)._2 + + def length(nodeId: Int): Int = nodes(nodeId)._3 + + def lastLeaf(nodeId: Int): Int = nodes(nodeId)._4 +} + + +object SearchTrie { + def apply(phrases: Array[Array[String]]): SearchTrie = { + + // Have only root at the beginning + val vocab = mutable.Map[String, Int]() + val edges = mutable.Map[(Int, Int), Int]() + val parents = mutable.ArrayBuffer(0) + val parentWord = mutable.ArrayBuffer(0) + + val isLeaf = mutable.ArrayBuffer(false) + val length = mutable.ArrayBuffer(0) + + def addNode(parentNodeId: Int, wordId: Int): Int = { + parents.append(parentNodeId) + parentWord.append(wordId) + length.append(length(parentNodeId) + 1) + isLeaf.append(false) + + parents.length - 1 + } + + // Add every phrase as root from root in the tree + for (phrase <- phrases) { + var nodeId = 0 + + for (word <- phrase) { + val wordId = vocab.getOrElseUpdate(word, vocab.size) + nodeId = edges.getOrElseUpdate((nodeId, wordId), addNode(nodeId, wordId)) + } + + if (nodeId > 0) + isLeaf(nodeId) = true + } + + // Calculate pi function + val pi = mutable.ArrayBuffer[Int](0) + for (i <- 1 until parents.size) { + val wordId = parentWord(i) + var candidate = parents(i) + pi.append(0) + + while (candidate > 0) { + candidate = pi(candidate) + val answer = edges.getOrElse((candidate, wordId), 0) + if (answer > 0) { + pi(i) = answer + candidate = 0 + } + } + } + + val lastLeaf = ArrayBuffer[Int](-1) + for (i <- 1 until parents.size) { + lastLeaf.append(-1) + + val piNode = pi(i) + if (isLeaf(piNode)) + lastLeaf(i) = piNode + else + lastLeaf(i) = lastLeaf(piNode) + } + + val nodes = pi.zip(isLeaf).zip(length).zip(lastLeaf) + .map{case (((a,b),c),d) => (a,b,c,d)}.toVector + SearchTrie(vocab.toMap, edges.toMap, nodes) + } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala index 981f3144110227..13848223439e1b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala @@ -1,63 +1,85 @@ package com.johnsnowlabs.nlp.annotators +import com.johnsnowlabs.collections.SearchTrie import com.johnsnowlabs.nlp.util.ConfigHelper import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.nlp._ -import com.johnsnowlabs.nlp.annotators.common.{Tokenized, TokenizedSentence} +import com.johnsnowlabs.nlp.annotators.common.{IndexedToken, Tokenized} import com.typesafe.config.Config -import org.apache.spark.ml.param.{IntParam, Param} -import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.ml.param.{BooleanParam, Param} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} +import com.johnsnowlabs.nlp.AnnotatorType._ + +import scala.collection.mutable.ArrayBuffer -/** - * Created by alext on 10/23/16. - */ /** * Extracts entities out of provided phrases * @param uid internally required UID to make it writable - * @@ entities: Unique set of phrases - * @@ requireSentences: May use sentence boundaries provided by a previous SBD annotator - * @@ maxLen: Auto limit for phrase lenght + * @@ entitiesPath: Path to file with phrases to search + * @@ insideSentences: Should Extractor search only within sentence borders? */ class EntityExtractor(override val uid: String) extends AnnotatorModel[EntityExtractor] { - import com.johnsnowlabs.nlp.AnnotatorType._ - - val maxLen: IntParam = new IntParam(this, "maxLen", "maximum phrase length") - - val entities: Param[String] = new Param(this, "entities", "set of entities (phrases)") - private var loadedEntities: Array[Array[String]] = loadEntities + val entitiesPath = new Param[String](this, "entitiesPath", "Path to entities (phrases) to extract") + val insideSentences = new BooleanParam(this, "insideSentences", + "Should extractor search only within sentences borders?") override val annotatorType: AnnotatorType = ENTITY - override val requiredAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT) + override val requiredAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT, TOKEN) - setDefault(inputCols, Array(DOCUMENT)) + setDefault( + inputCols -> Array(DOCUMENT, TOKEN), + insideSentences -> true + ) /** internal constructor for writabale annotator */ def this() = this(Identifiable.randomUID("ENTITY_EXTRACTOR")) - def setEntities(value: String): this.type = { - set(entities, value) - loadedEntities = loadEntities + def setEntitiesPath(value: String): this.type = { + set(entitiesPath, value) this } - def getEntities: Array[Array[String]] = loadedEntities + def setInsideSentences(value: Boolean) = set(insideSentences, value) + + + lazy val stemmer = new Stemmer() + lazy val normalizer = new Normalizer() + + private def convertTokens(tokens: Seq[Annotation]): Seq[Annotation] = { + val stems = stemmer.annotate(tokens) + normalizer.annotate(stems) + } + + def getEntities: Array[Array[String]] = { + if (loadedPath != get(entitiesPath)) + loadEntities() + + loadedEntities + } + + def getSearchTrie: SearchTrie = { + if (loadedPath != get(entitiesPath)) + loadEntities() - def setMaxLen(value: Int): this.type = set(maxLen, value) + searchTrie + } - def getMaxLen: Int = $(maxLen) + private var loadedEntities = Array.empty[Array[String]] + private var loadedPath = get(entitiesPath) + private var searchTrie = SearchTrie(Array.empty) /** * Loads entities from a provided source. */ - private def loadEntities: Array[Array[String]] = { - val src = get(entities).map(path => EntityExtractor.retrieveEntityExtractorPhrases(path)) + private def loadEntities() = { + val src = get(entitiesPath) + .map(path => EntityExtractor.retrieveEntityExtractorPhrases(path)) .getOrElse(EntityExtractor.retrieveEntityExtractorPhrases()) + val tokenizer = new RegexTokenizer().setPattern("\\w+") - val stemmer = new Stemmer() - val normalizer = new Normalizer() val phrases: Array[Array[String]] = src.map { line => val annotation = Seq(Annotation(line)) @@ -66,42 +88,53 @@ class EntityExtractor(override val uid: String) extends AnnotatorModel[EntityExt val nTokens = normalizer.annotate(stems) nTokens.map(_.result).toArray } - phrases + + loadedEntities = phrases + searchTrie = SearchTrie.apply(loadedEntities) + loadedPath = get(entitiesPath) } /** - * matches entities depending on utilized annotators and stores them in the annotation - * @param sentence pads annotation content to phrase limits - * @param maxLen applies limit not to exceed results - * @param entities entities to find within annotators results - * @return + * Searches entities and stores them in the annotation + * @param text Tokenized text to search + * @return Extracted Entities */ - private def phraseMatch(sentence: TokenizedSentence, maxLen: Int, entities: Array[Array[String]]): Seq[Annotation] = { - val tokens = sentence.indexedTokens - tokens.padTo(tokens.length + maxLen - (tokens.length % maxLen), null).sliding(maxLen).flatMap { - window => - window.filter(_ != null).inits.filter { - phraseCandidate => - entities.contains(phraseCandidate.map(_.token)) - }.map { - phrase => - Annotation( - ENTITY, - phrase.head.begin, - phrase.last.end, - phrase.map(_.token).mkString(" "), - Map.empty[String, String] - ) - } - }.toSeq + private def search(text: Array[IndexedToken]): Seq[Annotation] = { + val words = text.map(t => t.token) + val result = ArrayBuffer[Annotation]() + + for ((begin, end) <- getSearchTrie.search(words)) { + val normalizedText = (begin to end).map(i => words(i)).mkString(" ") + + val annotation = Annotation( + ENTITY, + text(begin).begin, + text(end).end, + normalizedText, + Map() + ) + + result.append(annotation) + } + + result } /** Defines annotator phrase matching depending on whether we are using SBD or not */ override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { - val sentences = Tokenized.unpack(annotations) - sentences.flatMap{ sentence => - phraseMatch(sentence, $(maxLen), loadedEntities) - } + val stemmed = annotations.flatMap { + case a@Annotation(AnnotatorType.TOKEN, result, _, _, _) => + convertTokens(Seq(a)) + case a => Some(a) + } + + val sentences = Tokenized.unpack(stemmed) + if ($(insideSentences)) { + sentences.flatMap(sentence => search(sentence.indexedTokens)) + } else { + val allTokens = sentences.flatMap(s => s.indexedTokens).toArray + search(allTokens) + } } } @@ -114,9 +147,10 @@ object EntityExtractor extends DefaultParamsReadable[EntityExtractor] { entitiesPath: String = "__default", fileFormat: String = config.getString("nlp.entityExtractor.format") ): Array[String] = { - val filePath = if (entitiesPath == "__default") config.getString("nlp.entityExtractor.file") else entitiesPath + val filePath = if (entitiesPath == "__default") + config.getString("nlp.entityExtractor.file") + else entitiesPath + ResourceHelper.parseLinesText(filePath, fileFormat) } - - } \ No newline at end of file diff --git a/src/test/scala/com/johnsnowlabs/collections/SearchTrieSpec.scala b/src/test/scala/com/johnsnowlabs/collections/SearchTrieSpec.scala index c5b98b3e85e568..33b60921367024 100644 --- a/src/test/scala/com/johnsnowlabs/collections/SearchTrieSpec.scala +++ b/src/test/scala/com/johnsnowlabs/collections/SearchTrieSpec.scala @@ -1,8 +1,83 @@ package com.johnsnowlabs.collections -/** - * Created by aleksei on 14.12.17. - */ -class SearchTrieSpec { +import org.scalatest.FlatSpec + +class SearchTrieSpec extends FlatSpec { + val trie = SearchTrie( + Array( + Array("a", "b", "a", "b", "a"), + Array("a", "a", "a") + ) + ) + + val aTrie = SearchTrie( + Array( + Array("a", "a", "a", "a", "a"), + Array("a", "a", "a"), + Array("a", "a"), + Array("a") + ) + ) + + "SearchTrie" should "create correct encode words" in { + assert(trie.vocabulary.size == 2) + assert(trie.vocabulary("a") == 0) + assert(trie.vocabulary("b") == 1) + } + + "SearchTrie" should "create correct number of nodes" in { + assert(trie.nodes.size == 8) + } + + "SearchTrie" should "correct fill nodes info" in { + val isLeaf = Seq(false, false, false, false, false, true, false, true) + val trieIsLeaf = trie.nodes.map(n => n._2) + assert(trieIsLeaf == isLeaf) + + val length = Seq(0, 1, 2, 3, 4, 5, 2, 3) + val trieLength = trie.nodes.map(n => n._3) + assert(trieLength == length) + + val pi = Seq(0, 0, 0, 1, 2, 3, 1, 6) + val triePi = trie.nodes.map(n => n._1) + assert(triePi == pi) + } + + "SearchTrie" should "correct search" in { + val text = "a b a a a b a b a a a a".split(" ") + val result = trie.search(text) + + assert(result == Seq((2, 4), (4, 8), (8, 10), (9, 11))) + } + + "SearchTrie" should "correct handle out of vocabulary words" in { + val text = "a b a c a b a b a c a a a".split(" ") + val result = trie.search(text) + + assert(result == Seq((4, 8), (10, 12))) + } + + "SearchTrie" should "correctly calculate lastLeaf" in { + val lastLeafs = aTrie.nodes.map(n => n._4) + val expected = Seq(-1, -1, 1, 2, 3, 3) + + assert(lastLeafs == expected) + } + + "SearchTrie" should "correctly find substrings" in { + val text = "a a a a c a a a a a a".split(" ") + val result = aTrie.search(text) + val shouldFound = + Seq((0, 0), (1, 1), (2, 2), (3, 3), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (10, 10)) ++ + Seq((0, 1), (1, 2), (2, 3), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10)) ++ + Seq((0, 2), (1, 3), (5, 7), (6, 8), (7, 9), (8, 10)) ++ + Seq((5, 9), (6, 10)) + + for (pair <- shouldFound) { + assert(result.contains(pair)) + } + + assert(result.size == shouldFound.size) + } } diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala index d162dfc836cdca..fd2933382dd9b0 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala @@ -55,11 +55,13 @@ object AnnotatorBuilder extends FlatSpec { this: Suite => lemmatizer.transform(tokenized) } - def withFullEntityExtractor(dataset: Dataset[Row]): Dataset[Row] = { + def withFullEntityExtractor(dataset: Dataset[Row], insideSentences: Boolean = true): Dataset[Row] = { val entityExtractor = new EntityExtractor() - .setMaxLen(4) + .setInputCols("sentence", "token") + .setInsideSentences(insideSentences) + .setEntitiesPath("/entity-extractor/test-phrases.txt") .setOutputCol("entity") - entityExtractor.transform(withFullLemmatizer(dataset)) + entityExtractor.transform(withTokenizer(dataset)) } def withFullPragmaticSentenceDetector(dataset: Dataset[Row]): Dataset[Row] = { diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/EntityExtractorTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/EntityExtractorTestSpec.scala index dcee20850d6c75..492a2627001145 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/EntityExtractorTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/EntityExtractorTestSpec.scala @@ -1,17 +1,46 @@ package com.johnsnowlabs.nlp.annotators -import com.johnsnowlabs.nlp.{AnnotatorType, ContentProvider, DataBuilder} +import com.johnsnowlabs.nlp.AnnotatorType._ +import com.johnsnowlabs.nlp._ import org.apache.spark.sql.{Dataset, Row} import org.scalatest._ -/** - * Created by saif on 02/05/17. - */ + class EntityExtractorTestSpec extends FlatSpec with EntityExtractorBehaviors { - "An EntityExtractor" should s"be of type ${AnnotatorType.ENTITY}" in { + "An EntityExtractor" should s"be of type ${ENTITY}" in { val entityExtractor = new EntityExtractor - assert(entityExtractor.annotatorType == AnnotatorType.ENTITY) + assert(entityExtractor.annotatorType == ENTITY) + } + + "An EntityExtractor" should "extracts entities" in { + val dataset = DataBuilder.basicDataBuild("Hello dolore magna aliqua Lorem ipsum dolor sit in laborum") + val result = AnnotatorBuilder.withFullEntityExtractor(dataset) + val extracted = Annotation.collect(result, "entity").flatten.toSeq + + val expected = Seq( + Annotation(ENTITY, 6, 24, "dolor magna aliqua", Map()), + Annotation(ENTITY, 26, 46, "lorem ipsum dolor sit", Map()), + Annotation(ENTITY, 51, 57, "laborum", Map()) + ) + + assert(extracted == expected) + } + + "An Entity Extractor" should "search inside sentences" in { + val dataset = DataBuilder.basicDataBuild("Hello dolore magna. Aliqua") + val result = AnnotatorBuilder.withFullEntityExtractor(dataset) + val extracted = Annotation.collect(result, "entity").flatten.toSeq + + assert(extracted == Seq.empty[Annotation]) + } + + "An Entity Extractor" should "search in all text" in { + val dataset = DataBuilder.basicDataBuild("Hello dolore magna. Aliqua") + val result = AnnotatorBuilder.withFullEntityExtractor(dataset, false) + val extracted = Annotation.collect(result, "entity").flatten.toSeq + + assert(extracted.length == 1) } val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody) From 6e4e23f2368f6e88bf2682607534392123f78832 Mon Sep 17 00:00:00 2001 From: aleksei Date: Thu, 14 Dec 2017 19:26:07 +0300 Subject: [PATCH 3/4] Remove useless folders --- .../ner_model/metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../ner_model/metadata/.part-00000.crc | Bin 12 -> 0 bytes .../entities-extractor/ner_model/metadata/_SUCCESS | 0 .../ner_model/metadata/part-00000 | 1 - .../metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../metadata/.part-00000.crc | Bin 12 -> 0 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 - .../metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../metadata/.part-00000.crc | Bin 12 -> 0 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 - .../metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../metadata/.part-00000.crc | Bin 12 -> 0 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 - .../metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../metadata/.part-00000.crc | Bin 12 -> 0 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 - .../metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../metadata/.part-00000.crc | Bin 12 -> 0 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 - .../ner_pipeline/metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../ner_pipeline/metadata/.part-00000.crc | Bin 12 -> 0 bytes .../ner_pipeline/metadata/_SUCCESS | 0 .../ner_pipeline/metadata/part-00000 | 1 - .../metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../metadata/.part-00000.crc | Bin 12 -> 0 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 - .../metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../metadata/.part-00000.crc | Bin 12 -> 0 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 - .../metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../metadata/.part-00000.crc | Bin 12 -> 0 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 - .../metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../metadata/.part-00000.crc | Bin 12 -> 0 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 - .../metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../metadata/.part-00000.crc | Bin 12 -> 0 bytes .../metadata/_SUCCESS | 0 .../metadata/part-00000 | 1 - 48 files changed, 12 deletions(-) delete mode 100644 python/example/entities-extractor/ner_model/metadata/._SUCCESS.crc delete mode 100644 python/example/entities-extractor/ner_model/metadata/.part-00000.crc delete mode 100644 python/example/entities-extractor/ner_model/metadata/_SUCCESS delete mode 100644 python/example/entities-extractor/ner_model/metadata/part-00000 delete mode 100644 python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/._SUCCESS.crc delete mode 100644 python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/.part-00000.crc delete mode 100644 python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/_SUCCESS delete mode 100644 python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 delete mode 100644 python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/._SUCCESS.crc delete mode 100644 python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/.part-00000.crc delete mode 100644 python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/_SUCCESS delete mode 100644 python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 delete mode 100644 python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/._SUCCESS.crc delete mode 100644 python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/.part-00000.crc delete mode 100644 python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/_SUCCESS delete mode 100644 python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 delete mode 100644 python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/._SUCCESS.crc delete mode 100644 python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/.part-00000.crc delete mode 100644 python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/_SUCCESS delete mode 100644 python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 delete mode 100644 python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/._SUCCESS.crc delete mode 100644 python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/.part-00000.crc delete mode 100644 python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/_SUCCESS delete mode 100644 python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 delete mode 100644 python/example/entities-extractor/ner_pipeline/metadata/._SUCCESS.crc delete mode 100644 python/example/entities-extractor/ner_pipeline/metadata/.part-00000.crc delete mode 100644 python/example/entities-extractor/ner_pipeline/metadata/_SUCCESS delete mode 100644 python/example/entities-extractor/ner_pipeline/metadata/part-00000 delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/._SUCCESS.crc delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/.part-00000.crc delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/_SUCCESS delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/._SUCCESS.crc delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/.part-00000.crc delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/_SUCCESS delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/._SUCCESS.crc delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/.part-00000.crc delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/_SUCCESS delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/._SUCCESS.crc delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/.part-00000.crc delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/_SUCCESS delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/._SUCCESS.crc delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/.part-00000.crc delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/_SUCCESS delete mode 100644 python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 diff --git a/python/example/entities-extractor/ner_model/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_model/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/python/example/entities-extractor/ner_model/metadata/.part-00000.crc b/python/example/entities-extractor/ner_model/metadata/.part-00000.crc deleted file mode 100644 index 4a68940a54abe100c0cf1b39e9c598c51d5b528b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}8wWH{BZm6E6c| diff --git a/python/example/entities-extractor/ner_model/metadata/_SUCCESS b/python/example/entities-extractor/ner_model/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/python/example/entities-extractor/ner_model/metadata/part-00000 b/python/example/entities-extractor/ner_model/metadata/part-00000 deleted file mode 100644 index d80fc473f1dc07..00000000000000 --- a/python/example/entities-extractor/ner_model/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ -{"class":"org.apache.spark.ml.PipelineModel","timestamp":1513268235824,"sparkVersion":"2.1.1","uid":"PipelineModel_449885197a0f0499a0e3","paramMap":{"stageUids":["DocumentAssembler_412ba9f96832acf4cb75","SentenceDetectorModel_4c7481f204d5778ca290","RegexTokenizer_493292f82ccf7e666226","EntityExtractor_41ce96978797578b1f70","Finisher_48cf85e812c32dbcb468"]}} diff --git a/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/.part-00000.crc b/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/.part-00000.crc deleted file mode 100644 index 1a19cb36f866631aabe1cdb99c672a63b6eb3cc3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}CU1|Iq^g688gp diff --git a/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/_SUCCESS b/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 b/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 deleted file mode 100644 index 7028681f2adf91..00000000000000 --- a/python/example/entities-extractor/ner_model/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ -{"class":"com.johnsnowlabs.nlp.DocumentAssembler","timestamp":1513268235868,"sparkVersion":"2.1.1","uid":"DocumentAssembler_412ba9f96832acf4cb75","paramMap":{"inputCol":"text","outputCol":"document"}} diff --git a/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/.part-00000.crc b/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/.part-00000.crc deleted file mode 100644 index 35c361627b95a9e5cf987436da1887a9f0408ea6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}CWGDl!HD5M=^p diff --git a/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/_SUCCESS b/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 b/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 deleted file mode 100644 index 6eb80c1707539f..00000000000000 --- a/python/example/entities-extractor/ner_model/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ -{"class":"com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel","timestamp":1513268235914,"sparkVersion":"2.1.1","uid":"SentenceDetectorModel_4c7481f204d5778ca290","paramMap":{"inputCols":["document"],"useAbbreviations":false,"outputCol":"sentence"}} diff --git a/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/.part-00000.crc b/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/.part-00000.crc deleted file mode 100644 index ce437735e78a0f7bcc6a2b097a97b3f3a9b17fa4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}8|rpPL8(5bOfU diff --git a/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/_SUCCESS b/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 b/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 deleted file mode 100644 index 24571f68adb3d3..00000000000000 --- a/python/example/entities-extractor/ner_model/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ -{"class":"com.johnsnowlabs.nlp.annotators.RegexTokenizer","timestamp":1513268235959,"sparkVersion":"2.1.1","uid":"RegexTokenizer_493292f82ccf7e666226","paramMap":{"inputCols":["document"],"pattern":"\\S+","outputCol":"token"}} diff --git a/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/.part-00000.crc b/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/.part-00000.crc deleted file mode 100644 index b0b8bb15f4922f62041d662e4bb530167322e23c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}CUnxL*qZ5*q_g diff --git a/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/_SUCCESS b/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 b/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 deleted file mode 100644 index 9208913f960926..00000000000000 --- a/python/example/entities-extractor/ner_model/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ -{"class":"com.johnsnowlabs.nlp.annotators.EntityExtractor","timestamp":1513268236038,"sparkVersion":"2.1.1","uid":"EntityExtractor_41ce96978797578b1f70","paramMap":{"insideSentences":true,"entitiesPath":"entities.txt","inputCols":["token","sentence"],"outputCol":"entites"}} diff --git a/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/.part-00000.crc b/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/.part-00000.crc deleted file mode 100644 index 03d3a6a240cd7a2d0e0fc6c5d216dd799573f212..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}A{m>i672&& diff --git a/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/_SUCCESS b/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 b/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 deleted file mode 100644 index 57e8942147cdaf..00000000000000 --- a/python/example/entities-extractor/ner_model/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ -{"class":"com.johnsnowlabs.nlp.Finisher","timestamp":1513268236089,"sparkVersion":"2.1.1","uid":"Finisher_48cf85e812c32dbcb468","paramMap":{"cleanAnnotations":true,"inputCols":["entites"],"includeKeys":true,"outputAsArray":false,"valueSplitSymbol":"#","annotationSplitSymbol":"@"}} diff --git a/python/example/entities-extractor/ner_pipeline/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_pipeline/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/python/example/entities-extractor/ner_pipeline/metadata/.part-00000.crc b/python/example/entities-extractor/ner_pipeline/metadata/.part-00000.crc deleted file mode 100644 index 0b75c925f240882684abd2d514c3920c01895125..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}E5Jh+zf*5BCBo diff --git a/python/example/entities-extractor/ner_pipeline/metadata/_SUCCESS b/python/example/entities-extractor/ner_pipeline/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/python/example/entities-extractor/ner_pipeline/metadata/part-00000 b/python/example/entities-extractor/ner_pipeline/metadata/part-00000 deleted file mode 100644 index fc47471064ef95..00000000000000 --- a/python/example/entities-extractor/ner_pipeline/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ -{"class":"org.apache.spark.ml.Pipeline","timestamp":1513268234728,"sparkVersion":"2.1.1","uid":"Pipeline_4adb9432e07425015b1d","paramMap":{"stageUids":["DocumentAssembler_412ba9f96832acf4cb75","SentenceDetectorModel_4c7481f204d5778ca290","RegexTokenizer_493292f82ccf7e666226","EntityExtractor_41ce96978797578b1f70","Finisher_48cf85e812c32dbcb468"]}} diff --git a/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/.part-00000.crc b/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/.part-00000.crc deleted file mode 100644 index d61306999c78a9f7b8e3aaf1ff23a5bd2859024d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}7*_^Wi4|6Cnfv diff --git a/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/_SUCCESS b/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 b/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 deleted file mode 100644 index 5bb387bbf392a9..00000000000000 --- a/python/example/entities-extractor/ner_pipeline/stages/0_DocumentAssembler_412ba9f96832acf4cb75/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ -{"class":"com.johnsnowlabs.nlp.DocumentAssembler","timestamp":1513268235340,"sparkVersion":"2.1.1","uid":"DocumentAssembler_412ba9f96832acf4cb75","paramMap":{"inputCol":"text","outputCol":"document"}} diff --git a/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/.part-00000.crc b/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/.part-00000.crc deleted file mode 100644 index 5b9ffb70cf51489827ac73d9399263a47866a7e8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}9K(bFmx%6dMD8 diff --git a/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/_SUCCESS b/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 b/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 deleted file mode 100644 index 111e7ed5f59c68..00000000000000 --- a/python/example/entities-extractor/ner_pipeline/stages/1_SentenceDetectorModel_4c7481f204d5778ca290/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ -{"class":"com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel","timestamp":1513268235400,"sparkVersion":"2.1.1","uid":"SentenceDetectorModel_4c7481f204d5778ca290","paramMap":{"inputCols":["document"],"useAbbreviations":false,"outputCol":"sentence"}} diff --git a/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/.part-00000.crc b/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/.part-00000.crc deleted file mode 100644 index de8fccc9dbaddbc58bac6a0b0df14eb3a400c515..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}ESKYE=LL5f}o1 diff --git a/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/_SUCCESS b/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 b/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 deleted file mode 100644 index c3d302e3f44253..00000000000000 --- a/python/example/entities-extractor/ner_pipeline/stages/2_RegexTokenizer_493292f82ccf7e666226/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ -{"class":"com.johnsnowlabs.nlp.annotators.RegexTokenizer","timestamp":1513268235483,"sparkVersion":"2.1.1","uid":"RegexTokenizer_493292f82ccf7e666226","paramMap":{"inputCols":["document"],"pattern":"\\S+","outputCol":"token"}} diff --git a/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/.part-00000.crc b/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/.part-00000.crc deleted file mode 100644 index db7dfb44cc89bd3cc5c553b200cc2d90936ffab1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}DIt`#c>06E6d? diff --git a/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/_SUCCESS b/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 b/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 deleted file mode 100644 index 9e20a0eb6abe2c..00000000000000 --- a/python/example/entities-extractor/ner_pipeline/stages/3_EntityExtractor_41ce96978797578b1f70/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ -{"class":"com.johnsnowlabs.nlp.annotators.EntityExtractor","timestamp":1513268235660,"sparkVersion":"2.1.1","uid":"EntityExtractor_41ce96978797578b1f70","paramMap":{"insideSentences":true,"entitiesPath":"entities.txt","inputCols":["token","sentence"],"outputCol":"entites"}} diff --git a/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/._SUCCESS.crc b/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/.part-00000.crc b/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/.part-00000.crc deleted file mode 100644 index b030a2acb6ffda036c802d26d0c80244e7374c08..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}AVL-?0<`6HWtm diff --git a/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/_SUCCESS b/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 b/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 deleted file mode 100644 index 47c98cac1960f6..00000000000000 --- a/python/example/entities-extractor/ner_pipeline/stages/4_Finisher_48cf85e812c32dbcb468/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ -{"class":"com.johnsnowlabs.nlp.Finisher","timestamp":1513268235747,"sparkVersion":"2.1.1","uid":"Finisher_48cf85e812c32dbcb468","paramMap":{"cleanAnnotations":true,"inputCols":["entites"],"includeKeys":true,"outputAsArray":false,"valueSplitSymbol":"#","annotationSplitSymbol":"@"}} From 11577c9e6e1d292588a0f25610d8b51b5e22362a Mon Sep 17 00:00:00 2001 From: Saif Addin Date: Sat, 16 Dec 2017 20:10:05 -0300 Subject: [PATCH 4/4] - Few syntax improvements - Removed built-in normalization and stemming from annotate, let to user decide - Removed built-in stemming to input data. Could be improved with settings - Fixed unit testing accordingly - Tested python - Cleaned notebook output --- .../entities-extractor/extractor.ipynb | 121 ++++-------------- python/test/annotators.py | 6 +- .../nlp/annotators/EntityExtractor.scala | 38 ++---- .../johnsnowlabs/nlp/AnnotatorBuilder.scala | 10 +- .../annotators/EntityExtractorTestSpec.scala | 8 +- 5 files changed, 52 insertions(+), 131 deletions(-) diff --git a/python/example/entities-extractor/extractor.ipynb b/python/example/entities-extractor/extractor.ipynb index 88e46c3100eea1..6d46d300f22ed4 100644 --- a/python/example/entities-extractor/extractor.ipynb +++ b/python/example/entities-extractor/extractor.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "collapsed": true }, @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "collapsed": true }, @@ -39,9 +39,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -80,44 +80,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------+---------+--------------------+\n", - "|itemid|sentiment| text|\n", - "+------+---------+--------------------+\n", - "| 1| 0| ...|\n", - "| 2| 0| ...|\n", - "| 3| 1| omg...|\n", - "| 4| 0| .. Omga...|\n", - "| 5| 0| i think ...|\n", - "| 6| 0| or i jus...|\n", - "| 7| 1| Juuuuuuuuu...|\n", - "| 8| 0| Sunny Agai...|\n", - "| 9| 1| handed in m...|\n", - "| 10| 1| hmmmm.... i...|\n", - "| 11| 0| I must thin...|\n", - "| 12| 1| thanks to a...|\n", - "| 13| 0| this weeken...|\n", - "| 14| 0| jb isnt show...|\n", - "| 15| 0| ok thats it ...|\n", - "| 16| 0| <-------- ...|\n", - "| 17| 0| awhhe man.......|\n", - "| 18| 1| Feeling stran...|\n", - "| 19| 0| HUGE roll of ...|\n", - "| 20| 0| I just cut my...|\n", - "+------+---------+--------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "#Load the input data to be annotated\n", "data = spark. \\\n", @@ -131,21 +96,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { - "collapsed": false, "scrolled": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Start fitting\n", - "Fitting is ended\n" - ] - } - ], + "outputs": [], "source": [ "print(\"Start fitting\")\n", "model = pipeline.fit(data)\n", @@ -154,44 +109,9 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------+---------+--------------------+-------------------+\n", - "|itemid|sentiment| text| finished_entites|\n", - "+------+---------+--------------------+-------------------+\n", - "| 1| 0| ...| |\n", - "| 2| 0| ...| |\n", - "| 3| 1| omg...| |\n", - "| 4| 0| .. Omga...| |\n", - "| 5| 0| i think ...| result->i think|\n", - "| 6| 0| or i jus...| |\n", - "| 7| 1| Juuuuuuuuu...| |\n", - "| 8| 0| Sunny Agai...| |\n", - "| 9| 1| handed in m...| |\n", - "| 10| 1| hmmmm.... i...| |\n", - "| 11| 0| I must thin...| |\n", - "| 12| 1| thanks to a...| |\n", - "| 13| 0| this weeken...| |\n", - "| 14| 0| jb isnt show...| |\n", - "| 15| 0| ok thats it ...| |\n", - "| 16| 0| <-------- ...| |\n", - "| 17| 0| awhhe man.......| |\n", - "| 18| 1| Feeling stran...|result->feel strang|\n", - "| 19| 0| HUGE roll of ...| |\n", - "| 20| 0| I just cut my...| |\n", - "+------+---------+--------------------+-------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "extracted = model.transform(data)\n", "extracted.show()" @@ -199,9 +119,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -211,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "collapsed": true, "scrolled": false @@ -223,12 +143,21 @@ "Pipeline.read().load(\"./ner_pipeline\")\n", "sameModel = PipelineModel.read().load(\"./ner_model\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python [default]", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/python/test/annotators.py b/python/test/annotators.py index 596578116ff589..a4197a4328d8b6 100644 --- a/python/test/annotators.py +++ b/python/test/annotators.py @@ -101,11 +101,13 @@ def runTest(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") + tokenizer = RegexTokenizer() \ + .setOutputCol("token") entity_extractor = EntityExtractor() \ - .setMaxLen(4) \ .setOutputCol("entity") assembled = document_assembler.transform(self.data) - entity_extractor.transform(assembled).show() + tokenized = tokenizer.transform(assembled) + entity_extractor.transform(tokenized).show() class PerceptronApproachTestSpec(unittest.TestCase): diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala index 13848223439e1b..3d679de761c459 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala @@ -7,7 +7,7 @@ import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common.{IndexedToken, Tokenized} import com.typesafe.config.Config import org.apache.spark.ml.param.{BooleanParam, Param} -import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import com.johnsnowlabs.nlp.AnnotatorType._ import scala.collection.mutable.ArrayBuffer @@ -37,21 +37,9 @@ class EntityExtractor(override val uid: String) extends AnnotatorModel[EntityExt /** internal constructor for writabale annotator */ def this() = this(Identifiable.randomUID("ENTITY_EXTRACTOR")) - def setEntitiesPath(value: String): this.type = { - set(entitiesPath, value) - this - } - - def setInsideSentences(value: Boolean) = set(insideSentences, value) - + def setEntitiesPath(value: String): this.type = set(entitiesPath, value) - lazy val stemmer = new Stemmer() - lazy val normalizer = new Normalizer() - - private def convertTokens(tokens: Seq[Annotation]): Seq[Annotation] = { - val stems = stemmer.annotate(tokens) - normalizer.annotate(stems) - } + def setInsideSentences(value: Boolean): this.type = set(insideSentences, value) def getEntities: Array[Array[String]] = { if (loadedPath != get(entitiesPath)) @@ -74,18 +62,18 @@ class EntityExtractor(override val uid: String) extends AnnotatorModel[EntityExt /** * Loads entities from a provided source. */ - private def loadEntities() = { + private def loadEntities(): Unit = { val src = get(entitiesPath) .map(path => EntityExtractor.retrieveEntityExtractorPhrases(path)) .getOrElse(EntityExtractor.retrieveEntityExtractorPhrases()) val tokenizer = new RegexTokenizer().setPattern("\\w+") + val normalizer = new Normalizer() val phrases: Array[Array[String]] = src.map { line => val annotation = Seq(Annotation(line)) val tokens = tokenizer.annotate(annotation) - val stems = stemmer.annotate(tokens) - val nTokens = normalizer.annotate(stems) + val nTokens = normalizer.annotate(tokens) nTokens.map(_.result).toArray } @@ -122,13 +110,13 @@ class EntityExtractor(override val uid: String) extends AnnotatorModel[EntityExt /** Defines annotator phrase matching depending on whether we are using SBD or not */ override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { - val stemmed = annotations.flatMap { - case a@Annotation(AnnotatorType.TOKEN, result, _, _, _) => - convertTokens(Seq(a)) + val tokens = annotations.flatMap { + case a@Annotation(AnnotatorType.TOKEN, _, _, _, _) => + Seq(a) case a => Some(a) } - val sentences = Tokenized.unpack(stemmed) + val sentences = Tokenized.unpack(tokens) if ($(insideSentences)) { sentences.flatMap(sentence => search(sentence.indexedTokens)) } else { @@ -144,9 +132,9 @@ object EntityExtractor extends DefaultParamsReadable[EntityExtractor] { private val config: Config = ConfigHelper.retrieve protected def retrieveEntityExtractorPhrases( - entitiesPath: String = "__default", - fileFormat: String = config.getString("nlp.entityExtractor.format") - ): Array[String] = { + entitiesPath: String = "__default", + fileFormat: String = config.getString("nlp.entityExtractor.format") + ): Array[String] = { val filePath = if (entitiesPath == "__default") config.getString("nlp.entityExtractor.file") else entitiesPath diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala index fd2933382dd9b0..35f79f9d8a40d4 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala @@ -41,9 +41,9 @@ object AnnotatorBuilder extends FlatSpec { this: Suite => def withFullNormalizer(dataset: Dataset[Row]): Dataset[Row] = { val normalizer = new Normalizer() - .setInputCols(Array("stem")) + .setInputCols(Array("token")) .setOutputCol("normalized") - normalizer.transform(withFullStemmer(dataset)) + normalizer.transform(withTokenizer(dataset)) } def withFullLemmatizer(dataset: Dataset[Row]): Dataset[Row] = { @@ -57,11 +57,13 @@ object AnnotatorBuilder extends FlatSpec { this: Suite => def withFullEntityExtractor(dataset: Dataset[Row], insideSentences: Boolean = true): Dataset[Row] = { val entityExtractor = new EntityExtractor() - .setInputCols("sentence", "token") + .setInputCols("sentence", "normalized") .setInsideSentences(insideSentences) .setEntitiesPath("/entity-extractor/test-phrases.txt") .setOutputCol("entity") - entityExtractor.transform(withTokenizer(dataset)) + entityExtractor.transform( + withFullNormalizer( + withTokenizer(dataset))) } def withFullPragmaticSentenceDetector(dataset: Dataset[Row]): Dataset[Row] = { diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/EntityExtractorTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/EntityExtractorTestSpec.scala index 492a2627001145..34612a81e91544 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/EntityExtractorTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/EntityExtractorTestSpec.scala @@ -8,18 +8,18 @@ import org.scalatest._ class EntityExtractorTestSpec extends FlatSpec with EntityExtractorBehaviors { - "An EntityExtractor" should s"be of type ${ENTITY}" in { + "An EntityExtractor" should s"be of type $ENTITY" in { val entityExtractor = new EntityExtractor assert(entityExtractor.annotatorType == ENTITY) } - "An EntityExtractor" should "extracts entities" in { + "An EntityExtractor" should "extract entities" in { val dataset = DataBuilder.basicDataBuild("Hello dolore magna aliqua Lorem ipsum dolor sit in laborum") val result = AnnotatorBuilder.withFullEntityExtractor(dataset) val extracted = Annotation.collect(result, "entity").flatten.toSeq val expected = Seq( - Annotation(ENTITY, 6, 24, "dolor magna aliqua", Map()), + Annotation(ENTITY, 6, 24, "dolore magna aliqua", Map()), Annotation(ENTITY, 26, 46, "lorem ipsum dolor sit", Map()), Annotation(ENTITY, 51, 57, "laborum", Map()) ) @@ -37,7 +37,7 @@ class EntityExtractorTestSpec extends FlatSpec with EntityExtractorBehaviors { "An Entity Extractor" should "search in all text" in { val dataset = DataBuilder.basicDataBuild("Hello dolore magna. Aliqua") - val result = AnnotatorBuilder.withFullEntityExtractor(dataset, false) + val result = AnnotatorBuilder.withFullEntityExtractor(dataset, insideSentences = false) val extracted = Annotation.collect(result, "entity").flatten.toSeq assert(extracted.length == 1)