Merge pull request #64 from JohnSnowLabs/rewrite_entity_extractor
Rewrite entity extractor
Showing 9 changed files with 558 additions and 93 deletions.
@@ -0,0 +1,2 @@
I think
Feeling strange
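The two lines above are presumably the entities.txt dictionary that the notebook below points at via setEntitiesPath: one phrase per line. As a minimal sketch, assuming plain whitespace tokenization (the real parsing belongs to the rewritten EntityExtractor elsewhere in this PR), such a file could be turned into the phrase arrays that the SearchTrie added later in this diff consumes:

import scala.io.Source

// Hypothetical helper, not part of this PR: read one phrase per line and
// tokenize on whitespace, yielding input suitable for SearchTrie.apply
def loadPhrases(path: String): Array[Array[String]] = {
  val source = Source.fromFile(path)
  try source.getLines().map(_.trim).filter(_.nonEmpty).map(_.split("\\s+")).toArray
  finally source.close()
}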
@@ -0,0 +1,179 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": [
        "import sys\n",
        "sys.path.append('../../')\n",
        "\n",
        "from pyspark.sql import SparkSession\n",
        "from pyspark.ml import Pipeline\n",
        "\n",
        "from sparknlp.annotator import *\n",
        "from sparknlp.common import *\n",
        "from sparknlp.base import *"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": [
        "spark = SparkSession.builder \\\n",
        "    .appName(\"ner\")\\\n",
        "    .master(\"local[1]\")\\\n",
        "    .config(\"spark.driver.memory\", \"4G\")\\\n",
        "    .config(\"spark.driver.maxResultSize\", \"2G\")\\\n",
        "    .config(\"spark.jars\", \"lib/sparknlp.jar\")\\\n",
        "    .config(\"spark.kryoserializer.buffer.max\", \"500m\")\\\n",
        "    .getOrCreate()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": [
        "import time\n",
        "\n",
        "documentAssembler = DocumentAssembler()\\\n",
        "    .setInputCol(\"text\")\\\n",
        "    .setOutputCol(\"document\")\n",
        "\n",
        "sentenceDetector = SentenceDetectorModel()\\\n",
        "    .setInputCols([\"document\"])\\\n",
        "    .setOutputCol(\"sentence\")\n",
        "\n",
        "tokenizer = RegexTokenizer()\\\n",
        "    .setInputCols([\"document\"])\\\n",
        "    .setOutputCol(\"token\")\n",
        "\n",
        "extractor = EntityExtractor()\\\n",
        "    .setEntitiesPath(\"entities.txt\")\\\n",
        "    .setInputCols([\"token\", \"sentence\"])\\\n",
        "    .setOutputCol(\"entities\")\n",
        "\n",
        "finisher = Finisher() \\\n",
        "    .setInputCols([\"entities\"]) \\\n",
        "    .setIncludeKeys(True)\n",
        "\n",
        "pipeline = Pipeline(\n",
        "    stages = [\n",
        "        documentAssembler,\n",
        "        sentenceDetector,\n",
        "        tokenizer,\n",
        "        extractor,\n",
        "        finisher\n",
        "    ])\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Load the input data to be annotated\n",
        "data = spark. \\\n",
        "    read. \\\n",
        "    parquet(\"../../../src/test/resources/sentiment.parquet\"). \\\n",
        "    limit(1000)\n",
        "data.cache()\n",
        "data.count()\n",
        "data.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": false
      },
      "outputs": [],
      "source": [
        "print(\"Start fitting\")\n",
        "model = pipeline.fit(data)\n",
        "print(\"Fitting finished\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "extracted = model.transform(data)\n",
        "extracted.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": [
        "pipeline.write().overwrite().save(\"./ner_pipeline\")\n",
        "model.write().overwrite().save(\"./ner_model\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true,
        "scrolled": false
      },
      "outputs": [],
      "source": [
        "from pyspark.ml import PipelineModel, Pipeline\n",
        "\n",
        "samePipeline = Pipeline.read().load(\"./ner_pipeline\")\n",
        "sameModel = PipelineModel.read().load(\"./ner_model\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.5.2"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 1
}
src/main/scala/com/johnsnowlabs/collections/SearchTrie.scala (149 additions, 0 deletions)
@@ -0,0 +1,149 @@
package com.johnsnowlabs.collections

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer


/**
  * Immutable collection used for fast substring search.
  * Implementation of the Aho-Corasick algorithm:
  * https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
  */
case class SearchTrie
(
  vocabulary: Map[String, Int],
  edges: Map[(Int, Int), Int],

  // To speed up traversal, four values are stored per node in the same Vector:
  // pi - prefix (failure) function: the node reached by the longest proper
  //   suffix of this node's path that is also a path from the root
  // isLeaf - whether the node marks the end of a phrase
  // length - length of the path from the root to the node (in words)
  // previousLeaf - link to the nearest leaf whose phrase is a suffix of the
  //   current path from the root (-1 if none)
  nodes: Vector[(Int, Boolean, Int, Int)]
)
{
  /**
    * Searches for phrases in the text
    * @param text text to search in
    * @return sequence of (begin, end) token index pairs
    */
  def search(text: Seq[String]): Seq[(Int, Int)] = {
    var nodeId = 0
    val result = new ArrayBuffer[(Int, Int)]()

    // Report the phrase ending at this node, plus every shorter phrase that
    // is a suffix of it, by following the previousLeaf links
    def addResultIfNeed(nodeId: Int, index: Int): Unit = {
      var currentId = nodeId

      while (currentId >= 0) {
        if (isLeaf(currentId))
          result.append((index - length(currentId) + 1, index))

        currentId = lastLeaf(currentId)
      }
    }

    for ((word, index) <- text.zipWithIndex) {
      val wordId = vocabulary.getOrElse(word, -1)
      if (wordId < 0) {
        nodeId = 0
      } else {
        var found = false

        while (nodeId > 0 && !found) {
          val newId = edges.getOrElse((nodeId, wordId), -1)
          if (newId < 0) {
            nodeId = pi(nodeId)
          }
          else {
            nodeId = newId
            addResultIfNeed(nodeId, index)
            found = true
          }
        }

        if (!found) {
          nodeId = edges.getOrElse((nodeId, wordId), 0)
          addResultIfNeed(nodeId, index)
        }
      }
    }

    result
  }

  def pi(nodeId: Int): Int = nodes(nodeId)._1

  def isLeaf(nodeId: Int): Boolean = nodes(nodeId)._2

  def length(nodeId: Int): Int = nodes(nodeId)._3

  def lastLeaf(nodeId: Int): Int = nodes(nodeId)._4
}


object SearchTrie {
  def apply(phrases: Array[Array[String]]): SearchTrie = {

    // The trie initially contains only the root node
    val vocab = mutable.Map[String, Int]()
    val edges = mutable.Map[(Int, Int), Int]()
    val parents = mutable.ArrayBuffer(0)
    val parentWord = mutable.ArrayBuffer(0)

    val isLeaf = mutable.ArrayBuffer(false)
    val length = mutable.ArrayBuffer(0)

    def addNode(parentNodeId: Int, wordId: Int): Int = {
      parents.append(parentNodeId)
      parentWord.append(wordId)
      length.append(length(parentNodeId) + 1)
      isLeaf.append(false)

      parents.length - 1
    }

    // Add every phrase to the trie as a path from the root
    for (phrase <- phrases) {
      var nodeId = 0

      for (word <- phrase) {
        val wordId = vocab.getOrElseUpdate(word, vocab.size)
        nodeId = edges.getOrElseUpdate((nodeId, wordId), addNode(nodeId, wordId))
      }

      if (nodeId > 0)
        isLeaf(nodeId) = true
    }

    // Calculate the pi (failure) function: follow the parent's failure links
    // until a node with an outgoing edge for this node's word is found
    val pi = mutable.ArrayBuffer[Int](0)
    for (i <- 1 until parents.size) {
      val wordId = parentWord(i)
      var candidate = parents(i)
      pi.append(0)

      while (candidate > 0) {
        candidate = pi(candidate)
        val answer = edges.getOrElse((candidate, wordId), 0)
        if (answer > 0) {
          pi(i) = answer
          candidate = 0
        }
      }
    }

    // For every node, link to the nearest leaf reachable through failure links
    val lastLeaf = ArrayBuffer[Int](-1)
    for (i <- 1 until parents.size) {
      lastLeaf.append(-1)

      val piNode = pi(i)
      if (isLeaf(piNode))
        lastLeaf(i) = piNode
      else
        lastLeaf(i) = lastLeaf(piNode)
    }

    val nodes = pi.zip(isLeaf).zip(length).zip(lastLeaf)
      .map { case (((a, b), c), d) => (a, b, c, d) }.toVector

    SearchTrie(vocab.toMap, edges.toMap, nodes)
  }
}
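For reference, a minimal usage sketch of the new collection. The object name, phrases, and token sequence are illustrative only, not part of this PR; only SearchTrie.apply and search come from the file above.

import com.johnsnowlabs.collections.SearchTrie

object SearchTrieExample extends App {
  // Build the trie from tokenized phrases, the same shape the rewritten
  // entity extractor would produce from an entities file
  val trie = SearchTrie(Array(
    Array("i", "think"),
    Array("feeling", "strange")
  ))

  // search returns a (begin, end) token-index pair for every matched phrase
  val tokens = Seq("today", "i", "think", "i", "am", "feeling", "strange")
  println(trie.search(tokens)) // ArrayBuffer((1,2), (5,6))
}

Note that vocabulary lookup is an exact string match, so input tokens must agree with the phrase tokens in casing for a hit.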