Merge pull request #64 from JohnSnowLabs/rewrite_entity_extractor
Rewrite entity extractor
saif-ellafi authored Dec 18, 2017
2 parents 91d8acb + 11577c9 commit 5e4dc62
Showing 9 changed files with 558 additions and 93 deletions.
2 changes: 2 additions & 0 deletions python/example/entities-extractor/entities.txt
@@ -0,0 +1,2 @@
I think
Feeling strange
179 changes: 179 additions & 0 deletions python/example/entities-extractor/extractor.ipynb
@@ -0,0 +1,179 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('../../')\n",
"\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.ml import Pipeline\n",
"\n",
"from sparknlp.annotator import *\n",
"from sparknlp.common import *\n",
"from sparknlp.base import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"spark = SparkSession.builder \\\n",
" .appName(\"ner\")\\\n",
" .master(\"local[1]\")\\\n",
" .config(\"spark.driver.memory\",\"4G\")\\\n",
" .config(\"spark.driver.maxResultSize\", \"2G\")\\\n",
" .config(\"spark.jar\", \"lib/sparknlp.jar\")\\\n",
" .config(\"spark.kryoserializer.buffer.max\", \"500m\")\\\n",
" .getOrCreate()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import time\n",
"\n",
"documentAssembler = DocumentAssembler()\\\n",
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
"sentenceDetector = SentenceDetectorModel()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
"tokenizer = RegexTokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
"extractor = EntityExtractor()\\\n",
" .setEntitiesPath(\"entities.txt\")\\\n",
" .setInputCols([\"token\", \"sentence\"])\\\n",
" .setOutputCol(\"entites\")\n",
"\n",
"finisher = Finisher() \\\n",
" .setInputCols([\"entites\"]) \\\n",
" .setIncludeKeys(True)\n",
"\n",
"pipeline = Pipeline(\n",
" stages = [\n",
" documentAssembler,\n",
" sentenceDetector,\n",
" tokenizer,\n",
" extractor,\n",
" finisher\n",
" ])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Load the input data to be annotated\n",
"data = spark. \\\n",
" read. \\\n",
" parquet(\"../../../src/test/resources/sentiment.parquet\"). \\\n",
" limit(1000)\n",
"data.cache()\n",
"data.count()\n",
"data.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"print(\"Start fitting\")\n",
"model = pipeline.fit(data)\n",
"print(\"Fitting is ended\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"extracted = model.transform(data)\n",
"extracted.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pipeline.write().overwrite().save(\"./ner_pipeline\")\n",
"model.write().overwrite().save(\"./ner_model\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"from pyspark.ml import PipelineModel, Pipeline\n",
"\n",
"Pipeline.read().load(\"./ner_pipeline\")\n",
"sameModel = PipelineModel.read().load(\"./ner_model\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
31 changes: 13 additions & 18 deletions python/sparknlp/annotator.py
@@ -167,32 +167,27 @@ def setDateFormat(self, value):


class EntityExtractor(AnnotatorTransformer):
maxLen = Param(Params._dummy(),
"maxLen",
"max amounts of words in a phrase",
typeConverter=TypeConverters.toInt)
requireSentences = Param(Params._dummy(),
"requireSentences",
"whether to require sbd in pipeline or not. Might improve performance on accuracy hit",

entitiesPath = Param(Params._dummy(),
"entitiesPath",
"Path to entities (phrases) to extract",
typeConverter=TypeConverters.toString)

insideSentences = Param(Params._dummy(),
"insideSentences",
"Should extractor search only within sentences borders?",
typeConverter=TypeConverters.toBoolean)
entities = Param(Params._dummy(),
"entities",
"file path overrides config",
typeConverter=TypeConverters.toString)

@keyword_only
def __init__(self):
super(EntityExtractor, self).__init__()
self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.EntityExtractor", self.uid)

def setMaxLen(self, value):
return self._set(maxLen=value)

def setRequireSentences(self, value):
return self._set(requireSentences=value)
def setInsideSentences(self, value):
return self._set(insideSentences=value)

def setEntities(self, value):
return self._set(entities=value)
def setEntitiesPath(self, value):
return self._set(entitiesPath=value)


class PerceptronApproach(JavaEstimator, JavaMLWritable, JavaMLReadable, AnnotatorProperties):
6 changes: 4 additions & 2 deletions python/test/annotators.py
@@ -101,11 +101,13 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
tokenizer = RegexTokenizer() \
.setOutputCol("token")
entity_extractor = EntityExtractor() \
.setMaxLen(4) \
.setOutputCol("entity")
assembled = document_assembler.transform(self.data)
entity_extractor.transform(assembled).show()
tokenized = tokenizer.transform(assembled)
entity_extractor.transform(tokenized).show()


class PerceptronApproachTestSpec(unittest.TestCase):
149 changes: 149 additions & 0 deletions src/main/scala/com/johnsnowlabs/collections/SearchTrie.scala
@@ -0,0 +1,149 @@
package com.johnsnowlabs.collections
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer


/**
* Immutable collection used for fast substring search
* Implementation of Aho-Corasick algorithm https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
*/
case class SearchTrie
(
vocabulary: Map[String, Int],
edges: Map[(Int, Int), Int],

// For efficiency, the 4 per-node values are stored together in a single Vector:
// Pi - prefix (failure) function
// Is Leaf - whether the node is a leaf
// Length - length of the path from the root to this node (in words)
// Previous Leaf - link to the nearest leaf that is a suffix of the current path from the root
nodes: Vector[(Int, Boolean, Int, Int)]
)
{
/**
* Searches for phrases in the text
* @param text tokens to search in
* @return pairs of (begin, end) token indices, inclusive
*/
def search(text: Seq[String]): Seq[(Int, Int)] = {
var nodeId = 0
val result = new ArrayBuffer[(Int, Int)]()

def addResultIfNeed(nodeId: Int, index: Int): Unit = {
var currentId = nodeId

while (currentId >= 0) {
if (isLeaf(currentId))
result.append((index - length(currentId) + 1, index))

currentId = lastLeaf(currentId)
}
}

for ((word, index) <- text.zipWithIndex) {
val wordId = vocabulary.getOrElse(word, -1)
if (wordId < 0) {
nodeId = 0
} else {
var found = false

while (nodeId > 0 && !found) {
val newId = edges.getOrElse((nodeId, wordId), -1)
if (newId < 0) {
nodeId = pi(nodeId)
}
else {
nodeId = newId
addResultIfNeed(nodeId, index)
found = true
}
}

if (!found) {
nodeId = edges.getOrElse((nodeId, wordId), 0)
addResultIfNeed(nodeId, index)
}
}
}

result
}

def pi(nodeId: Int): Int = nodes(nodeId)._1

def isLeaf(nodeId: Int): Boolean = nodes(nodeId)._2

def length(nodeId: Int): Int = nodes(nodeId)._3

def lastLeaf(nodeId: Int): Int = nodes(nodeId)._4
}


object SearchTrie {
def apply(phrases: Array[Array[String]]): SearchTrie = {

// Only the root node exists at the beginning
val vocab = mutable.Map[String, Int]()
val edges = mutable.Map[(Int, Int), Int]()
val parents = mutable.ArrayBuffer(0)
val parentWord = mutable.ArrayBuffer(0)

val isLeaf = mutable.ArrayBuffer(false)
val length = mutable.ArrayBuffer(0)

def addNode(parentNodeId: Int, wordId: Int): Int = {
parents.append(parentNodeId)
parentWord.append(wordId)
length.append(length(parentNodeId) + 1)
isLeaf.append(false)

parents.length - 1
}

// Add every phrase as a path from the root of the tree
for (phrase <- phrases) {
var nodeId = 0

for (word <- phrase) {
val wordId = vocab.getOrElseUpdate(word, vocab.size)
nodeId = edges.getOrElseUpdate((nodeId, wordId), addNode(nodeId, wordId))
}

if (nodeId > 0)
isLeaf(nodeId) = true
}

// Calculate pi function
val pi = mutable.ArrayBuffer[Int](0)
for (i <- 1 until parents.size) {
val wordId = parentWord(i)
var candidate = parents(i)
pi.append(0)

while (candidate > 0) {
candidate = pi(candidate)
val answer = edges.getOrElse((candidate, wordId), 0)
if (answer > 0) {
pi(i) = answer
candidate = 0
}
}
}

val lastLeaf = ArrayBuffer[Int](-1)
for (i <- 1 until parents.size) {
lastLeaf.append(-1)

val piNode = pi(i)
if (isLeaf(piNode))
lastLeaf(i) = piNode
else
lastLeaf(i) = lastLeaf(piNode)
}

val nodes = pi.zip(isLeaf).zip(length).zip(lastLeaf)
.map{case (((a,b),c),d) => (a,b,c,d)}.toVector

SearchTrie(vocab.toMap, edges.toMap, nodes)
}
}
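A quick usage sketch (illustrative, not part of the diff): SearchTrie.apply builds the trie from phrases that are already split into tokens, and search returns the inclusive (begin, end) token indices of every match. The two phrases below mirror the entities.txt example added in this pull request; the searched tokens are made up.

import com.johnsnowlabs.collections.SearchTrie

// Build the trie from tokenized phrases
val trie = SearchTrie(Array(
  Array("I", "think"),
  Array("Feeling", "strange")
))

// Search a tokenized text; matching is exact per token,
// so casing and tokenization must agree with the phrases
val tokens = Seq("Sometimes", "I", "think", "it", "is", "Feeling", "strange")
val matches = trie.search(tokens)
// matches contains (1, 2) and (5, 6): the inclusive token spans of the two phrases

The rewritten EntityExtractor presumably builds such a trie from the phrases file configured via entitiesPath and converts the returned spans into annotations.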