
Rewrite entity extractor #64

Merged: 4 commits, Dec 18, 2017
2 changes: 2 additions & 0 deletions python/example/entities-extractor/entities.txt
@@ -0,0 +1,2 @@
I think
Feeling strange
179 changes: 179 additions & 0 deletions python/example/entities-extractor/extractor.ipynb
@@ -0,0 +1,179 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('../../')\n",
"\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.ml import Pipeline\n",
"\n",
"from sparknlp.annotator import *\n",
"from sparknlp.common import *\n",
"from sparknlp.base import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"spark = SparkSession.builder \\\n",
" .appName(\"ner\")\\\n",
" .master(\"local[1]\")\\\n",
" .config(\"spark.driver.memory\",\"4G\")\\\n",
" .config(\"spark.driver.maxResultSize\", \"2G\")\\\n",
" .config(\"spark.jar\", \"lib/sparknlp.jar\")\\\n",
" .config(\"spark.kryoserializer.buffer.max\", \"500m\")\\\n",
" .getOrCreate()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import time\n",
"\n",
"documentAssembler = DocumentAssembler()\\\n",
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
"sentenceDetector = SentenceDetectorModel()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
"tokenizer = RegexTokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
"extractor = EntityExtractor()\\\n",
" .setEntitiesPath(\"entities.txt\")\\\n",
" .setInputCols([\"token\", \"sentence\"])\\\n",
" .setOutputCol(\"entites\")\n",
"\n",
"finisher = Finisher() \\\n",
" .setInputCols([\"entites\"]) \\\n",
" .setIncludeKeys(True)\n",
"\n",
"pipeline = Pipeline(\n",
" stages = [\n",
" documentAssembler,\n",
" sentenceDetector,\n",
" tokenizer,\n",
" extractor,\n",
" finisher\n",
" ])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Load the input data to be annotated\n",
"data = spark. \\\n",
" read. \\\n",
" parquet(\"../../../src/test/resources/sentiment.parquet\"). \\\n",
" limit(1000)\n",
"data.cache()\n",
"data.count()\n",
"data.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"print(\"Start fitting\")\n",
"model = pipeline.fit(data)\n",
"print(\"Fitting is ended\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"extracted = model.transform(data)\n",
"extracted.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pipeline.write().overwrite().save(\"./ner_pipeline\")\n",
"model.write().overwrite().save(\"./ner_model\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"from pyspark.ml import PipelineModel, Pipeline\n",
"\n",
"Pipeline.read().load(\"./ner_pipeline\")\n",
"sameModel = PipelineModel.read().load(\"./ner_model\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
31 changes: 13 additions & 18 deletions python/sparknlp/annotator.py
@@ -167,32 +167,27 @@ def setDateFormat(self, value):


 class EntityExtractor(AnnotatorTransformer):
-    maxLen = Param(Params._dummy(),
-                   "maxLen",
-                   "max amounts of words in a phrase",
-                   typeConverter=TypeConverters.toInt)
-    requireSentences = Param(Params._dummy(),
-                             "requireSentences",
-                             "whether to require sbd in pipeline or not. Might improve performance on accuracy hit",
-                             typeConverter=TypeConverters.toBoolean)
-    entities = Param(Params._dummy(),
-                     "entities",
-                     "file path overrides config",
-                     typeConverter=TypeConverters.toString)
+    entitiesPath = Param(Params._dummy(),
+                         "entitiesPath",
+                         "Path to entities (phrases) to extract",
+                         typeConverter=TypeConverters.toString)
+    insideSentences = Param(Params._dummy(),
+                            "insideSentences",
+                            "Should extractor search only within sentences borders?",
+                            typeConverter=TypeConverters.toBoolean)

     @keyword_only
     def __init__(self):
         super(EntityExtractor, self).__init__()
         self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.EntityExtractor", self.uid)

-    def setMaxLen(self, value):
-        return self._set(maxLen=value)
-
-    def setRequireSentences(self, value):
-        return self._set(requireSentences=value)
+    def setInsideSentences(self, value):
+        return self._set(insideSentences=value)

-    def setEntities(self, value):
-        return self._set(entities=value)
+    def setEntitiesPath(self, value):
+        return self._set(entitiesPath=value)


class PerceptronApproach(JavaEstimator, JavaMLWritable, JavaMLReadable, AnnotatorProperties):
6 changes: 4 additions & 2 deletions python/test/annotators.py
@@ -101,11 +101,13 @@ def runTest(self):
         document_assembler = DocumentAssembler() \
             .setInputCol("text") \
             .setOutputCol("document")
+        tokenizer = RegexTokenizer() \
+            .setOutputCol("token")
         entity_extractor = EntityExtractor() \
-            .setMaxLen(4) \
             .setOutputCol("entity")
         assembled = document_assembler.transform(self.data)
-        entity_extractor.transform(assembled).show()
+        tokenized = tokenizer.transform(assembled)
+        entity_extractor.transform(tokenized).show()


class PerceptronApproachTestSpec(unittest.TestCase):
149 changes: 149 additions & 0 deletions src/main/scala/com/johnsnowlabs/collections/SearchTrie.scala
@@ -0,0 +1,149 @@
package com.johnsnowlabs.collections
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer


/**
 * Immutable collection used for fast phrase search over token sequences.
 * Implementation of the Aho-Corasick algorithm: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
 */
case class SearchTrie
(
vocabulary: Map[String, Int],
edges: Map[(Int, Int), Int],

// In order to optimize, 4 values per node are stored in the same Vector:
// Pi - prefix (failure) function
// Is Leaf - whether the node is a leaf
// Length - length of the path from the root to this node (in words)
// Previous Leaf - link to the leaf whose phrase is a suffix of the current path from the root
nodes: Vector[(Int, Boolean, Int, Int)]
)
{
/**
* Searches for phrases in the text
* @param text text to search in
* @return sequence of (begin, end) token index pairs (inclusive) for every matched phrase
*/
def search(text: Seq[String]): Seq[(Int, Int)] = {
var nodeId = 0
val result = new ArrayBuffer[(Int, Int)]()

// Emit a match for the current node and for every shorter phrase ending at the same position (via lastLeaf links)
def addResultIfNeed(nodeId: Int, index: Int): Unit = {
var currentId = nodeId

while (currentId >= 0) {
if (isLeaf(currentId))
result.append((index - length(currentId) + 1, index))

currentId = lastLeaf(currentId)
}
}

// Walk the text token by token, keeping track of the current trie node
for ((word, index) <- text.zipWithIndex) {
val wordId = vocabulary.getOrElse(word, -1)
if (wordId < 0) {
nodeId = 0
} else {
var found = false

// Follow pi (failure) links until a node with an outgoing edge for this word is found
while (nodeId > 0 && !found) {
val newId = edges.getOrElse((nodeId, wordId), -1)
if (newId < 0) {
nodeId = pi(nodeId)
}
else {
nodeId = newId
addResultIfNeed(nodeId, index)
found = true
}
}

if (!found) {
nodeId = edges.getOrElse((nodeId, wordId), 0)
addResultIfNeed(nodeId, index)
}
}
}

result
}

def pi(nodeId: Int): Int = nodes(nodeId)._1

def isLeaf(nodeId: Int): Boolean = nodes(nodeId)._2

def length(nodeId: Int): Int = nodes(nodeId)._3

def lastLeaf(nodeId: Int): Int = nodes(nodeId)._4
}


object SearchTrie {
def apply(phrases: Array[Array[String]]): SearchTrie = {

// Start with only the root node (node 0)
val vocab = mutable.Map[String, Int]()
val edges = mutable.Map[(Int, Int), Int]()
val parents = mutable.ArrayBuffer(0)
val parentWord = mutable.ArrayBuffer(0)

val isLeaf = mutable.ArrayBuffer(false)
val length = mutable.ArrayBuffer(0)

def addNode(parentNodeId: Int, wordId: Int): Int = {
parents.append(parentNodeId)
parentWord.append(wordId)
length.append(length(parentNodeId) + 1)
isLeaf.append(false)

parents.length - 1
}

// Add every phrase as a path from the root of the trie
for (phrase <- phrases) {
var nodeId = 0

for (word <- phrase) {
val wordId = vocab.getOrElseUpdate(word, vocab.size)
nodeId = edges.getOrElseUpdate((nodeId, wordId), addNode(nodeId, wordId))
}

if (nodeId > 0)
isLeaf(nodeId) = true
}

// Calculate the pi (failure) function: for each node, the longest proper suffix of its path that is also a path from the root
val pi = mutable.ArrayBuffer[Int](0)
for (i <- 1 until parents.size) {
val wordId = parentWord(i)
var candidate = parents(i)
pi.append(0)

while (candidate > 0) {
candidate = pi(candidate)
val answer = edges.getOrElse((candidate, wordId), 0)
if (answer > 0) {
pi(i) = answer
candidate = 0
}
}
}

// For every node, remember the nearest leaf reachable through pi links (a shorter phrase ending at the same position)
val lastLeaf = ArrayBuffer[Int](-1)
for (i <- 1 until parents.size) {
lastLeaf.append(-1)

val piNode = pi(i)
if (isLeaf(piNode))
lastLeaf(i) = piNode
else
lastLeaf(i) = lastLeaf(piNode)
}

val nodes = pi.zip(isLeaf).zip(length).zip(lastLeaf)
.map{case (((a,b),c),d) => (a,b,c,d)}.toVector

SearchTrie(vocab.toMap, edges.toMap, nodes)
}
}
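For context (not part of this PR), here is a minimal, hypothetical sketch of how SearchTrie can be exercised directly; the phrases mirror entities.txt, and the object name SearchTrieExample is made up for illustration.

import com.johnsnowlabs.collections.SearchTrie

object SearchTrieExample extends App {
  // Build the trie from multi-word phrases (here, the two entries of entities.txt)
  val trie = SearchTrie(Array(Array("I", "think"), Array("Feeling", "strange")))

  // Search a tokenized text; each match is a (begin, end) pair of token indices, inclusive
  val tokens = Seq("I", "think", "this", "sentence", "is", "Feeling", "strange")
  val matches = trie.search(tokens)

  // Expected matches: (0, 1) for "I think" and (5, 6) for "Feeling strange"
  matches.foreach(println)
}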