Merge pull request #64 from JohnSnowLabs/rewrite_entity_extractor
Rewrite entity extractor
Showing 9 changed files with 558 additions and 93 deletions.
@@ -0,0 +1,2 @@
I think
Feeling strange
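The two lines above are presumably the entities.txt dictionary that the notebook below points at via setEntitiesPath: one phrase per line. As a minimal sketch, assuming plain whitespace tokenization (the real parsing belongs to the rewritten EntityExtractor elsewhere in this PR), such a file could be turned into the phrase arrays that the SearchTrie added later in this diff consumes:

import scala.io.Source

// Hypothetical helper, not part of this PR: read one phrase per line and
// tokenize on whitespace, yielding input suitable for SearchTrie.apply
def loadPhrases(path: String): Array[Array[String]] = {
  val source = Source.fromFile(path)
  try source.getLines().map(_.trim).filter(_.nonEmpty).map(_.split("\\s+")).toArray
  finally source.close()
}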
@@ -0,0 +1,179 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": [
        "import sys\n",
        "sys.path.append('../../')\n",
        "\n",
        "from pyspark.sql import SparkSession\n",
        "from pyspark.ml import Pipeline\n",
        "\n",
        "from sparknlp.annotator import *\n",
        "from sparknlp.common import *\n",
        "from sparknlp.base import *"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": [
        "spark = SparkSession.builder \\\n",
        "    .appName(\"ner\")\\\n",
        "    .master(\"local[1]\")\\\n",
        "    .config(\"spark.driver.memory\", \"4G\")\\\n",
        "    .config(\"spark.driver.maxResultSize\", \"2G\")\\\n",
        "    .config(\"spark.jars\", \"lib/sparknlp.jar\")\\\n",
        "    .config(\"spark.kryoserializer.buffer.max\", \"500m\")\\\n",
        "    .getOrCreate()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": [
        "import time\n",
        "\n",
        "documentAssembler = DocumentAssembler()\\\n",
        "    .setInputCol(\"text\")\\\n",
        "    .setOutputCol(\"document\")\n",
        "\n",
        "sentenceDetector = SentenceDetectorModel()\\\n",
        "    .setInputCols([\"document\"])\\\n",
        "    .setOutputCol(\"sentence\")\n",
        "\n",
        "tokenizer = RegexTokenizer()\\\n",
        "    .setInputCols([\"document\"])\\\n",
        "    .setOutputCol(\"token\")\n",
        "\n",
        "extractor = EntityExtractor()\\\n",
        "    .setEntitiesPath(\"entities.txt\")\\\n",
        "    .setInputCols([\"token\", \"sentence\"])\\\n",
        "    .setOutputCol(\"entities\")\n",
        "\n",
        "finisher = Finisher() \\\n",
        "    .setInputCols([\"entities\"]) \\\n",
        "    .setIncludeKeys(True)\n",
        "\n",
        "pipeline = Pipeline(\n",
        "    stages = [\n",
        "        documentAssembler,\n",
        "        sentenceDetector,\n",
        "        tokenizer,\n",
        "        extractor,\n",
        "        finisher\n",
        "    ])\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Load the input data to be annotated\n",
        "data = spark. \\\n",
        "    read. \\\n",
        "    parquet(\"../../../src/test/resources/sentiment.parquet\"). \\\n",
        "    limit(1000)\n",
        "data.cache()\n",
        "data.count()\n",
        "data.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": false
      },
      "outputs": [],
      "source": [
        "print(\"Start fitting\")\n",
        "model = pipeline.fit(data)\n",
        "print(\"Fitting finished\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "extracted = model.transform(data)\n",
        "extracted.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": [
        "pipeline.write().overwrite().save(\"./ner_pipeline\")\n",
        "model.write().overwrite().save(\"./ner_model\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true,
        "scrolled": false
      },
      "outputs": [],
      "source": [
        "from pyspark.ml import PipelineModel, Pipeline\n",
        "\n",
        "samePipeline = Pipeline.read().load(\"./ner_pipeline\")\n",
        "sameModel = PipelineModel.read().load(\"./ner_model\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.5.2"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 1
}
src/main/scala/com/johnsnowlabs/collections/SearchTrie.scala (149 additions, 0 deletions)
@@ -0,0 +1,149 @@
package com.johnsnowlabs.collections

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer


/**
  * Immutable collection used for fast substring search.
  * Implementation of the Aho-Corasick algorithm:
  * https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
  */
case class SearchTrie
(
  vocabulary: Map[String, Int],
  edges: Map[(Int, Int), Int],

  // To speed up traversal, four values are stored per node in the same Vector:
  // pi - prefix (failure) function: the node reached by the longest proper
  //   suffix of this node's path that is also a path from the root
  // isLeaf - whether the node marks the end of a phrase
  // length - length of the path from the root to the node (in words)
  // previousLeaf - link to the nearest leaf whose phrase is a suffix of the
  //   current path from the root (-1 if none)
  nodes: Vector[(Int, Boolean, Int, Int)]
)
{
  /**
    * Searches for phrases in the text
    * @param text text to search in
    * @return sequence of (begin, end) token index pairs
    */
  def search(text: Seq[String]): Seq[(Int, Int)] = {
    var nodeId = 0
    val result = new ArrayBuffer[(Int, Int)]()

    // Report the phrase ending at this node, plus every shorter phrase that
    // is a suffix of it, by following the previousLeaf links
    def addResultIfNeed(nodeId: Int, index: Int): Unit = {
      var currentId = nodeId

      while (currentId >= 0) {
        if (isLeaf(currentId))
          result.append((index - length(currentId) + 1, index))

        currentId = lastLeaf(currentId)
      }
    }

    for ((word, index) <- text.zipWithIndex) {
      val wordId = vocabulary.getOrElse(word, -1)
      if (wordId < 0) {
        nodeId = 0
      } else {
        var found = false

        while (nodeId > 0 && !found) {
          val newId = edges.getOrElse((nodeId, wordId), -1)
          if (newId < 0) {
            nodeId = pi(nodeId)
          }
          else {
            nodeId = newId
            addResultIfNeed(nodeId, index)
            found = true
          }
        }

        if (!found) {
          nodeId = edges.getOrElse((nodeId, wordId), 0)
          addResultIfNeed(nodeId, index)
        }
      }
    }

    result
  }

  def pi(nodeId: Int): Int = nodes(nodeId)._1

  def isLeaf(nodeId: Int): Boolean = nodes(nodeId)._2

  def length(nodeId: Int): Int = nodes(nodeId)._3

  def lastLeaf(nodeId: Int): Int = nodes(nodeId)._4
}


object SearchTrie {
  def apply(phrases: Array[Array[String]]): SearchTrie = {

    // The trie initially contains only the root node
    val vocab = mutable.Map[String, Int]()
    val edges = mutable.Map[(Int, Int), Int]()
    val parents = mutable.ArrayBuffer(0)
    val parentWord = mutable.ArrayBuffer(0)

    val isLeaf = mutable.ArrayBuffer(false)
    val length = mutable.ArrayBuffer(0)

    def addNode(parentNodeId: Int, wordId: Int): Int = {
      parents.append(parentNodeId)
      parentWord.append(wordId)
      length.append(length(parentNodeId) + 1)
      isLeaf.append(false)

      parents.length - 1
    }

    // Add every phrase to the trie as a path from the root
    for (phrase <- phrases) {
      var nodeId = 0

      for (word <- phrase) {
        val wordId = vocab.getOrElseUpdate(word, vocab.size)
        nodeId = edges.getOrElseUpdate((nodeId, wordId), addNode(nodeId, wordId))
      }

      if (nodeId > 0)
        isLeaf(nodeId) = true
    }

    // Calculate the pi (failure) function: follow the parent's failure links
    // until a node with an outgoing edge for this node's word is found
    val pi = mutable.ArrayBuffer[Int](0)
    for (i <- 1 until parents.size) {
      val wordId = parentWord(i)
      var candidate = parents(i)
      pi.append(0)

      while (candidate > 0) {
        candidate = pi(candidate)
        val answer = edges.getOrElse((candidate, wordId), 0)
        if (answer > 0) {
          pi(i) = answer
          candidate = 0
        }
      }
    }

    // For every node, link to the nearest leaf reachable through failure links
    val lastLeaf = ArrayBuffer[Int](-1)
    for (i <- 1 until parents.size) {
      lastLeaf.append(-1)

      val piNode = pi(i)
      if (isLeaf(piNode))
        lastLeaf(i) = piNode
      else
        lastLeaf(i) = lastLeaf(piNode)
    }

    val nodes = pi.zip(isLeaf).zip(length).zip(lastLeaf)
      .map { case (((a, b), c), d) => (a, b, c, d) }.toVector

    SearchTrie(vocab.toMap, edges.toMap, nodes)
  }
}
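For reference, a minimal usage sketch of the new collection. The object name, phrases, and token sequence are illustrative only, not part of this PR; only SearchTrie.apply and search come from the file above.

import com.johnsnowlabs.collections.SearchTrie

object SearchTrieExample extends App {
  // Build the trie from tokenized phrases, the same shape the rewritten
  // entity extractor would produce from an entities file
  val trie = SearchTrie(Array(
    Array("i", "think"),
    Array("feeling", "strange")
  ))

  // search returns a (begin, end) token-index pair for every matched phrase
  val tokens = Seq("today", "i", "think", "i", "am", "feeling", "strange")
  println(trie.search(tokens)) // ArrayBuffer((1,2), (5,6))
}

Note that vocabulary lookup is an exact string match, so input tokens must agree with the phrase tokens in casing for a hit.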