Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
anju-jsl committed Nov 17, 2017
2 parents b81e95c + 9cb02ff commit d3b9086
Show file tree
Hide file tree
Showing 9 changed files with 788 additions and 325 deletions.
468 changes: 169 additions & 299 deletions python/example/crf-ner/ner.ipynb

Large diffs are not rendered by default.

535 changes: 535 additions & 0 deletions python/example/crf-ner/ner_benchmark.ipynb

Large diffs are not rendered by default.

18 changes: 13 additions & 5 deletions python/example/vivekn-sentiment/sentiment.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Load the input data to be annotated\n",
Expand Down Expand Up @@ -158,7 +160,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pipeline = Pipeline(stages=[\n",
Expand All @@ -178,7 +182,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for r in sentiment_data.take(5):\n",
Expand Down Expand Up @@ -211,7 +217,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"Pipeline.read().load(\"./ps\")\n",
Expand All @@ -231,7 +239,7 @@
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
Expand Down
3 changes: 2 additions & 1 deletion src/main/resources/log4j.properties
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender
log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout
log4j.appender.STDOUT.layout.ConversionPattern=[%5p] %m%n

log4j.logger.AnnotatorLogger=WARNING
log4j.logger.AnnotatorLogger=WARNING
log4j.logger.CRF=INFO
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package com.johnsnowlabs.nlp.annotators.ner.crf

import com.johnsnowlabs.nlp.util.io.ResourceHelper

import scala.io.Source

case class DictionaryFeatures(dict: Map[String, String])
Expand Down Expand Up @@ -31,8 +33,8 @@ object DictionaryFeatures {
}

private def read(path: String): Iterator[(String, String)] = {
Source.fromFile(path).getLines().map{
line =>
ResourceHelper.SourceStream(path)
.content.getLines().map{line =>
val items = line.split(":")
require(items.size == 2)
(items(0), items(1))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class NerCrfApproach(override val uid: String) extends AnnotatorApproach[NerCrfM
.setOutputCol("token")

val posTagger = new PerceptronApproach()
.setCorpusPath("/anc-pos-corpus/")
.setCorpusPath("anc-pos-corpus/")
.setNIterations(10)
.setInputCols("token", "document")
.setOutputCol("pos")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package com.johnsnowlabs.nlp.annotators.pos.perceptron

import java.io.File
import java.nio.file.Paths

import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.nlp.annotators.common.{TaggedSentence, TaggedWord}
import com.johnsnowlabs.nlp.util.io.ResourceHelper
import com.johnsnowlabs.nlp.util.io.ResourceHelper.{SourceStream, pathIsDirectory}
import com.typesafe.config.{Config, ConfigFactory}
import org.apache.spark.ml.param.{IntParam, Param}
Expand Down Expand Up @@ -218,15 +220,12 @@ object PerceptronApproach extends DefaultParamsReadable[PerceptronApproach] {
.flatMap(fileName => parsePOSCorpusFromSource(fileName.toString, tagSeparator))
} catch {
case _: NullPointerException =>
val sourceStream = SourceStream(dirName)
val res = sourceStream
.content
.getLines()
ResourceHelper.listDirectory(dirName)
.take(fileLimit)
.flatMap(fileName => parsePOSCorpusFromSource(dirName + "/" + fileName, tagSeparator))
.flatMap{fileName =>
val path = Paths.get(dirName, fileName)
parsePOSCorpusFromSource(path.toString, tagSeparator)}
.toArray
sourceStream.close()
res
}
}

Expand All @@ -246,7 +245,7 @@ object PerceptronApproach extends DefaultParamsReadable[PerceptronApproach] {
if (pathIsDirectory(dirOrFilePath)) parsePOSCorpusFromDir(dirOrFilePath, posSeparator, fileLimit)
else parsePOSCorpusFromSource(dirOrFilePath, posSeparator)
}
if (result.isEmpty) throw new Exception("Empty corpus for POS")
if (result.isEmpty) throw new Exception(s"Empty corpus for POS in $posDirOrFilePath")
result
}

Expand Down
59 changes: 53 additions & 6 deletions src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ import org.apache.spark.sql.SparkSession
import scala.collection.mutable.{ArrayBuffer, Map => MMap}
import scala.io.Source

import java.net.URLDecoder
import java.util.jar.JarFile


/**
* Created by saif on 28/04/17.
Expand All @@ -24,15 +27,59 @@ object ResourceHelper {

private val spark: SparkSession = SparkSession.builder().getOrCreate()


/** Lists the immediate child entries of a resource directory, whether it lives
  * on the local filesystem or inside a JAR on the classpath.
  *
  * Resolution order: class-relative resource, then classloader resource.
  *
  * @param path resource path of the directory to list
  * @return sorted, de-duplicated names of the directory's direct children
  * @throws FileNotFoundException if the path is neither on the classpath nor on disk
  * @throws UnsupportedOperationException for URL protocols other than "file" and "jar"
  */
def listDirectory(path: String): Seq[String] = {
  // Option(...) maps a null resource lookup to None instead of mutating a var.
  val dirURL = Option(getClass.getResource(path))
    .orElse(Option(getClass.getClassLoader.getResource(path)))
    .getOrElse(throw new FileNotFoundException(path))

  dirURL.getProtocol match {
    case "file" =>
      /* A plain file path: easy enough */
      new File(dirURL.toURI).list().sorted

    case "jar" =>
      /* A JAR path: walk the archive's entry table */
      // Strip the leading "file:" (5 chars) and the "!/entry" suffix to get the JAR file path.
      val jarPath = dirURL.getPath.substring(5, dirURL.getPath.indexOf("!"))
      val jar = new JarFile(URLDecoder.decode(jarPath, "UTF-8"))
      try {
        val pathToCheck = path.replaceFirst("/", "")
        val result = new ArrayBuffer[String]()
        val entries = jar.entries()
        while (entries.hasMoreElements) {
          val name = entries.nextElement().getName.replaceFirst("/", "")
          if (name.startsWith(pathToCheck)) { // filter entries under the requested directory
            var entry = name.substring(pathToCheck.length())
            val checkSubdir = entry.indexOf("/")
            if (checkSubdir >= 0) {
              // For entries nested deeper, report only the first-level directory name.
              entry = entry.substring(0, checkSubdir)
            }
            if (entry.nonEmpty)
              result.append(entry)
          }
        }
        result.distinct.sorted
      } finally {
        // The original leaked the open JarFile handle; always release it.
        jar.close()
      }

    case _ =>
      throw new UnsupportedOperationException(s"Cannot list files for URL $dirURL")
  }
}

/** Structure for a SourceStream coming from compiled content */
case class SourceStream(resource: String) {
val pipe: Option[InputStream] = try {
getClass.getResourceAsStream(resource).close()
Some(getClass.getResourceAsStream(resource))
} catch {
case _: NullPointerException => None
val pipe: Option[InputStream] = {
var stream = getClass.getResourceAsStream(resource)
if (stream == null)
stream = getClass.getClassLoader.getResourceAsStream(resource)
Option(stream)
}
val content: Source = pipe.map(p => Source.fromInputStream(p)("UTF-8")).getOrElse(Source.fromFile(resource, "UTF-8"))
val content: Source = pipe.map(p => {
Source.fromInputStream(p)("UTF-8")
}).getOrElse(Source.fromFile(resource, "UTF-8"))
def close(): Unit = {
content.close()
pipe.foreach(_.close())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ object CoNLL2003PipelineTest extends App {
.setOutputCol("token")

val posTagger = new PerceptronApproach()
.setCorpusPath("/anc-pos-corpus/")
.setCorpusPath("anc-pos-corpus/")
.setNIterations(10)
.setInputCols("token", "document")
.setOutputCol("pos")
Expand All @@ -55,9 +55,10 @@ object CoNLL2003PipelineTest extends App {
val nerTagger = new NerCrfApproach()
.setInputCols("sentence", "token", "pos")
.setLabelColumn("label")
.setC0(1250000)
.setDatsetPath("eng.train")
.setC0(2250000)
.setRandomSeed(100)
.setMaxEpochs(10)
.setMaxEpochs(20)
.setOutputCol("ner")

getPosStages() :+ nerTagger
Expand Down

0 comments on commit d3b9086

Please sign in to comment.