diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala
index f592d16ca8f442..87934db7686034 100644
--- a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala
@@ -46,7 +46,7 @@ private[johnsnowlabs] class Bart(
     with Generate {
 
   val bpeTokenizer: BartTokenizer = BpeTokenizer
-    .forModel("bart", merges = merges, vocab = vocabulary, padWithSentenceTokens = false)
+    .forModel("bart", merges = merges, vocab = vocabulary, padWithSequenceTokens = false)
     .asInstanceOf[BartTokenizer]
 
   private val _tfBartSignatures: Map[String, String] =
     signatures.getOrElse(ModelSignatureManager.apply())
diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala
index 3e80bedef517b1..1ce81583db7e55 100644
--- a/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala
@@ -18,10 +18,11 @@ package com.johnsnowlabs.ml.ai
 
 import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager}
 import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper}
+import com.johnsnowlabs.ml.util.TensorFlow
 import com.johnsnowlabs.nlp.annotators.common._
 import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer
 import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder}
-import com.johnsnowlabs.nlp.{ActivationFunction, Annotation}
+import com.johnsnowlabs.nlp.{ActivationFunction, Annotation, AnnotatorType}
 import org.tensorflow.ndarray.buffer.IntDataBuffer
 
 import scala.collection.JavaConverters._
@@ -63,7 +64,8 @@ private[johnsnowlabs] class RoBertaClassification(
       maxSeqLength: Int,
       caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = {
 
-    val bpeTokenizer = BpeTokenizer.forModel("roberta", merges, vocabulary)
+    val bpeTokenizer =
+      BpeTokenizer.forModel("roberta", merges, vocabulary, alwaysAddPrefix = false)
 
     sentences.map { tokenIndex =>
       // filter empty and only whitespace tokens
@@ -106,7 +108,8 @@ private[johnsnowlabs] class RoBertaClassification(
       caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = {
     // we need the original form of the token
     // let's lowercase if needed right before the encoding
-    val bpeTokenizer = BpeTokenizer.forModel("roberta", merges, vocabulary)
+    val bpeTokenizer =
+      BpeTokenizer.forModel("roberta", merges, vocabulary, alwaysAddPrefix = false)
     val sentences = docs.map { s => Sentence(s.result, s.begin, s.end, 0) }
 
     sentences.map { sentence =>
@@ -115,12 +118,10 @@ private[johnsnowlabs] class RoBertaClassification(
       val sentenceEnd = sentence.end
       val sentenceIndex = sentence.index
 
-      // TODO: we should implement dedicated the tokenize and tokenizeSubText methods for full a sentence rather than token by token
       val indexedTokens =
         bpeTokenizer.tokenize(Sentence(content, sentenceBegin, sentenceEnd, sentenceIndex))
 
-      val wordpieceTokens =
-        indexedTokens.flatMap(token => bpeTokenizer.encode(token)).take(maxSeqLength)
+      val wordpieceTokens = bpeTokenizer.encode(indexedTokens).take(maxSeqLength)
 
       WordpieceTokenizedSentence(wordpieceTokens)
     }
@@ -372,12 +373,10 @@ private[johnsnowlabs] class RoBertaClassification(
     tensors.clearTensors()
 
     val endDim = endLogits.length / batchLength
-    val endScores: Array[Array[Float]] =
-      endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray
+    val endScores: Array[Array[Float]] = endLogits.grouped(endDim).toArray
 
     val startDim = startLogits.length / batchLength
-    val startScores: Array[Array[Float]] =
-      startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray
+    val startScores: Array[Array[Float]] = startLogits.grouped(startDim).toArray
 
     (startScores, endScores)
   }
@@ -389,4 +388,134 @@ private[johnsnowlabs] class RoBertaClassification(
       tokenizedSentences(sentence._2).indexedTokens.find(p => p.begin == tokenPiece.begin)
   }
 
+  /** Encodes two sequences to be compatible with the RoBerta models.
+    *
+    * Unlike other models, RoBerta requires two eos tokens to join two sequences.
+    *
+    * For example, the pair of sequences A, B should be joined to: `<s> A </s></s> B </s>`
+    */
+  override def encodeSequence(
+      seq1: Seq[WordpieceTokenizedSentence],
+      seq2: Seq[WordpieceTokenizedSentence],
+      maxSequenceLength: Int): Seq[Array[Int]] = {
+
+    val question = seq1
+      .flatMap { wpTokSentence =>
+        wpTokSentence.tokens.map(t => t.pieceId)
+      }
+      .toArray
+      .take(maxSequenceLength - 2) ++ Array(sentenceEndTokenId, sentenceEndTokenId)
+
+    val context = seq2
+      .flatMap { wpTokSentence =>
+        wpTokSentence.tokens.map(t => t.pieceId)
+      }
+      .toArray
+      .take(maxSequenceLength - question.length - 2) ++ Array(sentenceEndTokenId)
+
+    Seq(Array(sentenceStartTokenId) ++ question ++ context)
+  }
+
+  /** Calculates the normalized softmax probabilities.
+    *
+    * @param scores
+    *   Raw logits
+    * @return
+    *   Normalized softmax probabilities
+    */
+  private def normalizedSoftmax(scores: Array[Float]): Array[Float] = {
+    val max = scores.max
+    calculateSoftmax(scores.map(_ - max))
+  }
+
+  override def predictSpan(
+      documents: Seq[Annotation],
+      maxSentenceLength: Int,
+      caseSensitive: Boolean,
+      mergeTokenStrategy: String = MergeTokenStrategy.vocab,
+      engine: String = TensorFlow.name): Seq[Annotation] = {
+
+    val questionAnnot = Seq(documents.head)
+    val contextAnnot = documents.drop(1)
+
+    val wordPieceTokenizedQuestion =
+      tokenizeDocument(questionAnnot, maxSentenceLength, caseSensitive)
+    val wordPieceTokenizedContext =
+      tokenizeDocument(contextAnnot, maxSentenceLength, caseSensitive)
+    val questionLength = wordPieceTokenizedQuestion.head.tokens.length
+
+    val encodedInput =
+      encodeSequence(wordPieceTokenizedQuestion, wordPieceTokenizedContext, maxSentenceLength)
+    val (startLogits, endLogits) = tagSpan(encodedInput)
+
+    /** Sets log-logits to (almost) 0 for question and padding tokens so they can't contribute to
+      * the final softmax score.
+      *
+      * @param scores
+      *   Logits of the combined sequences
+      * @return
+      *   Scores, with unwanted tokens set to log-probability 0
+      */
+    def maskUndesiredTokens(scores: Array[Float]): Array[Float] = {
+      scores.zipWithIndex.map { case (score, i) =>
+        // 3 added special tokens in encoded sequence (1 bos, 2 eos)
+        if ((i > 0 && i < questionLength + 3) || i == encodedInput.head.length - 1)
+          -10000.0f
+        else score
+      }
+    }
+
+    val processedStartLogits = startLogits.map { scores =>
+      normalizedSoftmax(maskUndesiredTokens(scores))
+    }
+    val processedEndLogits = endLogits.map { scores =>
+      normalizedSoftmax(maskUndesiredTokens(scores))
+    }
+
+    val startScores = processedStartLogits.transpose.map(_.sum / startLogits.length)
+    val endScores = processedEndLogits.transpose.map(_.sum / endLogits.length)
+
+    // Drop BOS token from valid results
+    val startIndex = startScores.zipWithIndex.drop(1).maxBy(_._1)
+    val endIndex = endScores.zipWithIndex.drop(1).maxBy(_._1)
+
+    val offsetStartIndex = 3 // 3 added special tokens
+    val offsetEndIndex = offsetStartIndex - 1
+
+    val allTokenPieces =
+      wordPieceTokenizedQuestion.head.tokens ++ wordPieceTokenizedContext.flatMap(x => x.tokens)
+    val decodedAnswer =
+      allTokenPieces.slice(startIndex._2 - offsetStartIndex, endIndex._2 - offsetEndIndex)
+    val content =
+      mergeTokenStrategy match {
+        case MergeTokenStrategy.vocab =>
+          decodedAnswer.filter(_.isWordStart).map(x => x.token).mkString(" ")
+        case MergeTokenStrategy.sentencePiece =>
+          val token = ""
+          decodedAnswer
+            .map(x =>
+              if (x.isWordStart) " " + token + x.token
+              else token + x.token)
+            .mkString("")
+            .trim
+      }
+
+    val totalScore = startIndex._1 * endIndex._1
+    Seq(
+      Annotation(
+        annotatorType = AnnotatorType.CHUNK,
+        begin = 0,
+        end = if (content.isEmpty) 0 else content.length - 1,
+        result = content,
+        metadata = Map(
+          "sentence" -> "0",
+          "chunk" -> "0",
+          "start" -> decodedAnswer.head.begin.toString,
+          "start_score" -> startIndex._1.toString,
+          "end" -> decodedAnswer.last.end.toString,
+          "end_score" -> endIndex._1.toString,
+          "score" -> totalScore.toString)))
+
+  }
+
 }
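The two methods above drive the new question-answering path: encodeSequence lays the input out as `<s> question </s></s> context </s>`, and predictSpan masks the question and padding positions before softmaxing the logits. A minimal, self-contained sketch of that layout; the token ids below are invented for illustration and come from no real vocabulary:

  object RobertaQaEncodingSketch extends App {
    val bosId = 0 // <s>
    val eosId = 2 // </s>

    // Mirrors encodeSequence: the question gets two trailing eos tokens, the context one.
    def encode(question: Seq[Int], context: Seq[Int], maxLen: Int): Array[Int] = {
      val q = question.take(maxLen - 2).toArray ++ Array(eosId, eosId)
      val c = context.take(maxLen - q.length - 2).toArray ++ Array(eosId)
      Array(bosId) ++ q ++ c
    }

    // Prints: 0 10 11 2 2 20 21 22 2   (i.e. <s> Q </s></s> C </s>)
    println(encode(Seq(10, 11), Seq(20, 21, 22), maxLen = 128).mkString(" "))
  }

The context allotment shrinks by the (already eos-padded) question length, so the combined sequence stays within maxSequenceLength.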
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala
index 29d76fcd0dea17..332177a51b2d63 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala
@@ -402,8 +402,7 @@ class GPT2Transformer(override val uid: String)
         .forModel(
           "gpt2",
           merges = $$(merges),
-          vocab = $$(vocabulary),
-          padWithSentenceTokens = false)
+          vocab = $$(vocabulary))
         .asInstanceOf[Gpt2Tokenizer]
 
       _tfModel = Some(
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BartTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BartTokenizer.scala
index c5b250578ec81d..801a1f38d54a0d 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BartTokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BartTokenizer.scala
@@ -20,12 +20,12 @@ class BartTokenizer(
     merges: Map[(String, String), Int],
     vocab: Map[String, Int],
     specialTokens: SpecialTokens,
-    padWithSentenceTokens: Boolean = false,
-    addPrefixSpace: Boolean = false)
+    padWithSequenceTokens: Boolean = false,
+    addPrefixSpaceToSentence: Boolean = false)
     extends Gpt2Tokenizer(
       merges,
       vocab,
       specialTokens,
-      padWithSentenceTokens,
+      padWithSequenceTokens,
       prependString = "Ġ",
-      addPrefixSpace)
+      addPrefixSpaceToSentence)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
index ec08ade3ae9c58..5281af3867b4f3 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
@@ -23,24 +23,30 @@ import scala.collection.mutable
 import scala.collection.mutable.ListBuffer
 
 /** A BPE Tokenizer based on GPT2's tokenization scheme. The tokenization can then be used for
-  * models based on this scheme (e.g. GPT2, roBERTa, DeBERTa) TODO: truncation assumed?
+  * models based on this scheme (e.g. GPT2, roBERTa, DeBERTa)
+  *
+  * TODO: truncation assumed?
+  *
   * @param merges
   *   Map of tokens that are mergeable
   * @param vocab
   *   Map of tokens to encoded representation
   * @param specialTokens
   *   Collection of special tokens
-  * @param padWithSentenceTokens
+  * @param padWithSequenceTokens
   *   Whether to pad the sentence with sentence tokens at the start and end
-  * @param addPrefixSpace
+  * @param addPrefixSpaceToSentence
   *   Whether to add a space to the first word of a sentence
+  * @param alwaysAddPrefix
+  *   Whether to always prefix token ids with `prefixForPieceId`
   */
 private[nlp] abstract class BpeTokenizer(
     val merges: Map[(String, String), Int],
     val vocab: Map[String, Int],
     val specialTokens: SpecialTokens,
-    val padWithSentenceTokens: Boolean,
-    val addPrefixSpace: Boolean) {
+    val padWithSequenceTokens: Boolean,
+    val addPrefixSpaceToSentence: Boolean,
+    val alwaysAddPrefix: Boolean) {
 
   protected val bpeRanks: Map[(String, String), Int] = {
     merges
@@ -60,8 +66,8 @@ private[nlp] abstract class BpeTokenizer(
   }
 
   // Can be overridden in inherited class
-  protected val prependForPieceId: Option[String] = None
-  protected val appendForPieceId: Option[String] = None
+  protected val prefixForPieceId: Option[String] = None
+  protected val suffixForPieceId: Option[String] = None
 
   protected def performMerges(
       wordChars: Array[String],
@@ -122,16 +128,16 @@ private[nlp] abstract class BpeTokenizer(
     val isWordStart = indToken.begin == indexes._1
     val isDocumentStart = indToken.begin == 0
     var processedSubWord = subWord
-    processedSubWord = if (isDocumentStart && !addPrefixSpace) {
+    processedSubWord = if (isDocumentStart && !addPrefixSpaceToSentence) {
       processedSubWord
     } else
-      prependForPieceId match {
-        case None => processedSubWord
-        case Some(prepend) =>
+      prefixForPieceId match {
+        case Some(prepend) if alwaysAddPrefix =>
          if (isWordStart && subWord.indexOf(prepend) < 0) prepend + processedSubWord
          else processedSubWord
+        case _ => processedSubWord
       }
-    processedSubWord = appendForPieceId match {
+    processedSubWord = suffixForPieceId match {
       case None => processedSubWord
       case Some(append) =>
         val isWordEnd = indToken.end == indexes._2
@@ -239,7 +245,7 @@ private[nlp] abstract class BpeTokenizer(
   }
 
   /** Needs to be implemented */
-  def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken]
+  protected def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken]
 
   /** Special tokens of the model for processing */
   val sentencePadding: (String, String) =
@@ -264,7 +270,7 @@ private[nlp] abstract class BpeTokenizer(
       textList = splitTexts.clone()
     }
 
-    if (padWithSentenceTokens) {
+    if (padWithSequenceTokens) {
       text = sentencePadding._1 + text + sentencePadding._2
       splitTexts.prepend(sentencePadding._1)
       splitTexts.append(sentencePadding._2)
@@ -310,9 +316,10 @@ object BpeTokenizer {
       modelType: String,
       merges: Map[(String, String), Int],
       vocab: Map[String, Int],
-      padWithSentenceTokens: Boolean = false,
-      addPrefixSpace: Boolean = false,
-      specialTokens: Option[SpecialTokens] = None): BpeTokenizer = {
+      padWithSequenceTokens: Boolean = false,
+      addPrefixSpaceToSentence: Boolean = false,
+      specialTokens: Option[SpecialTokens] = None,
+      alwaysAddPrefix: Boolean = true): BpeTokenizer = {
 
     def modelSpecialTokens() = specialTokens match {
       case Some(specialTok) => specialTok
@@ -325,24 +332,26 @@ object BpeTokenizer {
           merges,
           vocab,
           modelSpecialTokens(),
-          padWithSentenceTokens,
-          addPrefixSpace = addPrefixSpace)
+          padWithSequenceTokens,
+          addPrefixSpaceToSentence = addPrefixSpaceToSentence,
+          alwaysAddPrefix = alwaysAddPrefix)
       case "xlm" =>
-        new XlmTokenizer(merges, vocab, modelSpecialTokens(), padWithSentenceTokens)
+        new XlmTokenizer(merges, vocab, modelSpecialTokens(), padWithSequenceTokens)
       case "gpt2" =>
         new Gpt2Tokenizer(
           merges,
           vocab,
           modelSpecialTokens(),
-          padWithSentenceTokens,
-          addPrefixSpace = addPrefixSpace)
+          padWithSequenceTokens,
+          addPrefixSpaceToSentence = addPrefixSpaceToSentence,
+          alwaysAddPrefix = alwaysAddPrefix)
       case "bart" =>
         new BartTokenizer(
           merges,
          vocab,
          modelSpecialTokens(),
-          padWithSentenceTokens,
-          addPrefixSpace = addPrefixSpace)
+          padWithSequenceTokens,
+          addPrefixSpaceToSentence = addPrefixSpaceToSentence)
       case _ =>
         throw new IllegalArgumentException("Model type \"" + modelType + "\" not supported yet.")
     }
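A hedged usage sketch of the new alwaysAddPrefix switch introduced above; the toy vocabulary and merges mirror the style of the Gpt2TokenizerTestSpec test later in this diff, while real annotators load them from the model's vocab.json and merges.txt. With alwaysAddPrefix = false the word-start prefix held in prefixForPieceId is no longer forced onto encoded pieces, which is what the RoBERTa classification code now requests:

  import com.johnsnowlabs.nlp.annotators.common.Sentence
  import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer

  object AlwaysAddPrefixSketch extends App {
    // Toy single-character vocabulary plus the mandatory end-of-text token.
    val vocab: Map[String, Int] =
      "Ġhelo".map(_.toString).zipWithIndex.toMap ++ Seq(("<|endoftext|>", 100))
    val merges: Map[(String, String), Int] = Map.empty

    // alwaysAddPrefix = false: pieces are encoded as-is instead of being forced
    // to carry the "Ġ" word-start marker.
    val tokenizer = BpeTokenizer.forModel("gpt2", merges, vocab, alwaysAddPrefix = false)

    val tokens = tokenizer.tokenize(Sentence("hello", 0, "hello".length, 0))
    tokenizer.encode(tokens).foreach(piece => println(piece.wordpiece + " -> " + piece.pieceId))
  }

Generation models (GPT2, BART) keep the previous default of alwaysAddPrefix = true through forModel, so their behaviour is unchanged.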
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2Tokenizer.scala
index 61ecc3492e01cc..b237ee0288f899 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2Tokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2Tokenizer.scala
@@ -26,10 +26,17 @@ class Gpt2Tokenizer(
     merges: Map[(String, String), Int],
     vocab: Map[String, Int],
     specialTokens: SpecialTokens,
-    padWithSentenceTokens: Boolean = true,
+    padWithSequenceTokens: Boolean = true,
     prependString: String = "",
-    addPrefixSpace: Boolean = false)
-    extends BpeTokenizer(merges, vocab, specialTokens, padWithSentenceTokens, addPrefixSpace) {
+    addPrefixSpaceToSentence: Boolean = false,
+    alwaysAddPrefix: Boolean = true)
+    extends BpeTokenizer(
+      merges,
+      vocab,
+      specialTokens,
+      padWithSequenceTokens,
+      addPrefixSpaceToSentence,
+      alwaysAddPrefix) {
 
   /** Mapping for bytes to a different set of unicode characters (especially white spaces). This
     * improved model performance for gpt-2
@@ -53,7 +60,7 @@ class Gpt2Tokenizer(
   // Differs from Transformers, space is always prepended.
   // FIX: Space should not be prepended to all tokens, but to the beginning of the text only. Otherwise token
   // such as '.' get space prepended and they should not.
-  override val prependForPieceId: Option[String] =
+  override val prefixForPieceId: Option[String] =
     if (prependString.nonEmpty) Some(prependString) else None
 
   protected val decoderVocab: Map[Int, String] = vocab.map(x => (x._2, x._1))
@@ -62,7 +69,10 @@ class Gpt2Tokenizer(
     bytesToUnicodeMapping.map(x => (x._2, x._1))
 
   override def preProcessTokenForBpe(token: String): String = {
-    token.foldLeft("")(_ + bytesToUnicodeMapping(_))
+    token
+      .getBytes("UTF-8")
+      .map { b => if (b < 0) 256 + b else b }
+      .foldLeft("")(_ + bytesToUnicodeMapping(_))
   }
 
   val splitPattern: Regex =
@@ -71,7 +81,7 @@ class Gpt2Tokenizer(
   override def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken] = {
     // split pattern based on gpt2's bpe tokenizer
     splitPattern
-      .findAllMatchIn(if (prependForPieceId.isDefined || text.startsWith(" ")) text
+      .findAllMatchIn(if (prefixForPieceId.isDefined || text.startsWith(" ")) text
       else " " + text) // Prepend space to the beginning of text
       .map(tok => IndexedToken(tok.matched, tok.start + indexOffset, tok.end + indexOffset - 1))
       .toArray
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/RobertaTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/RobertaTokenizer.scala
index c003a70def528b..d696256abdc493 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/RobertaTokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/RobertaTokenizer.scala
@@ -20,12 +20,14 @@ class RobertaTokenizer(
     merges: Map[(String, String), Int],
     vocab: Map[String, Int],
     specialTokens: SpecialTokens,
-    padWithSentenceTokens: Boolean = false,
-    addPrefixSpace: Boolean = false)
+    padWithSequenceTokens: Boolean = false,
+    addPrefixSpaceToSentence: Boolean = false,
+    alwaysAddPrefix: Boolean = true)
     extends Gpt2Tokenizer(
       merges,
       vocab,
       specialTokens,
-      padWithSentenceTokens,
+      padWithSequenceTokens,
       prependString = "Ġ",
-      addPrefixSpace)
+      addPrefixSpaceToSentence,
+      alwaysAddPrefix)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/XlmTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/XlmTokenizer.scala
index b3bc76e65e2c19..61ec78babf307c 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/XlmTokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/XlmTokenizer.scala
@@ -37,15 +37,16 @@ private[nlp] class XlmTokenizer(
     merges: Map[(String, String), Int],
     vocab: Map[String, Int],
     specialTokens: SpecialTokens,
-    padWithSentenceTokens: Boolean = false,
+    padWithSequenceTokens: Boolean = false,
     lang: String = "en",
     doLowercaseAndRemoveAccent: Boolean = true)
     extends BpeTokenizer(
       merges,
       vocab,
       specialTokens,
-      padWithSentenceTokens,
-      addPrefixSpace = false) {
+      padWithSequenceTokens,
+      addPrefixSpaceToSentence = false,
+      alwaysAddPrefix = false) {
   require(lang == "en", "Only English is supported currently.")
 
   /** Lowercase and strips accents from a piece of text based on
@@ -93,7 +94,7 @@ private[nlp] class XlmTokenizer(
     indexedTokens
   }
 
-  override val appendForPieceId: Option[String] = Some("</w>")
+  override val suffixForPieceId: Option[String] = Some("</w>")
 
   override def bpe(indToken: IndexedToken): Array[TokenPiece] = {
     val processedToken = preProcessTokenForBpe(indToken.token)
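A standalone illustration of the byte handling that the reworked preProcessTokenForBpe relies on (plain Scala, no Spark NLP classes): JVM bytes are signed, so the UTF-8 bytes of non-Latin characters come back negative and must be shifted into the 0-255 range before they can index the byte-to-unicode table.

  object Utf8ByteSketch extends App {
    // "吳" is encoded as the three UTF-8 bytes 0xE5 0x90 0xB3; as signed JVM bytes
    // they read -27, -112, -77, so they are shifted into 0-255 first.
    val token = "吳"
    val unsigned = token.getBytes("UTF-8").map(b => if (b < 0) 256 + b else b)
    println(unsigned.mkString(" ")) // 229 144 179
  }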
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala
index a42c1b334d9d0b..4b3515a0f10495 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala
@@ -294,8 +294,7 @@ class LongformerEmbeddings(override val uid: String)
     val bpeTokenizer = BpeTokenizer.forModel(
       "roberta",
       merges = $$(merges),
-      vocab = $$(vocabulary),
-      padWithSentenceTokens = false)
+      vocab = $$(vocabulary))
 
     tokens.map { tokenIndex =>
       // filter empty and only whitespace tokens
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala
index 02c06bca1b4e77..f28574f44ee4a6 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala
@@ -308,8 +308,7 @@ class RoBertaEmbeddings(override val uid: String)
     val bpeTokenizer = BpeTokenizer.forModel(
       "roberta",
       merges = $$(merges),
-      vocab = $$(vocabulary),
-      padWithSentenceTokens = false)
+      vocab = $$(vocabulary))
 
     tokens.map { tokenIndex =>
       // filter empty and only whitespace tokens
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala
index 46ce9c24968567..7129d14b8d791f 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala
@@ -304,8 +304,7 @@ class RoBertaSentenceEmbeddings(override val uid: String)
     val bpeTokenizer = BpeTokenizer.forModel(
       "roberta",
       merges = $$(merges),
-      vocab = $$(vocabulary),
-      padWithSentenceTokens = false)
+      vocab = $$(vocabulary))
 
     sentences.map { s =>
       // filter empty and only whitespace tokens
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnsweringTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnsweringTestSpec.scala
index 44b24216eee517..fcc811acafd249 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnsweringTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnsweringTestSpec.scala
@@ -16,6 +16,7 @@
 
 package com.johnsnowlabs.nlp.annotators.classifier.dl
 
+import com.johnsnowlabs.nlp.Annotation
 import com.johnsnowlabs.nlp.base._
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
 import com.johnsnowlabs.tags.SlowTest
@@ -97,7 +98,7 @@ class RoBertaForQuestionAnsweringTestSpec extends AnyFlatSpec {
     val pipelineDF = pipelineModel.transform(data)
 
     Benchmark.time("Time to show RoBertaForQuestionAnswering results") {
-      pipelineDF.select("answer").show(10, false)
+      pipelineDF.select("answer").show(10, truncate = false)
     }
 
     Benchmark.time("Time to save RoBertaForQuestionAnswering results") {
@@ -107,6 +108,53 @@ class RoBertaForQuestionAnsweringTestSpec extends AnyFlatSpec {
         .mode("overwrite")
         .parquet("./tmp_question_answering")
     }
+  }
+
+  "RoBertaForQuestionAnswering" should "produce correct score and index" taggedAs SlowTest in {
+
+    val context = "My name is Sarah and I live in London"
+    val ddd = Seq(("Where do I live?", context))
+      .toDF("question", "context")
+      .repartition(1)
+
+    val document = new MultiDocumentAssembler()
+      .setInputCols("question", "context")
+      .setOutputCols("document_question", "document_context")
+
+    val questionAnswering = RoBertaForQuestionAnswering
+      .pretrained()
+      .setInputCols(Array("document_question", "document_context"))
+      .setOutputCol("answer")
+      .setCaseSensitive(true)
+      .setMaxSentenceLength(512)
+
+    val pipeline = new Pipeline().setStages(Array(document, questionAnswering))
+
+    val pipelineModel = pipeline.fit(ddd)
+    val pipelineDF = pipelineModel.transform(ddd)
+    pipelineDF.select("answer").show(truncate = false)
+
+    /* Expected:
+      {
+        "score": 0.7772300839424133,
+        "start": 31,
+        "end": 37,
+        "answer": "London"
+      }
+     */
+    val expectedScore: Float = 0.7772300839424133f
+    val expectedAnswer: String = "London"
+    val result = Annotation.collect(pipelineDF, "answer").head.head
+
+    val indexedAnswer: String =
+      context.slice(result.metadata("start").toInt + 1, result.metadata("end").toInt + 1)
+    val score: Float = result.metadata("score").toFloat
+
+    assert(result.result == expectedAnswer)
+    assert(indexedAnswer == expectedAnswer, "Indexes don't seem to match")
+
+    import com.johnsnowlabs.util.TestUtils.tolerantFloatEq
+    assert(score === expectedScore, "Score was not close enough")
   }
 
 }
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizerBehaviours.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizerBehaviours.scala
index 9a74a68070f175..9350fd57fd0919 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizerBehaviours.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizerBehaviours.scala
@@ -74,7 +74,7 @@ trait BpeTokenizerBehaviours {
 
     it should "add sentence padding correctly if requested" taggedAs FastTest in {
       val sentencePaddingTokenizer =
-        BpeTokenizer.forModel(modelType, merges, vocab, padWithSentenceTokens = true)
+        BpeTokenizer.forModel(modelType, merges, vocab, padWithSequenceTokens = true)
 
       val (tokenized: Array[IndexedToken], encoded: Array[TokenPiece]) =
         tokenizeAndEncode(sentencePaddingTokenizer, text)
@@ -93,7 +93,7 @@ trait BpeTokenizerBehaviours {
       expectedIds: Array[Int]): Unit = {
     it should "encode words correctly with added prefix" taggedAs FastTest in {
       val addedPrefixTokenizer =
-        BpeTokenizer.forModel(modelType, merges, vocab, addPrefixSpace = true)
+        BpeTokenizer.forModel(modelType, merges, vocab, addPrefixSpaceToSentence = true)
 
       val (_, encoded: Array[TokenPiece]) = tokenizeAndEncode(addedPrefixTokenizer, text)
       assertEncodedCorrectly(text, encoded, expected, expectedIds)
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2TokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2TokenizerTestSpec.scala
index 13c20af5afc53b..869dc931a5602f 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2TokenizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2TokenizerTestSpec.scala
@@ -16,6 +16,8 @@
 
 package com.johnsnowlabs.nlp.annotators.tokenizer.bpe
 
+import com.johnsnowlabs.nlp.annotators.common.Sentence
+import com.johnsnowlabs.tags.FastTest
 import org.scalatest.flatspec.AnyFlatSpec
 
 class Gpt2TokenizerTestSpec extends AnyFlatSpec with BpeTokenizerBehaviours {
@@ -91,4 +93,26 @@ class Gpt2TokenizerTestSpec extends AnyFlatSpec with BpeTokenizerBehaviours {
         "d",
         "<|endoftext|>"),
     expectedIds = Array(1, 2, 3, 4, 5, 6, 0, 7, 8, 9, 10, 0))
+
+  it should "encode non latin tokens" taggedAs FastTest in {
+    val text = "吳天恩"
+
+    val vocab: Map[String, Int] =
+      "ĠåĲ³å¤©æģ©".map(_.toString).zipWithIndex.toMap ++ Seq(("<|endoftext|>", 100))
+
+    val merges: Map[(String, String), Int] = Map.empty
+
+    val bpeTokenizer = BpeTokenizer.forModel(modelType, merges, vocab, alwaysAddPrefix = false)
+
+    val indexedTokens =
+      bpeTokenizer.tokenize(Sentence(text, 0, text.length, 0))
+
+    val encodedTokens = bpeTokenizer.encode(indexedTokens)
+
+    assert(
+      encodedTokens.forall(_.pieceId != bpeTokenizer.specialTokens.unk.id),
+      "Tokens should be able to be encoded.")
+
+  }
 }
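The toy vocabulary in the new Gpt2TokenizerTestSpec test is just the byte-level rendering of " 吳天恩" (leading space included): every UTF-8 byte is pushed through GPT-2's byte-to-unicode table, which is what the reworked preProcessTokenForBpe produces. A standalone sketch of that table, re-derived here from the published GPT-2 scheme rather than copied from Gpt2Tokenizer's internal mapping:

  object ByteToUnicodeSketch extends App {
    // Visible latin-1 bytes keep their own code point; everything else is shifted
    // into the 256+ range so each byte maps to a printable character.
    val keptBytes: Seq[Int] =
      ('!'.toInt to '~'.toInt) ++ ('¡'.toInt to '¬'.toInt) ++ ('®'.toInt to 'ÿ'.toInt)

    val mapping: Map[Int, Char] = {
      var shift = 0
      (0 until 256).map { b =>
        if (keptBytes.contains(b)) b -> b.toChar
        else {
          val c = (256 + shift).toChar
          shift += 1
          b -> c
        }
      }.toMap
    }

    def toUnicode(text: String): String =
      text.getBytes("UTF-8").map(b => mapping(b & 0xff)).mkString

    println(toUnicode(" 吳天恩")) // ĠåĲ³å¤©æģ©
  }

Running it prints ĠåĲ³å¤©æģ©, the exact characters the test places in its vocabulary, which is why none of the encoded pieces fall back to the unknown token.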