diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala
index f592d16ca8f442..87934db7686034 100644
--- a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala
@@ -46,7 +46,7 @@ private[johnsnowlabs] class Bart(
     with Generate {
 
   val bpeTokenizer: BartTokenizer = BpeTokenizer
-    .forModel("bart", merges = merges, vocab = vocabulary, padWithSentenceTokens = false)
+    .forModel("bart", merges = merges, vocab = vocabulary, padWithSequenceTokens = false)
     .asInstanceOf[BartTokenizer]
 
   private val _tfBartSignatures: Map[String, String] =
     signatures.getOrElse(ModelSignatureManager.apply())
diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala
index 3e80bedef517b1..1ce81583db7e55 100644
--- a/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala
@@ -18,10 +18,11 @@ package com.johnsnowlabs.ml.ai
 
 import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager}
 import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper}
+import com.johnsnowlabs.ml.util.TensorFlow
 import com.johnsnowlabs.nlp.annotators.common._
 import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer
 import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder}
-import com.johnsnowlabs.nlp.{ActivationFunction, Annotation}
+import com.johnsnowlabs.nlp.{ActivationFunction, Annotation, AnnotatorType}
 import org.tensorflow.ndarray.buffer.IntDataBuffer
 
 import scala.collection.JavaConverters._
@@ -63,7 +64,8 @@ private[johnsnowlabs] class RoBertaClassification(
       maxSeqLength: Int,
       caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = {
 
-    val bpeTokenizer = BpeTokenizer.forModel("roberta", merges, vocabulary)
+    val bpeTokenizer =
+      BpeTokenizer.forModel("roberta", merges, vocabulary, alwaysAddPrefix = false)
 
     sentences.map { tokenIndex =>
       // filter empty and only whitespace tokens
@@ -106,7 +108,8 @@ private[johnsnowlabs] class RoBertaClassification(
       caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = {
     // we need the original form of the token
     // let's lowercase if needed right before the encoding
-    val bpeTokenizer = BpeTokenizer.forModel("roberta", merges, vocabulary)
+    val bpeTokenizer =
+      BpeTokenizer.forModel("roberta", merges, vocabulary, alwaysAddPrefix = false)
     val sentences = docs.map { s => Sentence(s.result, s.begin, s.end, 0) }
 
     sentences.map { sentence =>
@@ -115,12 +118,10 @@ private[johnsnowlabs] class RoBertaClassification(
       val sentenceEnd = sentence.end
       val sentenceIndex = sentence.index
 
-      // TODO: we should implement dedicated the tokenize and tokenizeSubText methods for full a sentence rather than token by token
       val indexedTokens =
         bpeTokenizer.tokenize(Sentence(content, sentenceBegin, sentenceEnd, sentenceIndex))
 
-      val wordpieceTokens =
-        indexedTokens.flatMap(token => bpeTokenizer.encode(token)).take(maxSeqLength)
+      val wordpieceTokens = bpeTokenizer.encode(indexedTokens).take(maxSeqLength)
 
       WordpieceTokenizedSentence(wordpieceTokens)
     }
@@ -372,12 +373,10 @@ private[johnsnowlabs] class RoBertaClassification(
     tensors.clearTensors()
 
     val endDim = endLogits.length / batchLength
-    val endScores: Array[Array[Float]] =
-      endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray
+    val endScores: Array[Array[Float]] = endLogits.grouped(endDim).toArray
 
     val startDim = startLogits.length / batchLength
-    val startScores: Array[Array[Float]] =
-      startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray
+    val startScores: Array[Array[Float]] = startLogits.grouped(startDim).toArray
 
     (startScores, endScores)
   }
@@ -389,4 +388,134 @@ private[johnsnowlabs] class RoBertaClassification(
       tokenizedSentences(sentence._2).indexedTokens.find(p => p.begin == tokenPiece.begin)
   }
 
+  /** Encodes two sequences to be compatible with the RoBerta models.
+    *
+    * Unlike other models, RoBerta requires two eos tokens to join two sequences.
+    *
+    * For example, the pair of sequences A, B should be joined to: `<s> A </s></s> B </s>`
+    */
+  override def encodeSequence(
+      seq1: Seq[WordpieceTokenizedSentence],
+      seq2: Seq[WordpieceTokenizedSentence],
+      maxSequenceLength: Int): Seq[Array[Int]] = {
+
+    val question = seq1
+      .flatMap { wpTokSentence =>
+        wpTokSentence.tokens.map(t => t.pieceId)
+      }
+      .toArray
+      .take(maxSequenceLength - 2) ++ Array(sentenceEndTokenId, sentenceEndTokenId)
+
+    val context = seq2
+      .flatMap { wpTokSentence =>
+        wpTokSentence.tokens.map(t => t.pieceId)
+      }
+      .toArray
+      .take(maxSequenceLength - question.length - 2) ++ Array(sentenceEndTokenId)
+
+    Seq(Array(sentenceStartTokenId) ++ question ++ context)
+  }
+
+  /** Calculates the normalized softmax probabilities.
+    *
+    * @param scores
+    *   Raw logits
+    * @return
+    *   Normalized softmax probabilities
+    */
+  private def normalizedSoftmax(scores: Array[Float]): Array[Float] = {
+    val max = scores.max
+    calculateSoftmax(scores.map(_ - max))
+  }
+
+  override def predictSpan(
+      documents: Seq[Annotation],
+      maxSentenceLength: Int,
+      caseSensitive: Boolean,
+      mergeTokenStrategy: String = MergeTokenStrategy.vocab,
+      engine: String = TensorFlow.name): Seq[Annotation] = {
+
+    val questionAnnot = Seq(documents.head)
+    val contextAnnot = documents.drop(1)
+
+    val wordPieceTokenizedQuestion =
+      tokenizeDocument(questionAnnot, maxSentenceLength, caseSensitive)
+    val wordPieceTokenizedContext =
+      tokenizeDocument(contextAnnot, maxSentenceLength, caseSensitive)
+    val questionLength = wordPieceTokenizedQuestion.head.tokens.length
+
+    val encodedInput =
+      encodeSequence(wordPieceTokenizedQuestion, wordPieceTokenizedContext, maxSentenceLength)
+    val (startLogits, endLogits) = tagSpan(encodedInput)
+
+    /** Sets log-logits to (almost) 0 for question and padding tokens so they can't contribute to
+      * the final softmax score.
+      *
+      * @param scores
+      *   Logits of the combined sequences
+      * @return
+      *   Scores, with unwanted tokens set to log-probability 0
+      */
+    def maskUndesiredTokens(scores: Array[Float]): Array[Float] = {
+      scores.zipWithIndex.map { case (score, i) =>
+        // 3 added special tokens in encoded sequence (1 bos, 2 eos)
+        if ((i > 0 && i < questionLength + 3) || i == encodedInput.head.length - 1)
+          -10000.0f
+        else score
+      }
+    }
+
+    val processedStartLogits = startLogits.map { scores =>
+      normalizedSoftmax(maskUndesiredTokens(scores))
+    }
+    val processedEndLogits = endLogits.map { scores =>
+      normalizedSoftmax(maskUndesiredTokens(scores))
+    }
+
+    val startScores = processedStartLogits.transpose.map(_.sum / startLogits.length)
+    val endScores = processedEndLogits.transpose.map(_.sum / endLogits.length)
+
+    // Drop BOS token from valid results
+    val startIndex = startScores.zipWithIndex.drop(1).maxBy(_._1)
+    val endIndex = endScores.zipWithIndex.drop(1).maxBy(_._1)
+
+    val offsetStartIndex = 3 // 3 added special tokens
+    val offsetEndIndex = offsetStartIndex - 1
+
+    val allTokenPieces =
+      wordPieceTokenizedQuestion.head.tokens ++ wordPieceTokenizedContext.flatMap(x => x.tokens)
+    val decodedAnswer =
+      allTokenPieces.slice(startIndex._2 - offsetStartIndex, endIndex._2 - offsetEndIndex)
+    val content =
+      mergeTokenStrategy match {
+        case MergeTokenStrategy.vocab =>
+          decodedAnswer.filter(_.isWordStart).map(x => x.token).mkString(" ")
+        case MergeTokenStrategy.sentencePiece =>
+          val token = ""
+          decodedAnswer
+            .map(x =>
+              if (x.isWordStart) " " + token + x.token
+              else token + x.token)
+            .mkString("")
+            .trim
+      }
+
+    val totalScore = startIndex._1 * endIndex._1
+    Seq(
+      Annotation(
+        annotatorType = AnnotatorType.CHUNK,
+        begin = 0,
+        end = if (content.isEmpty) 0 else content.length - 1,
+        result = content,
+        metadata = Map(
+          "sentence" -> "0",
+          "chunk" -> "0",
+          "start" -> decodedAnswer.head.begin.toString,
+          "start_score" -> startIndex._1.toString,
+          "end" -> decodedAnswer.last.end.toString,
+          "end_score" -> endIndex._1.toString,
+          "score" -> totalScore.toString)))
+
+  }
+
 }
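The two methods above drive the new question-answering path: encodeSequence lays the input out as `<s> question </s></s> context </s>`, and predictSpan masks the question and padding positions before softmaxing the logits. A minimal, self-contained sketch of that layout; the token ids below are invented for illustration and come from no real vocabulary:

  object RobertaQaEncodingSketch extends App {
    val bosId = 0 // <s>
    val eosId = 2 // </s>

    // Mirrors encodeSequence: the question gets two trailing eos tokens, the context one.
    def encode(question: Seq[Int], context: Seq[Int], maxLen: Int): Array[Int] = {
      val q = question.take(maxLen - 2).toArray ++ Array(eosId, eosId)
      val c = context.take(maxLen - q.length - 2).toArray ++ Array(eosId)
      Array(bosId) ++ q ++ c
    }

    // Prints: 0 10 11 2 2 20 21 22 2   (i.e. <s> Q </s></s> C </s>)
    println(encode(Seq(10, 11), Seq(20, 21, 22), maxLen = 128).mkString(" "))
  }

The context allotment shrinks by the (already eos-padded) question length, so the combined sequence stays within maxSequenceLength.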
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala
index 29d76fcd0dea17..332177a51b2d63 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala
@@ -402,8 +402,7 @@ class GPT2Transformer(override val uid: String)
         .forModel(
           "gpt2",
           merges = $$(merges),
-          vocab = $$(vocabulary),
-          padWithSentenceTokens = false)
+          vocab = $$(vocabulary))
         .asInstanceOf[Gpt2Tokenizer]
 
       _tfModel = Some(
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BartTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BartTokenizer.scala
index c5b250578ec81d..801a1f38d54a0d 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BartTokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BartTokenizer.scala
@@ -20,12 +20,12 @@ class BartTokenizer(
     merges: Map[(String, String), Int],
     vocab: Map[String, Int],
     specialTokens: SpecialTokens,
-    padWithSentenceTokens: Boolean = false,
-    addPrefixSpace: Boolean = false)
+    padWithSequenceTokens: Boolean = false,
+    addPrefixSpaceToSentence: Boolean = false)
     extends Gpt2Tokenizer(
       merges,
       vocab,
       specialTokens,
-      padWithSentenceTokens,
+      padWithSequenceTokens,
       prependString = "Ġ",
-      addPrefixSpace)
+      addPrefixSpaceToSentence)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
index ec08ade3ae9c58..5281af3867b4f3 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
@@ -23,24 +23,30 @@ import scala.collection.mutable
 import scala.collection.mutable.ListBuffer
 
 /** A BPE Tokenizer based on GPT2's tokenization scheme. The tokenization can then be used for
-  * models based on this scheme (e.g. GPT2, roBERTa, DeBERTa) TODO: truncation assumed?
+  * models based on this scheme (e.g. GPT2, roBERTa, DeBERTa)
+  *
+  * TODO: truncation assumed?
+  *
   * @param merges
   *   Map of tokens that are mergeable
   * @param vocab
   *   Map of tokens to encoded representation
   * @param specialTokens
   *   Collection of special tokens
-  * @param padWithSentenceTokens
+  * @param padWithSequenceTokens
   *   Whether to pad the sentence with sentence tokens at the start and end
-  * @param addPrefixSpace
+  * @param addPrefixSpaceToSentence
   *   Whether to add a space to the first word of a sentence
+  * @param alwaysAddPrefix
+  *   Whether to always prefix token ids with `prefixForPieceId`
   */
 private[nlp] abstract class BpeTokenizer(
     val merges: Map[(String, String), Int],
     val vocab: Map[String, Int],
     val specialTokens: SpecialTokens,
-    val padWithSentenceTokens: Boolean,
-    val addPrefixSpace: Boolean) {
+    val padWithSequenceTokens: Boolean,
+    val addPrefixSpaceToSentence: Boolean,
+    val alwaysAddPrefix: Boolean) {
 
   protected val bpeRanks: Map[(String, String), Int] = {
     merges
@@ -60,8 +66,8 @@ private[nlp] abstract class BpeTokenizer(
   }
 
   // Can be overridden in inherited class
-  protected val prependForPieceId: Option[String] = None
-  protected val appendForPieceId: Option[String] = None
+  protected val prefixForPieceId: Option[String] = None
+  protected val suffixForPieceId: Option[String] = None
 
   protected def performMerges(
       wordChars: Array[String],
@@ -122,16 +128,16 @@ private[nlp] abstract class BpeTokenizer(
     val isWordStart = indToken.begin == indexes._1
     val isDocumentStart = indToken.begin == 0
     var processedSubWord = subWord
-    processedSubWord = if (isDocumentStart && !addPrefixSpace) {
+    processedSubWord = if (isDocumentStart && !addPrefixSpaceToSentence) {
       processedSubWord
     } else
-      prependForPieceId match {
-        case None => processedSubWord
-        case Some(prepend) =>
+      prefixForPieceId match {
+        case Some(prepend) if alwaysAddPrefix =>
          if (isWordStart && subWord.indexOf(prepend) < 0) prepend + processedSubWord
          else processedSubWord
+        case _ => processedSubWord
       }
-    processedSubWord = appendForPieceId match {
+    processedSubWord = suffixForPieceId match {
       case None => processedSubWord
       case Some(append) =>
         val isWordEnd = indToken.end == indexes._2
@@ -239,7 +245,7 @@ private[nlp] abstract class BpeTokenizer(
   }
 
   /** Needs to be implemented */
-  def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken]
+  protected def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken]
 
   /** Special tokens of the model for processing */
   val sentencePadding: (String, String) =
@@ -264,7 +270,7 @@ private[nlp] abstract class BpeTokenizer(
       textList = splitTexts.clone()
     }
 
-    if (padWithSentenceTokens) {
+    if (padWithSequenceTokens) {
       text = sentencePadding._1 + text + sentencePadding._2
       splitTexts.prepend(sentencePadding._1)
       splitTexts.append(sentencePadding._2)
@@ -310,9 +316,10 @@ object BpeTokenizer {
       modelType: String,
       merges: Map[(String, String), Int],
       vocab: Map[String, Int],
-      padWithSentenceTokens: Boolean = false,
-      addPrefixSpace: Boolean = false,
-      specialTokens: Option[SpecialTokens] = None): BpeTokenizer = {
+      padWithSequenceTokens: Boolean = false,
+      addPrefixSpaceToSentence: Boolean = false,
+      specialTokens: Option[SpecialTokens] = None,
+      alwaysAddPrefix: Boolean = true): BpeTokenizer = {
 
     def modelSpecialTokens() = specialTokens match {
       case Some(specialTok) => specialTok
@@ -325,24 +332,26 @@ object BpeTokenizer {
           merges,
           vocab,
           modelSpecialTokens(),
-          padWithSentenceTokens,
-          addPrefixSpace = addPrefixSpace)
+          padWithSequenceTokens,
+          addPrefixSpaceToSentence = addPrefixSpaceToSentence,
+          alwaysAddPrefix = alwaysAddPrefix)
       case "xlm" =>
-        new XlmTokenizer(merges, vocab, modelSpecialTokens(), padWithSentenceTokens)
+        new XlmTokenizer(merges, vocab, modelSpecialTokens(), padWithSequenceTokens)
       case "gpt2" =>
         new Gpt2Tokenizer(
           merges,
           vocab,
           modelSpecialTokens(),
-          padWithSentenceTokens,
-          addPrefixSpace = addPrefixSpace)
+          padWithSequenceTokens,
+          addPrefixSpaceToSentence = addPrefixSpaceToSentence,
+          alwaysAddPrefix = alwaysAddPrefix)
       case "bart" =>
         new BartTokenizer(
           merges,
          vocab,
          modelSpecialTokens(),
-          padWithSentenceTokens,
-          addPrefixSpace = addPrefixSpace)
+          padWithSequenceTokens,
+          addPrefixSpaceToSentence = addPrefixSpaceToSentence)
       case _ =>
         throw new IllegalArgumentException("Model type \"" + modelType + "\" not supported yet.")
     }
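A hedged usage sketch of the new alwaysAddPrefix switch introduced above; the toy vocabulary and merges mirror the style of the Gpt2TokenizerTestSpec test later in this diff, while real annotators load them from the model's vocab.json and merges.txt. With alwaysAddPrefix = false the word-start prefix held in prefixForPieceId is no longer forced onto encoded pieces, which is what the RoBERTa classification code now requests:

  import com.johnsnowlabs.nlp.annotators.common.Sentence
  import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer

  object AlwaysAddPrefixSketch extends App {
    // Toy single-character vocabulary plus the mandatory end-of-text token.
    val vocab: Map[String, Int] =
      "Ġhelo".map(_.toString).zipWithIndex.toMap ++ Seq(("<|endoftext|>", 100))
    val merges: Map[(String, String), Int] = Map.empty

    // alwaysAddPrefix = false: pieces are encoded as-is instead of being forced
    // to carry the "Ġ" word-start marker.
    val tokenizer = BpeTokenizer.forModel("gpt2", merges, vocab, alwaysAddPrefix = false)

    val tokens = tokenizer.tokenize(Sentence("hello", 0, "hello".length, 0))
    tokenizer.encode(tokens).foreach(piece => println(piece.wordpiece + " -> " + piece.pieceId))
  }

Generation models (GPT2, BART) keep the previous default of alwaysAddPrefix = true through forModel, so their behaviour is unchanged.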
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2Tokenizer.scala
index 61ecc3492e01cc..b237ee0288f899 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2Tokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2Tokenizer.scala
@@ -26,10 +26,17 @@ class Gpt2Tokenizer(
     merges: Map[(String, String), Int],
     vocab: Map[String, Int],
     specialTokens: SpecialTokens,
-    padWithSentenceTokens: Boolean = true,
+    padWithSequenceTokens: Boolean = true,
     prependString: String = "",
-    addPrefixSpace: Boolean = false)
-    extends BpeTokenizer(merges, vocab, specialTokens, padWithSentenceTokens, addPrefixSpace) {
+    addPrefixSpaceToSentence: Boolean = false,
+    alwaysAddPrefix: Boolean = true)
+    extends BpeTokenizer(
+      merges,
+      vocab,
+      specialTokens,
+      padWithSequenceTokens,
+      addPrefixSpaceToSentence,
+      alwaysAddPrefix) {
 
   /** Mapping for bytes to a different set of unicode characters (especially white spaces). This
     * improved model performance for gpt-2
@@ -53,7 +60,7 @@ class Gpt2Tokenizer(
   // Differs from Transformers, space is always prepended.
   // FIX: Space should not be prepended to all tokens, but to the beginning of the text only. Otherwise token
   // such as '.' get space prepended and they should not.
-  override val prependForPieceId: Option[String] =
+  override val prefixForPieceId: Option[String] =
     if (prependString.nonEmpty) Some(prependString) else None
 
   protected val decoderVocab: Map[Int, String] = vocab.map(x => (x._2, x._1))
@@ -62,7 +69,10 @@ class Gpt2Tokenizer(
     bytesToUnicodeMapping.map(x => (x._2, x._1))
 
   override def preProcessTokenForBpe(token: String): String = {
-    token.foldLeft("")(_ + bytesToUnicodeMapping(_))
+    token
+      .getBytes("UTF-8")
+      .map { b => if (b < 0) 256 + b else b }
+      .foldLeft("")(_ + bytesToUnicodeMapping(_))
   }
 
   val splitPattern: Regex =
@@ -71,7 +81,7 @@ class Gpt2Tokenizer(
   override def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken] = {
     // split pattern based on gpt2's bpe tokenizer
     splitPattern
-      .findAllMatchIn(if (prependForPieceId.isDefined || text.startsWith(" ")) text
+      .findAllMatchIn(if (prefixForPieceId.isDefined || text.startsWith(" ")) text
       else " " + text) // Prepend space to the beginning of text
       .map(tok => IndexedToken(tok.matched, tok.start + indexOffset, tok.end + indexOffset - 1))
       .toArray
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/RobertaTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/RobertaTokenizer.scala
index c003a70def528b..d696256abdc493 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/RobertaTokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/RobertaTokenizer.scala
@@ -20,12 +20,14 @@ class RobertaTokenizer(
     merges: Map[(String, String), Int],
     vocab: Map[String, Int],
     specialTokens: SpecialTokens,
-    padWithSentenceTokens: Boolean = false,
-    addPrefixSpace: Boolean = false)
+    padWithSequenceTokens: Boolean = false,
+    addPrefixSpaceToSentence: Boolean = false,
+    alwaysAddPrefix: Boolean = true)
     extends Gpt2Tokenizer(
       merges,
       vocab,
       specialTokens,
-      padWithSentenceTokens,
+      padWithSequenceTokens,
       prependString = "Ġ",
-      addPrefixSpace)
+      addPrefixSpaceToSentence,
+      alwaysAddPrefix)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/XlmTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/XlmTokenizer.scala
index b3bc76e65e2c19..61ec78babf307c 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/XlmTokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/XlmTokenizer.scala
@@ -37,15 +37,16 @@ private[nlp] class XlmTokenizer(
     merges: Map[(String, String), Int],
     vocab: Map[String, Int],
     specialTokens: SpecialTokens,
-    padWithSentenceTokens: Boolean = false,
+    padWithSequenceTokens: Boolean = false,
     lang: String = "en",
     doLowercaseAndRemoveAccent: Boolean = true)
     extends BpeTokenizer(
       merges,
       vocab,
       specialTokens,
-      padWithSentenceTokens,
-      addPrefixSpace = false) {
+      padWithSequenceTokens,
+      addPrefixSpaceToSentence = false,
+      alwaysAddPrefix = false) {
   require(lang == "en", "Only English is supported currently.")
 
   /** Lowercase and strips accents from a piece of text based on
@@ -93,7 +94,7 @@ private[nlp] class XlmTokenizer(
     indexedTokens
   }
 
-  override val appendForPieceId: Option[String] = Some("</w>")
+  override val suffixForPieceId: Option[String] = Some("</w>")
 
   override def bpe(indToken: IndexedToken): Array[TokenPiece] = {
     val processedToken = preProcessTokenForBpe(indToken.token)
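A standalone illustration of the byte handling that the reworked preProcessTokenForBpe relies on (plain Scala, no Spark NLP classes): JVM bytes are signed, so the UTF-8 bytes of non-Latin characters come back negative and must be shifted into the 0-255 range before they can index the byte-to-unicode table.

  object Utf8ByteSketch extends App {
    // "吳" is encoded as the three UTF-8 bytes 0xE5 0x90 0xB3; as signed JVM bytes
    // they read -27, -112, -77, so they are shifted into 0-255 first.
    val token = "吳"
    val unsigned = token.getBytes("UTF-8").map(b => if (b < 0) 256 + b else b)
    println(unsigned.mkString(" ")) // 229 144 179
  }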
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala
index a42c1b334d9d0b..4b3515a0f10495 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala
@@ -294,8 +294,7 @@ class LongformerEmbeddings(override val uid: String)
     val bpeTokenizer = BpeTokenizer.forModel(
       "roberta",
       merges = $$(merges),
-      vocab = $$(vocabulary),
-      padWithSentenceTokens = false)
+      vocab = $$(vocabulary))
 
     tokens.map { tokenIndex =>
       // filter empty and only whitespace tokens
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala
index 02c06bca1b4e77..f28574f44ee4a6 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala
@@ -308,8 +308,7 @@ class RoBertaEmbeddings(override val uid: String)
     val bpeTokenizer = BpeTokenizer.forModel(
       "roberta",
       merges = $$(merges),
-      vocab = $$(vocabulary),
-      padWithSentenceTokens = false)
+      vocab = $$(vocabulary))
 
     tokens.map { tokenIndex =>
       // filter empty and only whitespace tokens
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala
index 46ce9c24968567..7129d14b8d791f 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala
@@ -304,8 +304,7 @@ class RoBertaSentenceEmbeddings(override val uid: String)
     val bpeTokenizer = BpeTokenizer.forModel(
       "roberta",
       merges = $$(merges),
-      vocab = $$(vocabulary),
-      padWithSentenceTokens = false)
+      vocab = $$(vocabulary))
 
     sentences.map { s =>
       // filter empty and only whitespace tokens
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnsweringTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnsweringTestSpec.scala
index 44b24216eee517..fcc811acafd249 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnsweringTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnsweringTestSpec.scala
@@ -16,6 +16,7 @@
 
 package com.johnsnowlabs.nlp.annotators.classifier.dl
 
+import com.johnsnowlabs.nlp.Annotation
 import com.johnsnowlabs.nlp.base._
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
 import com.johnsnowlabs.tags.SlowTest
@@ -97,7 +98,7 @@ class RoBertaForQuestionAnsweringTestSpec extends AnyFlatSpec {
     val pipelineDF = pipelineModel.transform(data)
 
     Benchmark.time("Time to show RoBertaForQuestionAnswering results") {
-      pipelineDF.select("answer").show(10, false)
+      pipelineDF.select("answer").show(10, truncate = false)
     }
 
     Benchmark.time("Time to save RoBertaForQuestionAnswering results") {
@@ -107,6 +108,53 @@ class RoBertaForQuestionAnsweringTestSpec extends AnyFlatSpec {
         .mode("overwrite")
         .parquet("./tmp_question_answering")
     }
+  }
+
+  "RoBertaForQuestionAnswering" should "produce correct score and index" taggedAs SlowTest in {
+
+    val context = "My name is Sarah and I live in London"
+    val ddd = Seq(("Where do I live?", context))
+      .toDF("question", "context")
+      .repartition(1)
+
+    val document = new MultiDocumentAssembler()
+      .setInputCols("question", "context")
+      .setOutputCols("document_question", "document_context")
+
+    val questionAnswering = RoBertaForQuestionAnswering
+      .pretrained()
+      .setInputCols(Array("document_question", "document_context"))
+      .setOutputCol("answer")
+      .setCaseSensitive(true)
+      .setMaxSentenceLength(512)
+
+    val pipeline = new Pipeline().setStages(Array(document, questionAnswering))
+
+    val pipelineModel = pipeline.fit(ddd)
+    val pipelineDF = pipelineModel.transform(ddd)
+    pipelineDF.select("answer").show(truncate = false)
+
+    /* Expected:
+      {
+        "score": 0.7772300839424133,
+        "start": 31,
+        "end": 37,
+        "answer": "London"
+      }
+     */
+    val expectedScore: Float = 0.7772300839424133f
+    val expectedAnswer: String = "London"
+    val result = Annotation.collect(pipelineDF, "answer").head.head
+
+    val indexedAnswer: String =
+      context.slice(result.metadata("start").toInt + 1, result.metadata("end").toInt + 1)
+    val score: Float = result.metadata("score").toFloat
+
+    assert(result.result == expectedAnswer)
+    assert(indexedAnswer == expectedAnswer, "Indexes don't seem to match")
+
+    import com.johnsnowlabs.util.TestUtils.tolerantFloatEq
+    assert(score === expectedScore, "Score was not close enough")
   }
 
 }
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizerBehaviours.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizerBehaviours.scala
index 9a74a68070f175..9350fd57fd0919 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizerBehaviours.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizerBehaviours.scala
@@ -74,7 +74,7 @@ trait BpeTokenizerBehaviours {
 
     it should "add sentence padding correctly if requested" taggedAs FastTest in {
       val sentencePaddingTokenizer =
-        BpeTokenizer.forModel(modelType, merges, vocab, padWithSentenceTokens = true)
+        BpeTokenizer.forModel(modelType, merges, vocab, padWithSequenceTokens = true)
 
       val (tokenized: Array[IndexedToken], encoded: Array[TokenPiece]) =
         tokenizeAndEncode(sentencePaddingTokenizer, text)
@@ -93,7 +93,7 @@ trait BpeTokenizerBehaviours {
       expectedIds: Array[Int]): Unit = {
     it should "encode words correctly with added prefix" taggedAs FastTest in {
       val addedPrefixTokenizer =
-        BpeTokenizer.forModel(modelType, merges, vocab, addPrefixSpace = true)
+        BpeTokenizer.forModel(modelType, merges, vocab, addPrefixSpaceToSentence = true)
 
       val (_, encoded: Array[TokenPiece]) = tokenizeAndEncode(addedPrefixTokenizer, text)
       assertEncodedCorrectly(text, encoded, expected, expectedIds)
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2TokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2TokenizerTestSpec.scala
index 13c20af5afc53b..869dc931a5602f 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2TokenizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2TokenizerTestSpec.scala
@@ -16,6 +16,8 @@
 
 package com.johnsnowlabs.nlp.annotators.tokenizer.bpe
 
+import com.johnsnowlabs.nlp.annotators.common.Sentence
+import com.johnsnowlabs.tags.FastTest
 import org.scalatest.flatspec.AnyFlatSpec
 
 class Gpt2TokenizerTestSpec extends AnyFlatSpec with BpeTokenizerBehaviours {
@@ -91,4 +93,26 @@ class Gpt2TokenizerTestSpec extends AnyFlatSpec with BpeTokenizerBehaviours {
         "d",
         "<|endoftext|>"),
     expectedIds = Array(1, 2, 3, 4, 5, 6, 0, 7, 8, 9, 10, 0))
+
+  it should "encode non latin tokens" taggedAs FastTest in {
+    val text = "吳天恩"
+
+    val vocab: Map[String, Int] =
+      "ĠåĲ³å¤©æģ©".map(_.toString).zipWithIndex.toMap ++ Seq(("<|endoftext|>", 100))
+
+    val merges: Map[(String, String), Int] = Map.empty
+
+    val bpeTokenizer = BpeTokenizer.forModel(modelType, merges, vocab, alwaysAddPrefix = false)
+
+    val indexedTokens =
+      bpeTokenizer.tokenize(Sentence(text, 0, text.length, 0))
+
+    val encodedTokens = bpeTokenizer.encode(indexedTokens)
+
+    assert(
+      encodedTokens.forall(_.pieceId != bpeTokenizer.specialTokens.unk.id),
+      "Tokens should be able to be encoded.")
+
+  }
 }
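The toy vocabulary in the new Gpt2TokenizerTestSpec test is just the byte-level rendering of " 吳天恩" (leading space included): every UTF-8 byte is pushed through GPT-2's byte-to-unicode table, which is what the reworked preProcessTokenForBpe produces. A standalone sketch of that table, re-derived here from the published GPT-2 scheme rather than copied from Gpt2Tokenizer's internal mapping:

  object ByteToUnicodeSketch extends App {
    // Visible latin-1 bytes keep their own code point; everything else is shifted
    // into the 256+ range so each byte maps to a printable character.
    val keptBytes: Seq[Int] =
      ('!'.toInt to '~'.toInt) ++ ('¡'.toInt to '¬'.toInt) ++ ('®'.toInt to 'ÿ'.toInt)

    val mapping: Map[Int, Char] = {
      var shift = 0
      (0 until 256).map { b =>
        if (keptBytes.contains(b)) b -> b.toChar
        else {
          val c = (256 + shift).toChar
          shift += 1
          b -> c
        }
      }.toMap
    }

    def toUnicode(text: String): String =
      text.getBytes("UTF-8").map(b => mapping(b & 0xff)).mkString

    println(toUnicode(" 吳天恩")) // ĠåĲ³å¤©æģ©
  }

Running it prints ĠåĲ³å¤©æģ©, the exact characters the test places in its vocabulary, which is why none of the encoded pieces fall back to the unknown token.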