Convert to Qbeast #152

Merged 79 commits on Jan 27, 2023
Commits
8a61514
First naive implementation of convert to qbeast
osopardo1 May 10, 2022
99a823b
Add headers
osopardo1 May 11, 2022
f8bd3d0
Merge branch 'main' into convert-to-qbeast
osopardo1 Sep 8, 2022
feb0754
Change RunnableCommand to LeafRunnableCommand
osopardo1 Sep 8, 2022
5d4e0bc
Access deltaLog/snapshot and modify log
Jiaweihu08 Sep 9, 2022
51c5bb6
Add transformers to revision
Jiaweihu08 Sep 12, 2022
9a2236b
Converted tables should be readable using both formats
Jiaweihu08 Sep 12, 2022
76219b3
Add AddFiles with qbeast metadata using append write mode
Jiaweihu08 Sep 20, 2022
d3d472d
Check input format
Jiaweihu08 Sep 20, 2022
8f5ffe2
Simplify test
Jiaweihu08 Sep 20, 2022
ac90d08
Refactor code
Jiaweihu08 Sep 20, 2022
b4d1466
Add method for parquet to delta conversion
Jiaweihu08 Sep 20, 2022
471cb06
Reformat tests
Jiaweihu08 Sep 20, 2022
b22afc0
Separate metadata tag computation
Jiaweihu08 Sep 20, 2022
1de47a8
Extract record count from parquet file metadata
Jiaweihu08 Sep 21, 2022
d435fc7
Add test for parquet to qbeast conversion, test index metrics resulte…
Jiaweihu08 Sep 21, 2022
0d26406
Test String data type
Jiaweihu08 Sep 21, 2022
6f92dcc
Make command idempotent
Jiaweihu08 Sep 21, 2022
0208678
Test command idempotence
Jiaweihu08 Sep 21, 2022
909bc11
Test converting a partitioned delta table
Jiaweihu08 Sep 21, 2022
c30c6e8
Use global path parameter
Jiaweihu08 Sep 21, 2022
304c991
Consider cases where the provided fileFormat doesn't match with real …
Jiaweihu08 Sep 22, 2022
72e40ae
Add format inference, conversion for partitioned parquet files, infe…
Jiaweihu08 Sep 22, 2022
f009a54
Add comment
Jiaweihu08 Sep 22, 2022
daf4454
Test extraction of numRecords when AddFile stats is corrupted
Jiaweihu08 Sep 22, 2022
c8efff5
Remove unnecessary assertions
Jiaweihu08 Sep 22, 2022
389401d
Spark data type name extraction for only supported partition data types
Jiaweihu08 Sep 23, 2022
1a4e5d5
Test supported partition data types
Jiaweihu08 Sep 23, 2022
f5858f9
Secure return type
Jiaweihu08 Sep 23, 2022
02e9100
Use wider default column min max to avoid out-of-scope points
Jiaweihu08 Sep 23, 2022
c1c16d5
Add tests for Analyze, and Compaction on a converted qbeast table
Jiaweihu08 Sep 23, 2022
25a3956
Use table path from tableID
Jiaweihu08 Sep 23, 2022
56baac9
Add test for optimization on a converted table
Jiaweihu08 Sep 23, 2022
6171063
Remove command
Jiaweihu08 Sep 28, 2022
486711f
Add staging files during read
Jiaweihu08 Jan 16, 2023
116ae77
WIP, Qbeast format compatibility with delta lake
Jiaweihu08 Jan 18, 2023
e48732a
WIP, Use all columns for staging sampling
Jiaweihu08 Jan 19, 2023
3338c90
Merge branch 'main' into convert-to-qbeast
Jiaweihu08 Jan 20, 2023
fbb84c0
Rely on user input for source format, direct metadata update
Jiaweihu08 Jan 20, 2023
1da9bff
Add comment
Jiaweihu08 Jan 23, 2023
e518465
Merge to main
Jiaweihu08 Jan 23, 2023
6e00d91
Use the indexing columns from the latest revision for online sampling
Jiaweihu08 Jan 23, 2023
df27cd3
Use EmptyTransformation/ers for the conversion revision
Jiaweihu08 Jan 23, 2023
a4c8919
Test Conversion command. Analyze command should not change ANNOUNCED …
Jiaweihu08 Jan 23, 2023
9a36b1c
Proper merge method for EmptyTransformation
Jiaweihu08 Jan 23, 2023
951b68e
Is a qbeast table if it is converted or written using qbeast format
Jiaweihu08 Jan 23, 2023
9f4df90
Use conversion revisionID(0) as staging revisionID
Jiaweihu08 Jan 23, 2023
d3b1cfd
ConvertToQbeast can add conversion/staging revision to a qbeast table
Jiaweihu08 Jan 23, 2023
770b49f
Discard staging revision via the creation of EmptySpace
Jiaweihu08 Jan 24, 2023
9d7bb4f
Separate staging cube statuses computation
Jiaweihu08 Jan 24, 2023
371b060
Add staging revision during first writes
Jiaweihu08 Jan 24, 2023
a2218ff
Add staging revision during first write, idempotent conversion
Jiaweihu08 Jan 24, 2023
08c4aaf
Remove staging revision from loadRevisionAt
Jiaweihu08 Jan 24, 2023
1ff0dd2
Allow the creation of empty cubes for the staging revision
Jiaweihu08 Jan 24, 2023
08395d1
Compaction with no dataChange
Jiaweihu08 Jan 24, 2023
b009c37
Include staging revision and use initial revisionID = 1 for test
Jiaweihu08 Jan 24, 2023
7b976a2
Check for staging revision via Transformer type
Jiaweihu08 Jan 24, 2023
85a6fcd
Avoid appending on the staging revision
Jiaweihu08 Jan 24, 2023
abc6ea3
Complete conversion test
Jiaweihu08 Jan 24, 2023
a353645
Complete hybrid table test
Jiaweihu08 Jan 24, 2023
4c4ae59
Code comment
Jiaweihu08 Jan 25, 2023
44245df
Update documentation to include the staging revision and the ConvertT…
Jiaweihu08 Jan 25, 2023
5895f97
Optimize imports
Jiaweihu08 Jan 25, 2023
2c6711d
Check total num revisions after append on a converted table
Jiaweihu08 Jan 25, 2023
ab91ff0
Place staging revision utils under RevisionUtils
Jiaweihu08 Jan 25, 2023
d550497
Test EmptyTransformer/ation
Jiaweihu08 Jan 25, 2023
6bc7ceb
Add comment and remove unnecessary changes
Jiaweihu08 Jan 25, 2023
a7cbe6e
Move params and methods to RevisionUtils
Jiaweihu08 Jan 25, 2023
b01e806
isStagingFile is specific to QbeastSnapshot
Jiaweihu08 Jan 25, 2023
ba00ec4
Correct typo
Jiaweihu08 Jan 25, 2023
8969983
Fix loadRevisionAt
Jiaweihu08 Jan 25, 2023
d61538f
'Remove' partitioned table conversion support
Jiaweihu08 Jan 26, 2023
5620d25
Create exception message object
Jiaweihu08 Jan 26, 2023
c64fb5c
Update documentation
Jiaweihu08 Jan 26, 2023
f089c50
Test table identifier format
Jiaweihu08 Jan 26, 2023
b69320f
Test loadRevisionAt with invalid timestamp
Jiaweihu08 Jan 26, 2023
60d36aa
Skip Analyze and Optimize for the staging RevisionID
Jiaweihu08 Jan 26, 2023
57cacd4
Use AnalysisException, more test for EmptyTransformer
Jiaweihu08 Jan 26, 2023
1332bc9
Update metadata in MetadataManager
Jiaweihu08 Jan 27, 2023
11 changes: 11 additions & 0 deletions core/src/main/scala/io/qbeast/core/model/MetadataManager.scala
@@ -8,6 +8,7 @@ import io.qbeast.IISeq
* @tparam FileDescriptor type of file descriptor
*/
trait MetadataManager[DataSchema, FileDescriptor] {
type Configuration = Map[String, String]

/**
* Gets the Snapshot for a given table
@@ -33,6 +34,16 @@ trait MetadataManager[DataSchema, FileDescriptor] {
def updateWithTransaction(tableID: QTableID, schema: DataSchema, append: Boolean)(
writer: => (TableChanges, IISeq[FileDescriptor])): Unit

/**
* Updates the table metadata by overwriting the metadata configurations
* with the provided key-value pairs.
* @param tableID QTableID
* @param schema table schema
* @param update configurations used to overwrite the existing metadata
*/
def updateMetadataWithTransaction(tableID: QTableID, schema: DataSchema)(
update: => Configuration): Unit

/**
* Updates the Revision with the given RevisionChanges
* @param tableID the QTableID
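A hypothetical caller of the new hook might look like this. This is a sketch, assuming the Spark/Delta implementation binds `DataSchema = StructType` and `FileDescriptor = FileAction`; the helper name and the configuration key are illustrative, not part of this PR:

```scala
import io.qbeast.core.model.{MetadataManager, QTableID}
import org.apache.spark.sql.delta.actions.FileAction
import org.apache.spark.sql.types.StructType

// Hypothetical helper: persist a serialized revision through the new hook.
def writeConversionMetadata(
    manager: MetadataManager[StructType, FileAction],
    tableID: QTableID,
    schema: StructType,
    revisionJson: String): Unit = {
  // The by-name `update` block is evaluated inside the transaction and
  // returns the key-value pairs that overwrite the existing configuration.
  manager.updateMetadataWithTransaction(tableID, schema) {
    Map("qbeast.revision.0" -> revisionJson) // illustrative key
  }
}
```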
46 changes: 41 additions & 5 deletions core/src/main/scala/io/qbeast/core/model/RevisionClasses.scala
Expand Up @@ -4,7 +4,8 @@ import com.fasterxml.jackson.annotation.{JsonCreator, JsonValue}
import com.fasterxml.jackson.databind.annotation.JsonSerialize
import com.fasterxml.jackson.databind.annotation.JsonSerialize.Typing
import io.qbeast.IISeq
import io.qbeast.core.transform.{Transformation, Transformer}
import io.qbeast.core.model.RevisionUtils.stagingID
import io.qbeast.core.transform.{EmptyTransformer, Transformation, Transformer}

import scala.collection.immutable.SortedMap

@@ -60,11 +61,46 @@
desiredCubeSize,
columnTransformers,
Vector.empty)
}

/**
* Initialize Revision for table conversion. The RevisionID for a converted table is 0.
* EmptyTransformers and EmptyTransformations are used. This Revision should always be
* superseded.
*/
def emptyRevision(
tableID: QTableID,
desiredCubeSize: Int,
columnsToIndex: Seq[String]): Revision = {
val emptyTransformers = columnsToIndex.map(s => EmptyTransformer(s)).toIndexedSeq
val emptyTransformations = emptyTransformers.map(_.makeTransformation(r => r))

Revision(
stagingID,
System.currentTimeMillis(),
tableID,
desiredCubeSize,
emptyTransformers,
emptyTransformations)
}

}

object RevisionUtils {
val stagingID: RevisionID = 0

def isStaging(revisionID: RevisionID): Boolean =
revisionID == stagingID

def isStaging(revision: Revision): Boolean =
isStaging(revision.revisionID) &&
revision.columnTransformers.forall {
case _: EmptyTransformer => true
case _ => false
}

}

/**
* A revision of a QTable.
* @param revisionID the identifier of the revision
@@ -89,8 +125,7 @@ final case class Revision(
assert(columnTransformers != null || transformations != null)

/**
* *
* Controls that the this revision indexes all and only the provided columns.
* Controls that this revision indexes all and only the provided columns.
*
* @param columnsToIndex the column names to check.
* @return true if the revision indexes all and only the provided columns.
@@ -117,7 +152,7 @@

/**
* returns the normalized values
* @param values
* @param values row values for the indexing columns
* @return the normalized values
*/
def transform(values: IISeq[_]): IISeq[Double] = {
@@ -193,8 +228,9 @@ case class IndexStatus(
cubesStatuses: SortedMap[CubeId, CubeStatus] = SortedMap.empty)
extends Serializable {

def addAnnouncements(newAnnouncedSet: Set[CubeId]): IndexStatus =
def addAnnouncements(newAnnouncedSet: Set[CubeId]): IndexStatus = {
copy(announcedSet = announcedSet ++ newAnnouncedSet)
}

def cubesToOptimize: Set[CubeId] = announcedSet.diff(replicatedSet)

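A minimal sketch of the new staging helpers, assuming a `QTableID` can be built from a path string; the path, cube size, and column names are placeholders:

```scala
import io.qbeast.core.model.{QTableID, Revision, RevisionUtils}

val tableID = new QTableID("/pathToTable/") // placeholder path
val staging: Revision =
  Revision.emptyRevision(tableID, desiredCubeSize = 50000, Seq("col1", "col2"))

// Both checks hold: the staging RevisionID is 0, and every
// column transformer of the staging revision is an EmptyTransformer.
RevisionUtils.isStaging(staging.revisionID) // true
RevisionUtils.isStaging(staging) // true
```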
13 changes: 13 additions & 0 deletions core/src/main/scala/io/qbeast/core/transform/EmptyTransformation.scala
@@ -0,0 +1,13 @@
package io.qbeast.core.transform

/**
* An empty Transformation meant for empty revisions
*/
case class EmptyTransformation() extends Transformation {

override def transform(value: Any): Double = 0d

override def isSupersededBy(newTransformation: Transformation): Boolean = true

override def merge(other: Transformation): Transformation = other
}
22 changes: 22 additions & 0 deletions core/src/main/scala/io/qbeast/core/transform/EmptyTransformer.scala
@@ -0,0 +1,22 @@
package io.qbeast.core.transform

import io.qbeast.core.model.QDataType

object EmptyTransformer extends TransformerType {
override def transformerSimpleName: String = "empty"

override def apply(columnName: String, dataType: QDataType): Transformer =
EmptyTransformer(columnName)

}

/**
* An empty Transformer meant for empty revisions
*/
case class EmptyTransformer(columnName: String) extends Transformer {
override protected def transformerType: TransformerType = EmptyTransformer

override def stats: ColumnStats = NoColumnStats

override def makeTransformation(row: String => Any): Transformation = EmptyTransformation()
}
35 changes: 35 additions & 0 deletions core/src/test/scala/io/qbeast/core/transform/EmptyTransformationTest.scala
@@ -0,0 +1,35 @@
package io.qbeast.core.transform

import io.qbeast.core.model.DoubleDataType
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class EmptyTransformationTest extends AnyFlatSpec with Matchers {
it should "always map to the same value" in {
val t = EmptyTransformation()

(1 to 100).foreach { i =>
t.transform(i) shouldBe 0d
}

t.transform(null) shouldBe 0d
}

it should "be superseded by another Transformation" in {
val et = EmptyTransformation()
val ht = HashTransformation()
val lt = LinearTransformation(1d, 1.1, DoubleDataType)

et.isSupersededBy(ht) shouldBe true
et.isSupersededBy(lt) shouldBe true
}

it should "return the other Transformation when merging" in {
val et = EmptyTransformation()
val ht = HashTransformation()
val lt = LinearTransformation(1d, 1.1, DoubleDataType)

et.merge(ht) shouldBe ht
et.merge(lt) shouldBe lt
}
}
core/src/test/scala/io/qbeast/core/transform/TransformerTest.scala
@@ -1,8 +1,9 @@
package io.qbeast.core.transform

import io.qbeast.core.model.{DateDataType, IntegerDataType, TimestampDataType}
import io.qbeast.core.model.{DateDataType, IntegerDataType, StringDataType, TimestampDataType}
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

import java.sql.{Date, Timestamp}

class TransformerTest extends AnyFlatSpec with Matchers {
@@ -94,4 +95,14 @@ class TransformerTest extends AnyFlatSpec with Matchers {

transformer.maybeUpdateTransformation(currentTransformation, transformation) shouldBe None
}

"An EmptyTransformer" should "create an EmptyTransformation without stats" in {
EmptyTransformer.transformerSimpleName shouldBe "empty"

val colName = "a"
val transformer = EmptyTransformer(colName, StringDataType)

val transformation = transformer.makeTransformation(r => r)
transformation shouldBe a[EmptyTransformation]
}
}
30 changes: 30 additions & 0 deletions docs/QbeastFormat.md
@@ -153,6 +153,36 @@ In Revision, you can find different information about the tree status and configuration

In this case, we index columns `user_id` and `product_id` (both `Integer`s) with a linear transformation. This means they will not undergo any transformation besides normalization.

### Staging Revision and ConvertToQbeastCommand
The introduction of the staging revision enables reading tables in a hybrid `qbeast + delta` state.
The non-qbeast `AddFile`s are considered part of this staging revision, all belonging to the root.

Its RevisionID is fixed to `stagingID = 0`, and it has `EmptyTransformer`s and `EmptyTransformation`s.
It is automatically created during the first write or when overwriting a table using qbeast.
For a table that is entirely written in `delta` or `parquet`, we can use the `ConvertToQbeastCommand` to create this revision:
```scala
import io.qbeast.spark.internal.commands.ConvertToQbeastCommand

val path = "/pathToTable/"
val tableIdentifier = s"parquet.`$path`"
val columnsToIndex = Seq("col1", "col2", "col3")
val desiredCubeSize = 50000

ConvertToQbeastCommand(tableIdentifier, columnsToIndex, desiredCubeSize).run(spark)

val qTable = spark.read.format("qbeast").load(path)
```
By doing so, we also enable subsequent appends using either delta or qbeast, as sketched below.
Conversion of a partitioned table is not supported.
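Subsequent appends in either format might look like this (a sketch; `df` is any DataFrame matching the table schema, and `path` is the table location from above):

```scala
// Qbeast append: the new data is indexed under a regular (non-staging) revision.
df.write.mode("append").format("qbeast").save(path)

// Delta append: the new files become part of the staging revision.
df.write.mode("append").format("delta").save(path)
```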

`Compaction` can be performed on the staging revision to group small delta files:
```scala
import io.qbeast.spark.QbeastTable

val table = QbeastTable.forPath(spark, "/pathToTable/")
table.compact(0)
```
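Because staging files are exposed through the staging revision, reading and sampling a hybrid table needs no special handling. A sketch (how exactly staging files participate in sampling is internal to this PR):

```scala
val hybrid = spark.read.format("qbeast").load("/pathToTable/")
hybrid.sample(0.1).count() // staging (delta) files are included in the sample
```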

### State changes in MetaData

**Data de-normalization** is a crucial component behind our multi-dimensional index. Instead of storing an index in a separate tree-like data structure, we reorganize the data and their replications in an `OTree`, whose **hierarchical structure** is the actual index.
Expand Down
29 changes: 19 additions & 10 deletions src/main/scala/io/qbeast/spark/QbeastTable.scala
@@ -4,6 +4,7 @@
package io.qbeast.spark

import io.qbeast.context.QbeastContext
import io.qbeast.core.model.RevisionUtils.isStaging
import io.qbeast.core.model.{CubeId, CubeStatus, QTableID, RevisionID}
import io.qbeast.spark.delta.DeltaQbeastSnapshot
import io.qbeast.spark.internal.commands.{
@@ -56,13 +57,17 @@
* If it doesn't exist or none is specified, the latest available revision is used
*/
def optimize(revisionID: RevisionID): Unit = {
checkRevisionAvailable(revisionID)
OptimizeTableCommand(revisionID, indexedTable)
.run(sparkSession)
if (!isStaging(revisionID)) {
checkRevisionAvailable(revisionID)
OptimizeTableCommand(revisionID, indexedTable)
.run(sparkSession)
}
}

def optimize(): Unit = {
optimize(latestRevisionAvailableID)
if (!isStaging(latestRevisionAvailableID)) {
optimize(latestRevisionAvailableID)
}
}

/**
@@ -73,14 +78,18 @@
* @return the sequence of cubes to optimize in string representation
*/
def analyze(revisionID: RevisionID): Seq[String] = {
checkRevisionAvailable(revisionID)
AnalyzeTableCommand(revisionID, indexedTable)
.run(sparkSession)
.map(_.getString(0))
if (isStaging(revisionID)) Seq.empty
else {
checkRevisionAvailable(revisionID)
AnalyzeTableCommand(revisionID, indexedTable)
.run(sparkSession)
.map(_.getString(0))
}
}

def analyze(): Seq[String] = {
analyze(latestRevisionAvailableID)
if (isStaging(latestRevisionAvailableID)) Seq.empty
else analyze(latestRevisionAvailableID)
}

/**
@@ -103,7 +112,7 @@
val allCubeStatuses = qbeastSnapshot.loadLatestIndexStatus.cubesStatuses

val cubeCount = allCubeStatuses.size
val depth = allCubeStatuses.map(_._1.depth).max
val depth = if (cubeCount == 0) 0 else allCubeStatuses.map(_._1.depth).max
val rowCount = allCubeStatuses.flatMap(_._2.files.map(_.elementCount)).sum

val dimensionCount = indexedColumns().size
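Taken together, the new guards turn staging-revision operations into no-ops instead of errors; `Compaction` remains available. A sketch:

```scala
import io.qbeast.spark.QbeastTable

val table = QbeastTable.forPath(spark, "/pathToTable/")
table.analyze(0) // Seq.empty: nothing to analyze in the staging revision
table.optimize(0) // no-op: optimization skips the staging revision
table.compact(0) // allowed: groups small staging (delta) files
```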
29 changes: 21 additions & 8 deletions src/main/scala/io/qbeast/spark/delta/DeltaMetadataWriter.scala
@@ -5,14 +5,9 @@ package io.qbeast.spark.delta

import io.qbeast.core.model.{QTableID, RevisionID, TableChanges}
import io.qbeast.spark.delta.writer.StatsTracker.registerStatsTrackers
import io.qbeast.spark.utils.QbeastExceptionMessages.partitionedTableExceptionMsg
import io.qbeast.spark.utils.TagColumns
import org.apache.spark.sql.delta.actions.{
Action,
AddFile,
FileAction,
RemoveFile,
SetTransaction
}
import org.apache.spark.sql.delta.actions._
import org.apache.spark.sql.delta.commands.DeltaCommand
import org.apache.spark.sql.delta.{DeltaLog, DeltaOperations, DeltaOptions, OptimisticTransaction}
import org.apache.spark.sql.execution.datasources.{
@@ -84,6 +79,24 @@ private[delta] case class DeltaMetadataWriter(
}
}

def updateMetadataWithTransaction(update: => Configuration): Unit = {
deltaLog.withNewTransaction { txn =>
if (txn.metadata.partitionColumns.nonEmpty) {
throw AnalysisExceptionFactory.create(partitionedTableExceptionMsg)
}

val config = update
val updatedConfig = config.foldLeft(txn.metadata.configuration) { case (accConf, (k, v)) =>
accConf.updated(k, v)
}
val updatedMetadata = txn.metadata.copy(configuration = updatedConfig)

val op = DeltaOperations.SetTableProperties(config)
txn.updateMetadata(updatedMetadata)
txn.commit(Seq.empty, op)
}
}

private def updateReplicatedFiles(tableChanges: TableChanges): Seq[Action] = {

val revision = tableChanges.updatedRevision
@@ -168,7 +181,7 @@
addFiles.map(_.copy(dataChange = !rearrangeOnly)) ++
deletedFiles.map(_.copy(dataChange = !rearrangeOnly))
} else {
newFiles ++ deletedFiles
addFiles ++ deletedFiles
}

if (isOptimizeOperation) {
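The merge semantics of the configuration fold above: keys in `update` are added or overwrite existing entries, and everything else is preserved. A self-contained illustration (keys are examples, not from this PR):

```scala
val existing = Map("k1" -> "a", "k2" -> "b")
val update = Map("k2" -> "B", "k3" -> "c")

// Same fold as in updateMetadataWithTransaction.
val merged = update.foldLeft(existing) { case (acc, (k, v)) => acc.updated(k, v) }
// merged == Map("k1" -> "a", "k2" -> "B", "k3" -> "c")
```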