Skip to content

Commit

Permalink
Move bounding to the Transformation
Browse files Browse the repository at this point in the history
  • Loading branch information
osopardo1 committed Jan 21, 2025
1 parent 86f8189 commit c223a0e
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ case class LinearTransformation(
1.0 / (mx - mn)
}

/**
* The LinearTransformation is bounded by min and max values
* @return
*/
override def bounded: Boolean = true

override def transform(value: Any): Double = {
val v = if (value == null) nullValue else value
v match {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ case class LinearTransformer(columnName: String, dataType: QDataType) extends Tr
}
}

override def isBounded: Boolean = true

override def stats: ColumnStats =
ColumnStats(
names = Seq(colMin, colMax),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ import java.time.Instant
property = "className")
trait Transformation extends Serializable {

/**
* Returns if the transformation is bounded within the space f.e: LinearTransformer is bounded
* with min and max (values outside the ranges can cause Indexing Errors), HashTransformer is
* not, because the transformation function is a Hash with a fixed range
*
* @return
*/
def bounded: Boolean = false

/**
* Normalize an input value to a Double between 0 and 1.
* @param value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,15 +156,6 @@ trait Transformer extends Serializable {

def spec: String = s"$columnName:${transformerType.transformerSimpleName}"

/**
* Returns if the transformation is bounded within the space
* f.e:
* LinearTransformer is bounded with min and max (values outside the ranges can cause Indexing Errors),
* HashTransformer is not, because the transformation function is a Hash with a fixed range
* @return
*/
def isBounded: Boolean = false

}

trait ColumnStats extends Serializable {
Expand Down
18 changes: 11 additions & 7 deletions core/src/main/scala/io/qbeast/spark/index/OTreeDataAnalyzer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -232,11 +232,16 @@ object DoublePassOTreeDataAnalyzer
dataFrame: DataFrame,
revision: Revision): Unit = {
// Check if the DataFrame and the Columns To Index are deterministic
val columnsToAnalyze =
revision.columnTransformers.filter(_.isBounded).map(_.columnName)
if (columnsToAnalyze.nonEmpty) {
logDebug(s"Some columnsToIndex need to come from a Deterministic Source: {${columnsToAnalyze
.mkString(",")}}. Checking the determinism of the input data")
val boundedColumnTransformations = revision.columnTransformers
.zip(revision.transformations)
.collect {
case (columnTransformer, transformation) if transformation.bounded =>
columnTransformer.columnName
}
if (boundedColumnTransformations.nonEmpty) {
logDebug(
s"Some columns to index need to come from a Deterministic Source: {${boundedColumnTransformations
.mkString(",")}}. Checking the determinism of the input data")

// Detect if the DataFrame's operations are deterministic
val isPlanDeterministic: Boolean = isDataFramePlanDeterministic(dataFrame)
Expand All @@ -252,8 +257,7 @@ object DoublePassOTreeDataAnalyzer
s"3. save the DF as delta and Convert it To Qbeast in a second step")
}
} else {
logDebug(
s"No columnsToIndex need to come from a Deterministic Source. Skipping determinism check.")
logDebug(s"No bounded columns to index. Skipping determinism check.")
}
}

Expand Down

0 comments on commit c223a0e

Please sign in to comment.