forked from Qbeast-io/qbeast-spark
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Issue Qbeast-io#418: Abstract RollupDataWriter and QbeastStats (Qbeast-io#423)
* Update rollup data writer * Abstract QbeastStats * Update QbeastStats to use Number * Update QbeastStats to use String * Update QbeastStats * Update QbeastStats Delta
- Loading branch information
1 parent
1e1baeb
commit 566b9cd
Showing
23 changed files
with
264 additions
and
92 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
/* | ||
* Copyright 2021 Qbeast Analytics, S.L. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.qbeast.core.model | ||
|
||
/**
 * Immutable container of statistics values.
 *
 * NOTE(review): the field names match the keys of Delta's file-level statistics JSON, so this
 * presumably mirrors that structure, with the map keys being column names — confirm.
 *
 * @param numRecords
 *   the record count
 * @param minValues
 *   minimum values, kept in their textual form, indexed by a String key
 * @param maxValues
 *   maximum values, kept in their textual form, indexed by a String key
 * @param nullCount
 *   null counts indexed by a String key
 */
case class QbeastStats(
    numRecords: Long,
    minValues: Map[String, String],
    maxValues: Map[String, String],
    nullCount: Map[String, Int])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
105 changes: 105 additions & 0 deletions
105
src/main/scala/io/qbeast/spark/delta/DeltaRollupDataWriter.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
/* | ||
* Copyright 2021 Qbeast Analytics, S.L. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.qbeast.spark.delta | ||
|
||
import io.qbeast.core.model._ | ||
import io.qbeast.spark.index.QbeastColumns | ||
import io.qbeast.spark.writer.RollupDataWriter | ||
import io.qbeast.spark.writer.StatsTracker | ||
import io.qbeast.spark.writer.TaskStats | ||
import io.qbeast.IISeq | ||
import org.apache.hadoop.fs.Path | ||
import org.apache.spark.sql.catalyst.InternalRow | ||
import org.apache.spark.sql.delta.actions.AddFile | ||
import org.apache.spark.sql.delta.stats.DeltaFileStatistics | ||
import org.apache.spark.sql.delta.stats.DeltaJobStatisticsTracker | ||
import org.apache.spark.sql.delta.DeltaStatsCollectionUtils | ||
import org.apache.spark.sql.execution.datasources.BasicWriteTaskStats | ||
import org.apache.spark.sql.execution.datasources.WriteJobStatsTracker | ||
import org.apache.spark.sql.execution.datasources.WriteTaskStats | ||
import org.apache.spark.sql.types.StructType | ||
import org.apache.spark.sql.DataFrame | ||
|
||
import java.net.URI | ||
|
||
/**
 * Delta implementation of DataWriter that applies rollup to compact the files.
 *
 * Besides delegating the actual write to `internalWrite`, this object forwards the collected
 * task statistics to the job trackers and, when a Delta statistics tracker is available,
 * copies the statistics it recorded into the returned files.
 */
object DeltaRollupDataWriter extends RollupDataWriter with DeltaStatsCollectionUtils {

  // Concrete definitions of the type members declared by the parent traits.
  override type GetCubeMaxWeight = CubeId => Weight
  override type Extract = InternalRow => (InternalRow, Weight, CubeId, CubeId)
  override type WriteRows = Iterator[InternalRow] => Iterator[(IndexFile, TaskStats)]

  /**
   * Writes the data and returns the written index files.
   *
   * @param tableId
   *   the table identifier
   * @param schema
   *   the schema handed over to `internalWrite`
   * @param data
   *   the data to write
   * @param tableChanges
   *   the table changes; the updated revision provides the number of dimensions
   * @return
   *   the written files, after a round trip through Delta's `AddFile` in which the statistics
   *   recorded by the Delta tracker (if any) are applied
   */
  override def write(
      tableId: QTableID,
      schema: StructType,
      data: DataFrame,
      tableChanges: TableChanges): IISeq[IndexFile] = {
    val dimensionCount = tableChanges.updatedRevision.transformations.length

    val basicTrackers = StatsTracker.getStatsTrackers
    val deltaTracker = getFileStatsTracker(tableId, data)
    val allTrackers = basicTrackers ++ deltaTracker

    val written = internalWrite(tableId, schema, data, tableChanges, allTrackers)

    // Report the task statistics to the trackers before building the result.
    val taskStats = written.map { case (_, stats) => stats }
    processStats(taskStats, basicTrackers, deltaTracker)

    // IndexFile -> AddFile -> AddFile with the recorded stats -> IndexFile
    val indexFiles = written.map { case (indexFile, _) => indexFile }
    val addFiles = indexFiles.map(QbeastFileUtils.toAddFile(dataChange = true))
    val addFilesWithStats = addFiles.map(correctAddFileStats(deltaTracker))
    addFilesWithStats.map(QbeastFileUtils.fromAddFile(dimensionCount))
  }

  /**
   * Builds the optional Delta statistics tracker for the given data, considering only the
   * columns which are not reported as Qbeast columns by `QbeastColumns.contains`.
   */
  private def getFileStatsTracker(
      tableId: QTableID,
      data: DataFrame): Option[DeltaJobStatisticsTracker] = {
    val userColumns = data.schema.map(_.name).filter(name => !QbeastColumns.contains(name))
    val userData = data.selectExpr(userColumns: _*)
    getDeltaOptionalTrackers(userData, data.sparkSession, tableId)
  }

  /**
   * Splits the task statistics by concrete type and hands them to the matching trackers:
   * `BasicWriteTaskStats` go to every tracker in `statsTrackers`, `DeltaFileStatistics` go to
   * `fileStatsTracker` when it is defined. The time passed to the trackers is the greatest
   * `endTime` among the tasks, or 0 if there are no tasks.
   */
  private def processStats(
      stats: IISeq[TaskStats],
      statsTrackers: Seq[WriteJobStatsTracker],
      fileStatsTracker: Option[DeltaJobStatisticsTracker]): Unit = {
    val allTaskStats: Seq[WriteTaskStats] = stats.flatMap(_.writeTaskStats)
    val basicStats: Seq[WriteTaskStats] =
      allTaskStats.collect { case basic: BasicWriteTaskStats => basic }
    val fileStats: Seq[WriteTaskStats] =
      allTaskStats.collect { case delta: DeltaFileStatistics => delta }
    val endTime = stats.foldLeft(0L)((latest, task) => math.max(latest, task.endTime))

    statsTrackers.foreach(_.processStats(basicStats, endTime))
    fileStatsTracker.foreach(_.processStats(fileStats, endTime))
  }

  /**
   * Returns a copy of the file whose `stats` are the ones recorded by the tracker for the
   * path of the file, or the file itself when there is no tracker.
   *
   * NOTE(review): `recordedStats(path)` presumably fails if the tracker has no entry for the
   * path — confirm that every written file is always recorded.
   */
  private def correctAddFileStats(fileStatsTracker: Option[DeltaJobStatisticsTracker])(
      file: AddFile): AddFile = {
    // The lookup key is the file path parsed as a URI and rendered through a Hadoop Path;
    // presumably this matches the key format used by the tracker — confirm.
    val path = new Path(new URI(file.path)).toString
    fileStatsTracker match {
      case Some(tracker) => file.copy(stats = tracker.recordedStats(path))
      case None => file
    }
  }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
94 changes: 94 additions & 0 deletions
94
src/main/scala/io/qbeast/spark/delta/QbeastStatsUtils.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
/* | ||
* Copyright 2021 Qbeast Analytics, S.L. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.qbeast.spark.delta | ||
|
||
import com.fasterxml.jackson.core.JsonGenerator | ||
import com.fasterxml.jackson.core.JsonParseException | ||
import com.fasterxml.jackson.core.JsonParser | ||
import com.fasterxml.jackson.databind.module.SimpleModule | ||
import com.fasterxml.jackson.databind.DeserializationContext | ||
import com.fasterxml.jackson.databind.JsonDeserializer | ||
import com.fasterxml.jackson.databind.JsonMappingException | ||
import com.fasterxml.jackson.databind.JsonNode | ||
import com.fasterxml.jackson.databind.JsonSerializer | ||
import com.fasterxml.jackson.databind.SerializerProvider | ||
import com.fasterxml.jackson.module.scala.ClassTagExtensions | ||
import io.qbeast.core.model.mapper | ||
import io.qbeast.core.model.QbeastStats | ||
|
||
/**
 * Conversions between [[QbeastStats]] and its JSON representation.
 *
 * When this object is initialized it registers [[ValueSerializer]] and [[ValueDeserializer]]
 * for `String` on the `mapper` imported from `io.qbeast.core.model`.
 */
object QbeastStatsUtils {

  // NOTE(review): the module is registered on the imported `mapper` instance, so presumably
  // every other user of that mapper gets the same String handling — confirm it is intended.
  private val module = new SimpleModule()
    .addSerializer(classOf[String], new ValueSerializer)
    .addDeserializer(classOf[String], new ValueDeserializer)
  mapper.registerModule(module)

  /**
   * Parses the given JSON text into a [[QbeastStats]].
   *
   * @param jsonString
   *   the JSON text
   * @return
   *   the parsed statistics, or None if any exception is thrown while reading; in that case a
   *   message describing the failure is printed to the standard output
   */
  def fromString(jsonString: String): Option[QbeastStats] =
    try {
      val parsed = mapper.asInstanceOf[ClassTagExtensions].readValue[QbeastStats](jsonString)
      Some(parsed)
    } catch {
      case e: Exception =>
        // NOTE(review): failures are reported with println; consider a logger instead.
        val message = e match {
          case _: JsonParseException => s"Failed to parse JSON: ${e.getMessage}"
          case _: JsonMappingException => s"Error mapping JSON: ${e.getMessage}"
          case _ => s"An error occurred: ${e.getMessage}"
        }
        println(message)
        None
    }

  /**
   * Renders the given statistics as JSON text using `mapper`.
   *
   * @param qbeastStats
   *   the statistics to render
   * @return
   *   the JSON text
   */
  def toString(qbeastStats: QbeastStats): String = {
    mapper.writeValueAsString(qbeastStats)
  }

}
|
||
/**
 * Serializer which writes a String as a JSON number whenever its content can be parsed as a
 * number, and as a JSON string otherwise.
 *
 * The value is first parsed as a Long, so that integral values outside the Int range are
 * written exactly instead of falling through to the Double branch, where they would be
 * rendered as floating point numbers and could lose precision. Values which are not integral
 * are parsed as Double, and anything else is written as text.
 */
class ValueSerializer extends JsonSerializer[String] {

  /**
   * Writes the value as an integral number, a floating point number or a string, in this
   * order of preference.
   *
   * @param value
   *   the text to write
   * @param gen
   *   the generator receiving the output
   * @param serializers
   *   the serializer provider, not used
   */
  override def serialize(
      value: String,
      gen: JsonGenerator,
      serializers: SerializerProvider): Unit = {
    parseNumber(value)(_.toLong) match {
      case Some(integral) => gen.writeNumber(integral)
      case None =>
        parseNumber(value)(_.toDouble) match {
          case Some(fractional) => gen.writeNumber(fractional)
          case None => gen.writeString(value)
        }
    }
  }

  // Applies the conversion to the text, returning None if the text has no valid number format.
  // Only NumberFormatException is handled, any other failure is propagated to the caller.
  private def parseNumber[A](text: String)(convert: String => A): Option[A] =
    try {
      Some(convert(text))
    } catch {
      case _: NumberFormatException => None
    }

}
|
||
/**
 * Deserializer which reads a JSON number or a JSON string as a String.
 */
class ValueDeserializer extends JsonDeserializer[String] {

  /**
   * Reads the current value as a tree node and returns its textual form.
   *
   * @param p
   *   the parser positioned at the value to read
   * @param ct
   *   the deserialization context, not used
   * @return
   *   the text of the node
   * @throws IllegalArgumentException
   *   if the node is neither a number nor a text
   */
  override def deserialize(p: JsonParser, ct: DeserializationContext): String = {
    val tree = p.getCodec.readTree[JsonNode](p)
    val supported = tree.isNumber || tree.isTextual
    if (!supported) {
      throw new IllegalArgumentException("Unsupported JSON type for value")
    }
    tree.asText()
  }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.