Skip to content

Commit

Permalink
refactor(dao): Compare packages in a single query
Browse files Browse the repository at this point in the history
The `findByPackage` function of the `PackageDao` compares a `package` to
already stored packages to find any duplicates. The previous
implementation used separate queries for all existing tables, which
could result in a huge amount of queries, especially when many packages
with almost identical metadata are already stored in the database.

The comparison is replaced with a single query that loads all relevant
data, significantly reducing the amount of queries executed when storing
packages.

If the query finds a duplicate, the DAO API is still used to load the
data to simplify the code. This could be further optimized, but for now
the most important change is to improve the performance of finding
duplicates.

Signed-off-by: Martin Nonnenmacher <martin.nonnenmacher@bosch.com>
  • Loading branch information
mnonnenmacher committed Jan 22, 2025
1 parent 59d0a6f commit d837499
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 67 deletions.
111 changes: 90 additions & 21 deletions dao/src/main/kotlin/repositories/analyzerrun/PackagesTable.kt
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,26 @@

package org.eclipse.apoapsis.ortserver.dao.repositories.analyzerrun

import org.eclipse.apoapsis.ortserver.dao.mapAndCompare
import org.eclipse.apoapsis.ortserver.dao.tables.shared.DeclaredLicenseDao
import org.eclipse.apoapsis.ortserver.dao.tables.shared.DeclaredLicensesTable
import org.eclipse.apoapsis.ortserver.dao.tables.shared.IdentifierDao
import org.eclipse.apoapsis.ortserver.dao.tables.shared.IdentifiersTable
import org.eclipse.apoapsis.ortserver.dao.tables.shared.RemoteArtifactDao
import org.eclipse.apoapsis.ortserver.dao.tables.shared.RemoteArtifactsTable
import org.eclipse.apoapsis.ortserver.dao.tables.shared.VcsInfoDao
import org.eclipse.apoapsis.ortserver.dao.tables.shared.VcsInfoTable
import org.eclipse.apoapsis.ortserver.dao.utils.ArrayAggColumnEquals
import org.eclipse.apoapsis.ortserver.dao.utils.ArrayAggTwoColumnsEquals
import org.eclipse.apoapsis.ortserver.dao.utils.SortableEntityClass
import org.eclipse.apoapsis.ortserver.dao.utils.SortableTable
import org.eclipse.apoapsis.ortserver.model.runs.Package

import org.jetbrains.exposed.dao.LongEntity
import org.jetbrains.exposed.dao.id.EntityID
import org.jetbrains.exposed.sql.and
import org.jetbrains.exposed.sql.JoinType
import org.jetbrains.exposed.sql.alias
import org.jetbrains.exposed.sql.andHaving
import org.jetbrains.exposed.sql.andWhere

/**
* A table to represent all metadata for a software package.
Expand All @@ -55,25 +60,89 @@ object PackagesTable : SortableTable("packages") {

class PackageDao(id: EntityID<Long>) : LongEntity(id) {
companion object : SortableEntityClass<PackageDao>(PackagesTable) {
fun findByPackage(pkg: Package): PackageDao? =
// TODO: Implement a more efficient way to check if an identical package already exists.
find {
PackagesTable.purl eq pkg.purl and
(PackagesTable.cpe eq pkg.cpe) and
(PackagesTable.description eq pkg.description) and
(PackagesTable.homepageUrl eq pkg.homepageUrl) and
(PackagesTable.isMetadataOnly eq pkg.isMetadataOnly) and
(PackagesTable.isModified eq pkg.isModified)
}.firstOrNull {
it.identifier.mapToModel() == pkg.identifier &&
mapAndCompare(it.authors, pkg.authors, AuthorDao::name) &&
mapAndCompare(it.declaredLicenses, pkg.declaredLicenses, DeclaredLicenseDao::name) &&
it.processedDeclaredLicense.mapToModel() == pkg.processedDeclaredLicense &&
it.vcs.mapToModel() == pkg.vcs &&
it.vcsProcessed.mapToModel() == pkg.vcsProcessed &&
it.binaryArtifact.mapToModel() == pkg.binaryArtifact &&
it.sourceArtifact.mapToModel() == pkg.sourceArtifact
}
fun findByPackage(pkg: Package): PackageDao? {
val vcsProcessed = VcsInfoTable.alias("vcs_processed_info")
val binaryArtifacts = RemoteArtifactsTable.alias("binary_artifacts")
val sourceArtifacts = RemoteArtifactsTable.alias("source_artifacts")
val query = PackagesTable
.leftJoin(IdentifiersTable)
.join(VcsInfoTable, JoinType.LEFT, onColumn = PackagesTable.vcsId, otherColumn = VcsInfoTable.id)
.join(vcsProcessed, JoinType.LEFT, PackagesTable.vcsProcessedId, vcsProcessed[VcsInfoTable.id])
.join(
binaryArtifacts,
JoinType.LEFT,
PackagesTable.binaryArtifactId,
binaryArtifacts[RemoteArtifactsTable.id]
)
.join(
sourceArtifacts,
JoinType.LEFT,
PackagesTable.sourceArtifactId,
sourceArtifacts[RemoteArtifactsTable.id]
)
.leftJoin(PackagesAuthorsTable)
.leftJoin(AuthorsTable)
.leftJoin(PackagesDeclaredLicensesTable)
.leftJoin(DeclaredLicensesTable)
.leftJoin(ProcessedDeclaredLicensesTable)
.leftJoin(ProcessedDeclaredLicensesMappedDeclaredLicensesTable)
.leftJoin(MappedDeclaredLicensesTable)
.leftJoin(ProcessedDeclaredLicensesUnmappedDeclaredLicensesTable)
.leftJoin(UnmappedDeclaredLicensesTable)
.select(PackagesTable.id)
.where { PackagesTable.purl eq pkg.purl }
.andWhere { PackagesTable.cpe eq pkg.cpe }
.andWhere { PackagesTable.description eq pkg.description }
.andWhere { PackagesTable.homepageUrl eq pkg.homepageUrl }
.andWhere { PackagesTable.isMetadataOnly eq pkg.isMetadataOnly }
.andWhere { PackagesTable.isModified eq pkg.isModified }
.andWhere { IdentifiersTable.type eq pkg.identifier.type }
.andWhere { IdentifiersTable.namespace eq pkg.identifier.namespace }
.andWhere { IdentifiersTable.name eq pkg.identifier.name }
.andWhere { IdentifiersTable.version eq pkg.identifier.version }
.andWhere { VcsInfoTable.type eq pkg.vcs.type.name }
.andWhere { VcsInfoTable.url eq pkg.vcs.url }
.andWhere { VcsInfoTable.revision eq pkg.vcs.revision }
.andWhere { VcsInfoTable.path eq pkg.vcs.path }
.andWhere { vcsProcessed[VcsInfoTable.type] eq pkg.vcsProcessed.type.name }
.andWhere { vcsProcessed[VcsInfoTable.url] eq pkg.vcsProcessed.url }
.andWhere { vcsProcessed[VcsInfoTable.revision] eq pkg.vcsProcessed.revision }
.andWhere { vcsProcessed[VcsInfoTable.path] eq pkg.vcsProcessed.path }
.andWhere { binaryArtifacts[RemoteArtifactsTable.url] eq pkg.binaryArtifact.url }
.andWhere { binaryArtifacts[RemoteArtifactsTable.hashValue] eq pkg.binaryArtifact.hashValue }
.andWhere { binaryArtifacts[RemoteArtifactsTable.hashAlgorithm] eq pkg.binaryArtifact.hashAlgorithm }
.andWhere { sourceArtifacts[RemoteArtifactsTable.url] eq pkg.sourceArtifact.url }
.andWhere { sourceArtifacts[RemoteArtifactsTable.hashValue] eq pkg.sourceArtifact.hashValue }
.andWhere { sourceArtifacts[RemoteArtifactsTable.hashAlgorithm] eq pkg.sourceArtifact.hashAlgorithm }
.andWhere { ProcessedDeclaredLicensesTable.spdxExpression eq pkg.processedDeclaredLicense.spdxExpression }
.groupBy(
PackagesTable.id,
IdentifiersTable.id,
VcsInfoTable.id,
vcsProcessed[VcsInfoTable.id],
binaryArtifacts[RemoteArtifactsTable.id],
sourceArtifacts[RemoteArtifactsTable.id]
)
.having { ArrayAggColumnEquals(AuthorsTable.name, pkg.authors) }
.andHaving { ArrayAggColumnEquals(DeclaredLicensesTable.name, pkg.declaredLicenses) }
.andHaving {
ArrayAggColumnEquals(
UnmappedDeclaredLicensesTable.unmappedLicense,
pkg.processedDeclaredLicense.unmappedLicenses
)
}
.andHaving {
ArrayAggTwoColumnsEquals(
MappedDeclaredLicensesTable.declaredLicense,
MappedDeclaredLicensesTable.mappedLicense,
pkg.processedDeclaredLicense.mappedLicenses
)
}

val id = query.firstOrNull()?.let { it[PackagesTable.id] } ?: return null

return PackageDao[id]
}
}

var identifier by IdentifierDao referencedOn PackagesTable.identifierId
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ import org.eclipse.apoapsis.ortserver.dao.tables.shared.IdentifiersTable
import org.eclipse.apoapsis.ortserver.dao.tables.shared.VcsInfoDao
import org.eclipse.apoapsis.ortserver.dao.tables.shared.VcsInfoTable
import org.eclipse.apoapsis.ortserver.dao.utils.ArrayAggColumnEquals
import org.eclipse.apoapsis.ortserver.dao.utils.ArrayAggNullableColumnEquals
import org.eclipse.apoapsis.ortserver.dao.utils.ArrayAggTwoColumnsEquals
import org.eclipse.apoapsis.ortserver.model.runs.Project

Expand Down Expand Up @@ -88,16 +87,11 @@ class ProjectDao(id: EntityID<Long>) : LongEntity(id) {
.andWhere { vcsProcessed[VcsInfoTable.url] eq project.vcsProcessed.url }
.andWhere { vcsProcessed[VcsInfoTable.revision] eq project.vcsProcessed.revision }
.andWhere { vcsProcessed[VcsInfoTable.path] eq project.vcsProcessed.path }
.andWhere { ProcessedDeclaredLicensesTable.spdxExpression eq project.processedDeclaredLicense.spdxExpression }
.groupBy(ProjectsTable.id, IdentifiersTable.id, VcsInfoTable.id, vcsProcessed[VcsInfoTable.id])
.having { ArrayAggColumnEquals(AuthorsTable.name, project.authors) }
.andHaving { ArrayAggColumnEquals(DeclaredLicensesTable.name, project.declaredLicenses) }
.andHaving { ArrayAggColumnEquals(ProjectScopesTable.name, project.scopeNames) }
.andHaving {
ArrayAggNullableColumnEquals(
ProcessedDeclaredLicensesTable.spdxExpression,
setOf(project.processedDeclaredLicense.spdxExpression.toString())
)
}
.andHaving {
ArrayAggColumnEquals(
UnmappedDeclaredLicensesTable.unmappedLicense,
Expand Down
51 changes: 12 additions & 39 deletions dao/src/main/kotlin/utils/ArrayAggUtils.kt
Original file line number Diff line number Diff line change
Expand Up @@ -40,48 +40,21 @@ class ArrayAggColumnEquals(
) : Op<Boolean>() {
@OptIn(ExperimentalEncodingApi::class)
override fun toQueryBuilder(queryBuilder: QueryBuilder) = queryBuilder {
val joinedValue = if (value.isEmpty()) {
"NULL"
if (value.isEmpty()) {
append("ARRAY_AGG(DISTINCT ")
append("ENCODE(CONVERT_TO(", column, ", 'UTF8'), 'base64')")
append(") = ARRAY[NULL]")
} else {
value.joinToString { "'${Base64.encode(it.toByteArray())}'" }
}

append("ARRAY_AGG(DISTINCT ")
append("ENCODE(CONVERT_TO(", column, ", 'UTF8'), 'base64')")
append(") <@ ARRAY[", joinedValue, "]")
append(" AND ")
append("ARRAY_AGG(DISTINCT ")
append("ENCODE(CONVERT_TO(", column, ", 'UTF8'), 'base64')")
append(") @> ARRAY[", joinedValue, "]")
}
}
val joinedValue = value.joinToString { "'${Base64.encode(it.toByteArray())}'" }

/**
* A custom operation to check if the aggregated values of a nullable string column are equal to a set of values, to be
* used in a WHERE or HAVING clause. This is useful for checking if a column contains a set of values, which is not
* directly supported by Exposed.
*
* All values are encoded as base64 before comparison to handle special characters and to avoid SQL injection.
*/
class ArrayAggNullableColumnEquals(
private val column: Column<String?>,
private val value: Set<String>
) : Op<Boolean>() {
@OptIn(ExperimentalEncodingApi::class)
override fun toQueryBuilder(queryBuilder: QueryBuilder) = queryBuilder {
val joinedValue = if (value.isEmpty()) {
"NULL"
} else {
value.joinToString { "'${Base64.encode(it.toByteArray())}'" }
append("ARRAY_AGG(DISTINCT ")
append("ENCODE(CONVERT_TO(", column, ", 'UTF8'), 'base64')")
append(") <@ ARRAY[", joinedValue, "]")
append(" AND ")
append("ARRAY_AGG(DISTINCT ")
append("ENCODE(CONVERT_TO(", column, ", 'UTF8'), 'base64')")
append(") @> ARRAY[", joinedValue, "]")
}

append("ARRAY_AGG(DISTINCT ")
append("REPLACE(ENCODE(CONVERT_TO(", column, ", 'UTF8'), 'base64'), E'\\n', '')")
append(") <@ ARRAY[", joinedValue, "]")
append(" AND ")
append("ARRAY_AGG(DISTINCT ")
append("REPLACE(ENCODE(CONVERT_TO(", column, ", 'UTF8'), 'base64'), E'\\n', '')")
append(") @> ARRAY[", joinedValue, "]")
}
}

Expand Down

0 comments on commit d837499

Please sign in to comment.