itmo-ml · asmetliness · May 28, 2023 · May 28, 2023 · May 29, 2023 · May 29, 2023
diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/TokensPipelineExecutor.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/TokensPipelineExecutor.kt
@@ -0,0 +1,20 @@
+package ru.itmo.stand.service.impl.neighbours
+
+import org.springframework.stereotype.Service
+import ru.itmo.stand.service.preprocessing.StopWordRemover
+import ru.itmo.stand.service.preprocessing.TextCleaner
+import ru.itmo.stand.service.preprocessing.Tokenizer
+
+@Service
+class TokensPipelineExecutor(
+    private val stopWordRemover: StopWordRemover,
+    private val textCleaner: TextCleaner,
+    private val tokenizer: Tokenizer,
+) {
+
+    /** Passes [content] through the text cleaner, the tokenizer and the stop-word remover, in that order. */
+    fun execute(content: String): List<String> =
+        textCleaner.preprocess(content)
+            .let { cleaned -> tokenizer.preprocess(cleaned) }
+            .let { tokens -> stopWordRemover.preprocess(tokens) }
+}
diff --git a/...ighbours/PreprocessingPipelineExecutor.kt → ...mpl/neighbours/WindowsPipelineExecutor.kt b/...ighbours/PreprocessingPipelineExecutor.kt → ...mpl/neighbours/WindowsPipelineExecutor.kt
@@ -9,7 +9,7 @@ import ru.itmo.stand.service.preprocessing.Tokenizer
 import ru.itmo.stand.util.Window
 
 @Service
-class PreprocessingPipelineExecutor(
+class WindowsPipelineExecutor(
     private val standProperties: StandProperties,
     private val contextSplitter: ContextSplitter,
     private val stopWordRemover: StopWordRemover,

diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt
@@ -68,6 +68,7 @@ class InvertedIndexBuilder(
                 documentEmbeddingRepository.findByDocId(docId).embedding
             }
             NeighboursDocument(
+                token = contextualizedEmbedding.tokenWithEmbeddingId.split(ContextualizedEmbedding.TOKEN_AND_EMBEDDING_ID_SEPARATOR).first(),
                 tokenWithEmbeddingId = contextualizedEmbedding.tokenWithEmbeddingId,
                 docId = docId,
                 score = documentEmbedding.dot(contextualizedEmbedding.embedding),

diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/WindowedTokenCreator.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/WindowedTokenCreator.kt
@@ -3,15 +3,15 @@ package ru.itmo.stand.service.impl.neighbours.indexing
 import io.github.oshai.KotlinLogging
 import org.springframework.stereotype.Service
 import ru.itmo.stand.config.StandProperties
-import ru.itmo.stand.service.impl.neighbours.PreprocessingPipelineExecutor
+import ru.itmo.stand.service.impl.neighbours.WindowsPipelineExecutor
 import ru.itmo.stand.service.model.Document
 import ru.itmo.stand.util.Window
 import ru.itmo.stand.util.createPath
 import java.io.File
 
 @Service
 class WindowedTokenCreator(
-    private val preprocessingPipelineExecutor: PreprocessingPipelineExecutor,
+    private val windowsPipelineExecutor: WindowsPipelineExecutor,
     private val standProperties: StandProperties,
 ) {
 
@@ -78,7 +78,7 @@ class WindowedTokenCreator(
     }
 
     fun create(document: Document): List<Window> {
-        return preprocessingPipelineExecutor.execute(document.content)
+        return windowsPipelineExecutor.execute(document.content)
     }
 
     companion object {

diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/search/NeighboursSearcher.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/search/NeighboursSearcher.kt
@@ -2,25 +2,35 @@ package ru.itmo.stand.service.impl.neighbours.search
 
 import org.springframework.stereotype.Service
 import ru.itmo.stand.service.bert.BertEmbeddingCalculator
-import ru.itmo.stand.service.impl.neighbours.PreprocessingPipelineExecutor
+import ru.itmo.stand.service.impl.neighbours.TokensPipelineExecutor
+import ru.itmo.stand.service.impl.neighbours.WindowsPipelineExecutor
 import ru.itmo.stand.storage.embedding.ContextualizedEmbeddingRepository
 import ru.itmo.stand.storage.lucene.repository.neighbours.InvertedIndex
 
 @Service
 class NeighboursSearcher(
     private val contextualizedEmbeddingRepository: ContextualizedEmbeddingRepository,
-    private val preprocessingPipelineExecutor: PreprocessingPipelineExecutor,
+    private val windowsPipelineExecutor: WindowsPipelineExecutor,
     private val bertEmbeddingCalculator: BertEmbeddingCalculator,
     private val invertedIndex: InvertedIndex,
+    private val tokensPipelineExecutor: TokensPipelineExecutor,
 ) {
 
     fun search(query: String): List<String> {
-        val windows = preprocessingPipelineExecutor.execute(query)
+        val tokens = tokensPipelineExecutor.execute(query)
+
+        val primaryDocuments = invertedIndex.findByTokens(tokens)
+
+        val windows = windowsPipelineExecutor.execute(query)
         val embeddings = bertEmbeddingCalculator.calculate(windows.map { it.toTranslatorInput() }.toTypedArray())
+
         return embeddings.flatMap { embedding -> contextualizedEmbeddingRepository.findByVector(embedding.toTypedArray()) }
             .let { contextualizedEmbeddings ->
                 val tokenWithEmbeddingIds = contextualizedEmbeddings.map { it.tokenWithEmbeddingId }
-                invertedIndex.findByTokenWithEmbeddingIds(tokenWithEmbeddingIds).groupingBy { it.docId }
+                val secondaryDocuments = invertedIndex.findByTokenWithEmbeddingIds(tokenWithEmbeddingIds)
+
+                sequenceOf(primaryDocuments, secondaryDocuments).flatten()
+                    .groupingBy { it.docId }
                     .foldTo(HashMap(), 0f) { acc, doc -> acc + doc.score }
             }.entries
             .sortedByDescending { (_, score) -> score }

diff --git a/src/main/kotlin/ru/itmo/stand/storage/lucene/model/neighbours/NeighboursDocument.kt b/src/main/kotlin/ru/itmo/stand/storage/lucene/model/neighbours/NeighboursDocument.kt
@@ -1,6 +1,7 @@
 package ru.itmo.stand.storage.lucene.model.neighbours
 
 data class NeighboursDocument(
+    val token: String,
     val tokenWithEmbeddingId: String,
     val docId: String,
     val score: Float,

diff --git a/src/main/kotlin/ru/itmo/stand/storage/lucene/repository/neighbours/InvertedIndex.kt b/src/main/kotlin/ru/itmo/stand/storage/lucene/repository/neighbours/InvertedIndex.kt
@@ -7,6 +7,7 @@ import org.apache.lucene.document.StringField
 import org.apache.lucene.index.ConcurrentMergeScheduler
 import org.apache.lucene.index.IndexWriterConfig
 import org.apache.lucene.index.Term
+import org.apache.lucene.search.BooleanQuery
 import org.apache.lucene.search.TermQuery
 import org.springframework.stereotype.Repository
 import ru.itmo.stand.config.StandProperties
@@ -29,6 +30,7 @@ class InvertedIndex(private val standProperties: StandProperties) : LuceneReposi
 
     fun save(entity: NeighboursDocument) {
         val document = Document()
+        document.add(StringField(NeighboursDocument::token.name, entity.token, YES))
         document.add(StringField(NeighboursDocument::tokenWithEmbeddingId.name, entity.tokenWithEmbeddingId, YES))
         document.add(StringField(NeighboursDocument::docId.name, entity.docId, YES))
         document.add(StringField(NeighboursDocument::score.name, entity.score.toString(), YES))
@@ -39,14 +41,27 @@ class InvertedIndex(private val standProperties: StandProperties) : LuceneReposi
         entities.forEach { save(it) }
     }
 
+    /** Builds a query via `booleanQuery`, mapping a token to a term query on the `token` field, and runs it. */
+    fun findByTokens(tokens: Collection<String>): Sequence<NeighboursDocument> {
+        val tokenField = NeighboursDocument::token.name
+        val tokensQuery = booleanQuery(tokens) { token -> TermQuery(Term(tokenField, token)) }
+
+        return search(tokensQuery)
+    }
+
     fun findByTokenWithEmbeddingIds(tokenWithEmbeddingIds: Collection<String>): Sequence<NeighboursDocument> {
         val query = booleanQuery(tokenWithEmbeddingIds) { tokenWithEmbeddingId ->
             TermQuery(Term(NeighboursDocument::tokenWithEmbeddingId.name, tokenWithEmbeddingId))
         }
 
+        return search(query)
+    }
+
+    private fun search(query: BooleanQuery): Sequence<NeighboursDocument> {
         return searcher.searchAll(query)
             .map {
                 NeighboursDocument(
+                    it.get(NeighboursDocument::token.name),
                     it.get(NeighboursDocument::tokenWithEmbeddingId.name),
                     it.get(NeighboursDocument::docId.name),
                     it.get(NeighboursDocument::score.name).toFloat(),

diff --git a/src/test/kotlin/ru/itmo/stand/fixtures/PreprocessingPipelineExecutorBuilder.kt b/src/test/kotlin/ru/itmo/stand/fixtures/PreprocessingPipelineExecutorBuilder.kt
@@ -1,10 +1,10 @@
 package ru.itmo.stand.fixtures
 
-import ru.itmo.stand.service.impl.neighbours.PreprocessingPipelineExecutor
+import ru.itmo.stand.service.impl.neighbours.WindowsPipelineExecutor
 import ru.itmo.stand.service.preprocessing.ContextSplitter
 import ru.itmo.stand.service.preprocessing.StopWordRemover
 
-fun preprocessingPipelineExecutor(): PreprocessingPipelineExecutor = PreprocessingPipelineExecutor(
+fun preprocessingPipelineExecutor(): WindowsPipelineExecutor = WindowsPipelineExecutor(
     standProperties(),
     ContextSplitter(),
     StopWordRemover(),

diff --git a/...ours/PreprocessingPipelineExecutorTest.kt → ...neighbours/WindowsPipelineExecutorTest.kt b/...ours/PreprocessingPipelineExecutorTest.kt → ...neighbours/WindowsPipelineExecutorTest.kt
@@ -5,7 +5,7 @@ import org.junit.jupiter.api.Test
 import ru.itmo.stand.fixtures.preprocessingPipelineExecutor
 import ru.itmo.stand.util.Window
 
-class PreprocessingPipelineExecutorTest {
+class WindowsPipelineExecutorTest {
 
     private val preprocessingPipelineExecutor = preprocessingPipelineExecutor()